NameError: name 'Path' is not defined when using HF.Dataset.from_generator

# Every JSON annotation file that sits directly in the training annotations folder.
train_json_files = glob(f"{paths.TRAIN_JSON_FOLDER}*.json")
from pathlib import Path

def get_gt_string_and_xy(filepath: Union[str, os.PathLike]) -> Dict[str, str]:
    """
    Build the ground-truth record for a single chart annotation file.
    :param filepath: Location of the JSON annotation file
    :return dict: The ground truth string, the JSON-encoded x and y values,
        the chart type, the file stem used as id, and the source field
    """
    json_path = Path(filepath)
    with open(json_path) as handle:
        annotation = json.load(handle)

    # Missing keys fall back to an empty series / empty strings.
    xs, ys = process_data_series(annotation.get("data-series", []))
    chart_type = annotation.get('chart-type', '')

    # The ground truth is the chart token block followed by the x and y blocks.
    segments = (
        create_chart_string(chart_type),
        create_coordinate_string("x", xs),
        create_coordinate_string("y", ys),
    )
    gt_string = segments[0] + segments[1] + segments[2]

    record = {
        "ground_truth": gt_string,
        "x": json.dumps(xs),
        "y": json.dumps(ys),
        "chart-type": chart_type,
        "id": json_path.stem,
        "source": annotation.get("source", ''),
    }
    return record
def gen_data(files: List[str], paths: paths, get_gt_string_and_xy: callable) -> Dict[str, str]:
    """
    Yield one training record per JSON annotation file.
    :param files (list): A list of json files
    :param paths: Object exposing TRAIN_IMAGES_FOLDER, the prefix for image paths
    :param get_gt_string_and_xy: Callable that turns one json file into its ground truth dictionary
    :return generator: A generator that yields the ground truth dictionary extended with the path to the corresponding image.
    """
    for json_file in files:
        # Image id = file name (text after the last "/") up to its first dot.
        file_name = json_file.rsplit("/", 1)[-1]
        image_id = file_name.partition(".")[0]
        # The image lives in the images folder under the same id.
        image_path = f"{paths.TRAIN_IMAGES_FOLDER}{image_id}.jpg"
        # Ground truth fields first, image path added on top.
        record = {**get_gt_string_and_xy(json_file)}
        record["image_path"] = image_path
        yield record

# The helper module and function are handed over through gen_kwargs so the
# generator receives them as arguments.
ds = HFDataset.from_generator(
    gen_data,
    gen_kwargs=dict(
        files=train_json_files,
        paths=paths,
        get_gt_string_and_xy=get_gt_string_and_xy,
    ),
    num_proc=config.NUM_PROCESS,
)
# Show the ground truth string of the first record as a sanity check.
print(f"Ground Truth string: \n {ds['ground_truth'][0]}")

I have this function that creates a generator over my data. It uses another function that is defined outside of it. At first I got an error saying that this function and the `paths` class were not defined, so I had to pass the function and the class in as arguments. Now I get an error saying that `Path` and `json` are not defined, even though I have already imported these libraries in my notebook.

I can put `from pathlib import Path` inside the function, but then I would need to do that for every library I want to use.

---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\builder.py", line 1726, in _prepare_split_single
    for key, record in generator:
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator\generator.py", line 30, in _generate_examples
    for idx, ex in enumerate(self.config.generator(**gen_kwargs)):
  File "C:\Users\FR00CSS0000000040678\AppData\Local\Temp\ipykernel_22828\3924248845.py", line 18, in gen_data
  File "C:\Users\FR00CSS0000000040678\AppData\Local\Temp\ipykernel_22828\332669057.py", line 28, in get_gt_string_and_xy
NameError: name 'Path' is not defined

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\multiprocess\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\utils\py_utils.py", line 614, in _write_generator_to_queue
    for i, result in enumerate(func(**kwargs)):
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\builder.py", line 1762, in _prepare_split_single
    raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
"""
...
    772     return self._value
    773 else:
--> 774     raise self._value

DatasetGenerationError: An error occurred while generating the dataset

Solution

class generate_data(config):
    """Build the chart ground-truth dataset from JSON annotation files.

    All helpers needed by the dataset generator are grouped on this class so
    that ``HFDataset.from_generator`` receives a single bound method
    (``self.gen_data``) instead of several notebook-level functions.

    NOTE(review): modules used by code that runs inside the generator are
    imported locally in the methods. Notebook-level imports were reported as
    undefined in the worker processes (``NameError: name 'Path' is not
    defined``) when ``num_proc`` is used; confirm before hoisting them.
    """

    def __init__(self):
        # Folder layout of the training data (string prefixes, trailing "/"
        # included where file names are appended directly).
        self.TRAIN_FOLDER = "./train"
        self.TRAIN_IMAGES_FOLDER = "./train/images/"
        self.TRAIN_JSON_FOLDER = "./train/annotations/"
        # Marker tokens that delimit each section of the ground truth string.
        self.X_START = "<s_x_values>"
        self.X_END = "</s_x_values>"
        self.Y_START = "<s_y_values>"
        self.Y_END = "</s_y_values>"
        self.CHART_START = "<s_chart>"
        self.CHART_END = "</s_chart>"
        # All marker tokens in one list.
        self.added_tokens = [self.X_START, self.X_END, self.Y_START, self.Y_END, self.CHART_START, self.CHART_END]

    def custom_round(self,value: Union[int, float, str]) -> Union[str, float]:
        """
        Truncate (not round) the decimals of a number.

        If the absolute value of the integer part is greater than 1, keep
        1 decimal. Otherwise (integer part -1, 0 or 1) keep 4 decimals.
        Values without a decimal point in their text form (ints, ``nan``,
        ``inf``) and non-numeric values are returned unchanged.

        Args:
            value (int, float, str): The value to truncate

        Returns:
            Union[str, float]: The truncated value as a float, or the
            original value when nothing had to be truncated
        """
        from decimal import Decimal

        if isinstance(value, (int, float)):
            str_value = str(value)

            # str() switches to scientific notation for very small / very
            # large floats ("1.5e-05"); splitting that on "." would cut the
            # exponent and turn 1.5e-05 into 1.5. Expand to plain positional
            # notation first.
            if isinstance(value, float) and "e" in str_value.lower():
                str_value = format(Decimal(str_value), "f")

            if "." in str_value:
                integer_part, decimal_part = str_value.split(".")
                decimal_limit = 1 if abs(float(integer_part)) > 1 else 4
                truncated_decimal = decimal_part[:decimal_limit]

                return float(f"{integer_part}.{truncated_decimal}")

        return value

    def is_not_a_number(self,value: Union[int, float, str]) -> bool:
        """
        Check if a value is not a number (NaN).

        Args:
            value (int, float, str): The value to check

        Returns:
            bool: True if the value is a float NaN, False otherwise
                (including for ints and strings)
        """
        return isinstance(value, float) and str(value).lower() == "nan"

    class RollingAverageMeter:
        """Track the latest value and the weighted mean of all values since the last reset."""
        def __init__(self):
            self.reset()

        def reset(self):
            """
            Reset all values to their initial state.
            """
            self.current_value = 0
            self.rolling_average = 0
            self.sum = 0
            self.count = 0

        def update(self, value, weight=1):
            """
            Update values based on new data.

            Args:
                value: The new value to update
                weight: Weight associated with the new value
            """
            self.current_value = value
            self.sum += value * weight
            self.count += weight
            # Mean over everything accumulated so far, not over a fixed window.
            self.rolling_average = self.sum / self.count

    def process_data_series(self,data_series):
        """
        Collect the truncated x and y values of a data series.

        Args:
            data_series: Iterable of points, each indexable by "x" and "y"

        Returns:
            tuple: (all_x, all_y) lists of equal length; points whose x or y
            is NaN are skipped

        Raises:
            Exception: Carries (x, y) when the NaN check itself fails
        """
        all_x, all_y = [], []
        for d in data_series:
            x = self.custom_round(d["x"])
            y = self.custom_round(d["y"])
            # Ignore nan values
            try:
                if self.is_not_a_number(x) or self.is_not_a_number(y):
                    continue
            except Exception as err:
                # Narrowed from a bare except so KeyboardInterrupt/SystemExit
                # pass through; the offending point is reported and the
                # original error is kept as the cause.
                raise Exception(x, y) from err
            all_x.append(x)
            all_y.append(y)
        return all_x, all_y

    def create_chart_string(self,chart_type):
        """Wrap the chart type in the chart start/end tokens."""
        return self.CHART_START + chart_type + self.CHART_END

    def create_coordinate_string(self,label, values):
        """Join the values with ";" between ``<s_{label}_values>`` and ``</s_{label}_values>``."""
        return f"<s_{label}_values>" + ";".join(map(str, values)) + f"</s_{label}_values>"

    def get_gt_string_and_xy(self,filepath: Union[str, os.PathLike]) -> Dict[str, str]:
        """
        Get the ground truth string and x-y data from the given JSON file.
        :param filepath: The path to the JSON file
        :return dict: A dictionary containing the ground truth string, x-y data, chart type, id, and source
        """
        # Local imports: see the class docstring.
        import json
        from pathlib import Path
        filepath = Path(filepath)
        # JSON is UTF-8 by specification; do not depend on the platform
        # default encoding (cp1252 on Windows).
        with open(filepath, encoding="utf-8") as fp:
            data = json.load(fp)

        all_x, all_y = self.process_data_series(data.get("data-series", []))
        chart_type = data.get('chart-type', '')
        chart_str = self.create_chart_string(chart_type)
        x_str = self.create_coordinate_string("x", all_x)
        y_str = self.create_coordinate_string("y", all_y)

        gt_string = chart_str + x_str + y_str

        return {
            "ground_truth": gt_string,
            "x": json.dumps(all_x),
            "y": json.dumps(all_y),
            "chart-type": chart_type,
            "id": filepath.stem,
            "source": data.get("source", ''),
        }

    def gen_data(self,files: List[str]) -> Dict[str, str]:
        """
        This function takes a list of json files and returns a generator that yields a
        dictionary with the ground truth string and the path to the image.
        :param files (list): A list of json files
        :return generator: A generator that yields a dictionary with the ground truth string and the path to the corresponding image.
        """
        # Local import: see the class docstring.
        from pathlib import Path

        for f in files:
            # Extract image ID from the file path. Path(...).name strips the
            # folders with the platform's separators, so paths returned by
            # glob on Windows ("dir\\file.json") no longer leak the folder
            # name into the id as a plain split("/") did.
            image_id = Path(f).name.split(".")[0]
            # Construct the image path based on the ID
            image_path = self.TRAIN_IMAGES_FOLDER + image_id + ".jpg"
            # Yield a dictionary containing ground truth string, image path, and other information
            yield {
                **self.get_gt_string_and_xy(f),
                "image_path": image_path,
            }

    def get_generator(self):
        """
        Build the dataset from every JSON file in the annotations folder.

        Despite the name, this returns the result of
        ``HFDataset.from_generator`` (fed by ``self.gen_data``), not a Python
        generator.
        """
        return  HFDataset.from_generator(
            self.gen_data, gen_kwargs={"files": glob(self.TRAIN_JSON_FOLDER + "*.json")}, num_proc=config.NUM_PROCESS
        )

The solution I found is to use a class to group all the functions I need, but I still have to put the imports inside the functions for it to work.