Search code examples
pythonhuggingface-datasets

NameError: name 'Path' is not defined when using HF.Dataset.from_generator


# Glob pattern matching every ".json" annotation file under the training JSON folder.
_train_json_pattern = paths.TRAIN_JSON_FOLDER + "*.json"
train_json_files = glob(_train_json_pattern)
from pathlib import Path

def get_gt_string_and_xy(filepath: Union[str, os.PathLike]) -> Dict[str, str]:
    """
    Get the ground truth string and x-y data from the given JSON file.

    The ground truth string is the chart string followed by the x and y
    coordinate strings, built by ``create_chart_string`` and
    ``create_coordinate_string`` from the file's ``chart-type`` and
    ``data-series`` entries.

    :param filepath: The path to the JSON file
    :return dict: A dictionary containing the ground truth string, x-y data
        (JSON-encoded lists), chart type, id (file name without its final
        suffix), and source
    """
    # Imported inside the function on purpose: when this function runs in a
    # worker process (from_generator with num_proc), names imported at the
    # notebook's top level were reported as undefined ("name 'Path' is not
    # defined"). Function-local imports are resolved wherever the body runs.
    # NOTE(review): the helper functions called below must be resolvable in
    # the worker as well - confirm.
    import json
    from pathlib import Path

    filepath = Path(filepath)
    # Decode explicitly as UTF-8 instead of the platform's locale encoding,
    # so non-ASCII text in the annotations reads the same on every machine.
    with open(filepath, encoding="utf-8") as fp:
        data = json.load(fp)

    all_x, all_y = process_data_series(data.get("data-series", []))
    chart_type = data.get('chart-type', '')
    chart_str = create_chart_string(chart_type)
    x_str = create_coordinate_string("x", all_x)
    y_str = create_coordinate_string("y", all_y)

    gt_string = chart_str + x_str + y_str

    return {
        "ground_truth": gt_string,
        "x": json.dumps(all_x),
        "y": json.dumps(all_y),
        "chart-type": chart_type,
        "id": filepath.stem,
        "source": data.get("source", ''),
    }
def gen_data(files: List[str], paths, get_gt_string_and_xy: callable) -> Dict[str, str]:
    """
    This function takes a list of json files and returns a generator that yields a
    dictionary with the ground truth string and the path to the image.

    :param files (list): A list of json file paths
    :param paths: Object exposing ``TRAIN_IMAGES_FOLDER``, a string prefix that is
        prepended to the image file name
    :param get_gt_string_and_xy (callable): Called with each json file path; the
        dictionary it returns is merged into the yielded record
    :return generator: A generator that yields, per file, the dictionary returned by
        ``get_gt_string_and_xy`` plus an ``"image_path"`` entry.
    """
    for f in files:
        # Accept both "/" and "\" separators: Windows paths may contain
        # backslashes, which a plain split("/") would leave inside the image ID.
        file_name = f.replace("\\", "/").rsplit("/", 1)[-1]
        # The image ID is the file name up to its first dot
        image_id = file_name.split(".")[0]
        # Construct the image path based on the ID
        image_path = paths.TRAIN_IMAGES_FOLDER + image_id + ".jpg"
        # Yield a dictionary containing ground truth string, image path, and other information
        yield {
            **get_gt_string_and_xy(f),
            "image_path": image_path,
        }

# Keyword arguments forwarded to gen_data; the helper function and the paths
# object are passed explicitly rather than looked up as globals.
_gen_kwargs = {
    "files": train_json_files,
    "paths": paths,
    "get_gt_string_and_xy": get_gt_string_and_xy,
}
ds = HFDataset.from_generator(
    gen_data,
    gen_kwargs=_gen_kwargs,
    num_proc=config.NUM_PROCESS,
)
# Show the ground truth string of the first generated record.
print(f"Ground Truth string: \n {ds['ground_truth'][0]}")

I have this function that creates a generator over my data. It calls another function that is defined outside of it. At first I got an error saying that this function and the `paths` class were not defined, so I passed the function and the class in as arguments (through `gen_kwargs`). Now I get errors saying that `Path` and `json` are not defined, even though I have already imported these libraries in my notebook.

I can put `from pathlib import Path` inside the function, but then I would need to do that for every library I want to use.

---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\builder.py", line 1726, in _prepare_split_single
    for key, record in generator:
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator\generator.py", line 30, in _generate_examples
    for idx, ex in enumerate(self.config.generator(**gen_kwargs)):
  File "C:\Users\FR00CSS0000000040678\AppData\Local\Temp\ipykernel_22828\3924248845.py", line 18, in gen_data
  File "C:\Users\FR00CSS0000000040678\AppData\Local\Temp\ipykernel_22828\332669057.py", line 28, in get_gt_string_and_xy
NameError: name 'Path' is not defined

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\multiprocess\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\utils\py_utils.py", line 614, in _write_generator_to_queue
    for i, result in enumerate(func(**kwargs)):
  File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\builder.py", line 1762, in _prepare_split_single
    raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
"""
...
    772     return self._value
    773 else:
--> 774     raise self._value

DatasetGenerationError: An error occurred while generating the dataset

Solution

  • class generate_data(config):
        
        
        def __init__(self):
            self.TRAIN_FOLDER = "./train"
            self.TRAIN_IMAGES_FOLDER = "./train/images/"
            self.TRAIN_JSON_FOLDER = "./train/annotations/"
            self.X_START = "<s_x_values>"
            self.X_END = "</s_x_values>"
            self.Y_START = "<s_y_values>"
            self.Y_END = "</s_y_values>"
            self.CHART_START = "<s_chart>"
            self.CHART_END = "</s_chart>"
            self.added_tokens = [self.X_START, self.X_END, self.Y_START, self.Y_END, self.CHART_START, self.CHART_END]
    
        def custom_round(self,value: Union[int, float, str]) -> Union[str, float]:
            """
            Convert a float value to a string with custom decimal truncation rules.
    
            If the absolute value of the integer part is greater than 1, truncate to 1 decimal.
            Otherwise, truncate to 4 decimals.
    
            Args:
                value (int, float, str): The float value to convert
    
            Returns:
                Union[str, float]: The rounded float value as a string or float
            """
            if isinstance(value, (int, float)):
                str_value = str(value)
    
                if "." in str_value:
                    integer_part, decimal_part = str_value.split(".")
                    decimal_limit = 1 if abs(float(integer_part)) > 1 else 4
                    truncated_decimal = decimal_part[:decimal_limit]
    
                    return float(f"{integer_part}.{truncated_decimal}")
            
            return value
    
        def is_not_a_number(self,value: Union[int, float, str]) -> bool:
            """
            Check if a value is not a number (NaN).
    
            Args:
                value (int, float, str): The value to check
    
            Returns:
                bool: True if the value is NaN, False otherwise
            """
            return isinstance(value, float) and str(value).lower() == "nan"
    
        class RollingAverageMeter:
            """Computes and stores a rolling average and current value"""
            def __init__(self):
                self.reset()
    
            def reset(self):
                """
                Reset all values to their initial state.
                """
                self.current_value = 0
                self.rolling_average = 0
                self.sum = 0
                self.count = 0
    
            def update(self, value, weight=1):
                """
                Update values based on new data.
    
                Args:
                    value: The new value to update
                    weight: Weight associated with the new value
                """
                self.current_value = value
                self.sum += value * weight
                self.count += weight
                self.rolling_average = self.sum / self.count
            
    
    
    
        def process_data_series(self,data_series):
            all_x, all_y = [], []
            for d in data_series:
                x = self.custom_round(d["x"])
                y = self.custom_round(d["y"])
                # Ignore nan values
                try:
                    if self.is_not_a_number(x) or self.is_not_a_number(y):
                        continue
                except:
                    raise Exception(x,y)
                all_x.append(x)
                all_y.append(y)
            return all_x, all_y
    
    
        def create_chart_string(self,chart_type):
            return self.CHART_START + chart_type + self.CHART_END
    
    
        def create_coordinate_string(self,label, values):
            return f"<s_{label}_values>" + ";".join(map(str, values)) + f"</s_{label}_values>"
    
    
        def get_gt_string_and_xy(self,filepath: Union[str, os.PathLike]) -> Dict[str, str]:
            """
            Get the ground truth string and x-y data from the given JSON file.
            :param filepath: The path to the JSON file
            :return dict: A dictionary containing the ground truth string, x-y data, chart type, id, and source
            """
            import json
            from pathlib import Path
            filepath = Path(filepath)
            with open(filepath) as fp:
                data = json.load(fp)
    
            all_x, all_y = self.process_data_series(data.get("data-series", []))
            chart_type = data.get('chart-type', '')
            chart_str = self.create_chart_string(chart_type)
            x_str = self.create_coordinate_string("x", all_x)
            y_str = self.create_coordinate_string("y", all_y)
    
            gt_string = chart_str + x_str + y_str
    
            return {
                "ground_truth": gt_string,
                "x": json.dumps(all_x),
                "y": json.dumps(all_y),
                "chart-type": chart_type,
                "id": filepath.stem,
                "source": data.get("source", ''),
            }
        
    
    
        def gen_data(self,files: List[str]) -> Dict[str, str]:
            """
            This function takes a list of json files and returns a generator that yields a
            dictionary with the ground truth string and the path to the image.
            :param files (list): A list of json files
            :return generator: A generator that yields a dictionary with the ground truth string and the path to the corresponding image.
            """
            for f in files:
                # Extract image ID from the file path
                image_id = f.split("/")[-1].split(".")[0]
                # Construct the image path based on the ID
                image_path = self.TRAIN_IMAGES_FOLDER + image_id + ".jpg"
                # Yield a dictionary containing ground truth string, image path, and other information
                yield {
                    **self.get_gt_string_and_xy(f),
                    "image_path": image_path,
                }
    
        def get_generator(self):
    
            return  HFDataset.from_generator(
                self.gen_data, gen_kwargs={"files": glob(self.TRAIN_JSON_FOLDER + "*.json")}, num_proc=config.NUM_PROCESS
            )
    

    I found a solution: use a class to group all the functions I need. However, I still have to put the imports inside the functions for it to work.