Search code examples
machine-learning, dataset, fine-tuning

Fine-tuning LayoutLMv3 using the CORD-v2 dataset


I'm working on fine-tuning LayoutLMv3 using the CORD-v2 dataset. I'm struggling with the data preprocessing part, specifically with how to correctly extract the total amount (TTC) from the images. The examples I've found online seem to use the older CORD dataset, which has a different format. The new CORD-v2 dataset only includes images and ground truth labels.

How should I approach this?

I've tried examples from YouTube and Hugging Face but haven't had any success.


Solution

  • I found a solution. First, create a label map for the data you want to extract. Then scale the bounding boxes, and detect the currency in the text, because the dataset contains many different currencies and languages. This is what I did:

    # Integer class ids for token classification: words on a line whose
    # category is "total.total_price" get 1, every other word gets 0.
    label_map = {
        "total.total_price": 1,
        "other": 0,
    }

    def scale_bbox(box, original_size, target_size=(1000, 1000)):
        """Rescale a four-coordinate box from one image size to another.

        Args:
            box: Indexable holding at least four numbers. Entries 0 and 2 are
                scaled by the horizontal factor, entries 1 and 3 by the
                vertical factor.
            original_size: Pair whose entry 0 is the horizontal extent and
                entry 1 the vertical extent the box is currently expressed in.
            target_size: Pair of the same layout giving the extent to map onto.

        Returns:
            A list of four ints; each scaled value is truncated by ``int()``.

        Raises:
            ZeroDivisionError: If either entry of ``original_size`` is zero.
        """
        factors = (
            target_size[0] / original_size[0],
            target_size[1] / original_size[1],
        )
        # Even positions are x coordinates, odd positions are y coordinates.
        return [int(box[pos] * factors[pos % 2]) for pos in range(4)]
    
    def detect_currency(text):
        """Return the currency code for the first known symbol found in ``text``.

        Symbols are tried in the fixed order listed below, so when several are
        present the earliest one in that list wins, regardless of where it
        appears in ``text``.

        Args:
            text: String to search for a currency symbol.

        Returns:
            The three-letter code paired with the matching symbol, or ``None``
            when no listed symbol occurs in ``text``.
        """
        symbol_codes = (
            ('$', 'USD'),
            ('€', 'EUR'),
            ('£', 'GBP'),
            ('¥', 'JPY'),
            ('₹', 'INR'),
            ('₩', 'KRW'),
        )
        matches = (code for mark, code in symbol_codes if mark in text)
        return next(matches, None)
    
    def preprocess_data(examples):
        """Encode a batch of receipt examples for token classification.

        Relies on names defined outside this function: ``json``, ``scale_bbox``,
        ``label_map`` and ``processor``.

        Args:
            examples: Batch mapping with two parallel sequences:
                ``'image'`` holds image objects exposing ``convert``, ``size``
                and ``resize`` (PIL-style), and ``'ground_truth'`` holds JSON
                strings. From each JSON document this reads
                ``valid_line[*]['category']`` and, for every entry of
                ``valid_line[*]['words']``, its ``'text'`` and the ``x1``,
                ``y1``, ``x3``, ``y3`` fields of its ``'quad'``.

        Returns:
            The object returned by ``processor`` for the whole batch (called
            with ``return_tensors="pt"``, padding and truncation to 512).
        """
        resized_size = (224, 224)
        images = []
        words = []
        boxes = []
        labels = []

        for image, gt in zip(examples['image'], examples['ground_truth']):
            rgb = image.convert("RGB")
            # Quad coordinates are expressed in pixels of the image as stored
            # in the dataset, so boxes must be normalised against that size,
            # not against the 224x224 size the image is resized to below.
            source_size = rgb.size
            images.append(rgb.resize(resized_size))

            annotation = json.loads(gt)
            batch_words = []
            batch_boxes = []
            batch_labels = []

            for item in annotation['valid_line']:
                for w in item['words']:
                    quad = w['quad']
                    # x1/y1 and x3/y3 are used as opposite corners of the box.
                    bbox = scale_bbox(
                        [quad['x1'], quad['y1'], quad['x3'], quad['y3']],
                        source_size,
                    )
                    # Keep every coordinate inside the 0-1000 range.
                    bbox = [min(max(0, coord), 1000) for coord in bbox]

                    # The word text is passed through verbatim; no currency
                    # normalisation is applied to what the model reads.
                    batch_words.append(w['text'])
                    batch_boxes.append(bbox)
                    if item['category'] == 'total.total_price':
                        batch_labels.append(label_map["total.total_price"])
                    else:
                        batch_labels.append(label_map["other"])

            words.append(batch_words)
            boxes.append(batch_boxes)
            labels.append(batch_labels)

        encoding = processor(
            images,
            words,
            boxes=boxes,
            word_labels=labels,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        )
        return encoding