Search code examples
pythonazure-cognitive-services

Azure Cognitive Form Recognizer to Extract Page Numbers using Python


The following code uses Azure Cognitive Services Form Recognizer to extract text from PDFs. However, I would like help modifying the code so that it also shows page numbers. Can someone help with that?

import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient

# field_list = ["result.content"]

# Form Recognizer client shared by every request below.
# `endpoint`, `key`, `container` and `container_url` are not defined in this
# snippet and must already exist.
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

# Analyse each blob in the container by URL with the "prebuilt-read" model
# and print the extracted text.
for blob in container.list_blobs():
    blob_url = f"{container_url}/{blob.name}"
    poller = document_analysis_client.begin_analyze_document_from_url(
        "prebuilt-read",
        blob_url,
    )
    result = poller.result()
    print(f"Scanning {blob.name}...")
    print("document contains", result.content)

# Runs after the loop, so only the text of the last analysed blob is kept.
mydf = result.content

Thanks


Solution

  • Technically, Form Recognizer does not offer an implicit keyword such as pageNumber that you can code against to make it recognize page numbers. The following code block shows what Form Recognizer recognizes by default; the recognizer sticks to a few pre-modeled keywords.

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = "YOUR_FORM_RECOGNIZER_ENDPOINT"
    key = "YOUR_FORM_RECOGNIZER_SUBSCRIPTION_KEY"

    formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/invoice_sample.jpg"

    # (field key, printed label) pairs, listed in the order they are printed.
    # Fields printed before the line items:
    HEADER_FIELDS = (
        ("VendorName", "Vendor Name"),
        ("VendorAddress", "Vendor Address"),
        ("VendorAddressRecipient", "Vendor Address Recipient"),
        ("CustomerName", "Customer Name"),
        ("CustomerId", "Customer Id"),
        ("CustomerAddress", "Customer Address"),
        ("CustomerAddressRecipient", "Customer Address Recipient"),
        ("InvoiceId", "Invoice Id"),
        ("InvoiceDate", "Invoice Date"),
        ("InvoiceTotal", "Invoice Total"),
        ("DueDate", "Due Date"),
        ("PurchaseOrder", "Purchase Order"),
        ("BillingAddress", "Billing Address"),
        ("BillingAddressRecipient", "Billing Address Recipient"),
        ("ShippingAddress", "Shipping Address"),
        ("ShippingAddressRecipient", "Shipping Address Recipient"),
    )

    # Fields printed for every entry of the "Items" field:
    ITEM_FIELDS = (
        ("Description", "Description"),
        ("Quantity", "Quantity"),
        ("Unit", "Unit"),
        ("UnitPrice", "Unit Price"),
        ("ProductCode", "Product Code"),
        ("Date", "Date"),
        ("Tax", "Tax"),
        ("Amount", "Amount"),
    )

    # Fields printed after the line items:
    SUMMARY_FIELDS = (
        ("SubTotal", "Subtotal"),
        ("TotalTax", "Total Tax"),
        ("PreviousUnpaidBalance", "Previous Unpaid Balance"),
        ("AmountDue", "Amount Due"),
        ("ServiceStartDate", "Service Start Date"),
        ("ServiceEndDate", "Service End Date"),
        ("ServiceAddress", "Service Address"),
        ("ServiceAddressRecipient", "Service Address Recipient"),
        ("RemittanceAddress", "Remittance Address"),
        ("RemittanceAddressRecipient", "Remittance Address Recipient"),
    )


    def _print_fields(fields, names, prefix=""):
        """Print value and confidence of every listed field found in `fields`.

        `fields` is a mapping looked up with `.get`; `names` is an iterable of
        (field key, label) pairs. Missing fields are skipped silently. `prefix`
        is written in front of each label (used to indent item fields).
        """
        for field_key, label in names:
            field = fields.get(field_key)
            if field:
                print(
                    "{}{}: {} has confidence: {}".format(
                        prefix, label, field.value, field.confidence
                    )
                )


    def _print_invoice(number, invoice):
        """Print one analysed invoice: header fields, line items, then totals."""
        print("--------Recognizing invoice #{}--------".format(number))
        _print_fields(invoice.fields, HEADER_FIELDS)
        print("Invoice items:")
        # `get` returns None when the document has no "Items" field; calling
        # `.value` on that result unconditionally raised AttributeError.
        items = invoice.fields.get("Items")
        item_list = items.value if items and items.value else []
        for item_number, item in enumerate(item_list, start=1):
            print("...Item #{}".format(item_number))
            _print_fields(item.value, ITEM_FIELDS, prefix="......")
        _print_fields(invoice.fields, SUMMARY_FIELDS)
        print("----------------------------------------")


    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-invoice", formUrl)
    invoices = poller.result()

    for idx, invoice in enumerate(invoices.documents):
        _print_invoice(idx + 1, invoice)
    

    This is the prebuilt structure used to recognize an invoice.

    For reference, I created a sample two-page document in Word with auto-generated page numbers, converted it to PDF, and uploaded it to Form Recognizer to see whether it would recognize the page numbers. It recognized them successfully: the page numbers appear in the JSON output, but the attempt to get them through Python failed.

    So, technically, it is hard to retrieve the page number from Python through a specific keyword like "pageNumber", because the prebuilt invoice model has no such field.

    Below is the JSON output of the repro I performed.

    {
        "status": "succeeded",
        "createdDateTime": "2022-06-09T06:22:38Z",
        "lastUpdatedDateTime": "2022-06-09T06:22:50Z",
        "analyzeResult": {
            "apiVersion": "2022-06-30-preview",
            "modelId": "prebuilt-invoice",
            "stringIndexType": "textElements",
            "content": "Heading\nWelcome to my channel.\n1\nSample 2\n2",
            "pages": [
                {
                    "pageNumber": 1,
                    "angle": 0,
                    "width": 8.5,
                    "height": 11,
                    "unit": "inch",
                    "words": [
                        {
                            "content": "Heading",
                            "polygon": [
                                3.7708,
                                1.0779,
                                4.7416,
                                1.0779,
                                4.7416,
                                1.3302,
                                3.7708,
                                1.3302
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 0,
                                "length": 7
                            }
                        },
                        {
                            "content": "Welcome",
                            "polygon": [
                                3.419,
                                2.0336,
                                4.0422,
                                2.0336,
                                4.0422,
                                2.1499,
                                3.419,
                                2.1499
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 8,
                                "length": 7
                            }
                        },
                        {
                            "content": "to",
                            "polygon": [
                                4.0914,
                                2.0498,
                                4.2242,
                                2.0498,
                                4.2242,
                                2.1499,
                                4.0914,
                                2.1499
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 16,
                                "length": 2
                            }
                        },
                        {
                            "content": "my",
                            "polygon": [
                                4.281,
                                2.0684,
                                4.4712,
                                2.0684,
                                4.4712,
                                2.1781,
                                4.281,
                                2.1781
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 19,
                                "length": 2
                            }
                        },
                        {
                            "content": "channel.",
                            "polygon": [
                                4.519,
                                2.0336,
                                5.076,
                                2.0336,
                                5.076,
                                2.1499,
                                4.519,
                                2.1499
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 22,
                                "length": 8
                            }
                        },
                        {
                            "content": "1",
                            "polygon": [
                                7.4378,
                                10.1741,
                                7.4942,
                                10.1741,
                                7.4942,
                                10.273,
                                7.4378,
                                10.273
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 31,
                                "length": 1
                            }
                        }
                    ],
                    "lines": [
                        {
                            "content": "Heading",
                            "polygon": [
                                3.7708,
                                1.0779,
                                4.7416,
                                1.0779,
                                4.7416,
                                1.3302,
                                3.7708,
                                1.3302
                            ],
                            "spans": [
                                {
                                    "offset": 0,
                                    "length": 7
                                }
                            ]
                        },
                        {
                            "content": "Welcome to my channel.",
                            "polygon": [
                                3.419,
                                2.0336,
                                5.076,
                                2.0336,
                                5.076,
                                2.1781,
                                3.419,
                                2.1781
                            ],
                            "spans": [
                                {
                                    "offset": 8,
                                    "length": 22
                                }
                            ]
                        },
                        {
                            "content": "1",
                            "polygon": [
                                7.4378,
                                10.1741,
                                7.4942,
                                10.1741,
                                7.4942,
                                10.273,
                                7.4378,
                                10.273
                            ],
                            "spans": [
                                {
                                    "offset": 31,
                                    "length": 1
                                }
                            ]
                        }
                    ],
                    "spans": [
                        {
                            "offset": 0,
                            "length": 32
                        }
                    ],
                    "kind": "document"
                },
                {
                    "pageNumber": 2,
                    "angle": 0,
                    "width": 8.5,
                    "height": 11,
                    "unit": "inch",
                    "words": [
                        {
                            "content": "Sample",
                            "polygon": [
                                3.9465,
                                2.6969,
                                4.4322,
                                2.6969,
                                4.4322,
                                2.8414,
                                3.9465,
                                2.8414
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 33,
                                "length": 6
                            }
                        },
                        {
                            "content": "2",
                            "polygon": [
                                4.487,
                                2.7034,
                                4.5528,
                                2.7034,
                                4.5528,
                                2.8119,
                                4.487,
                                2.8119
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 40,
                                "length": 1
                            }
                        },
                        {
                            "content": "2",
                            "polygon": [
                                7.4333,
                                10.1732,
                                7.4939,
                                10.1732,
                                7.4939,
                                10.273,
                                7.4333,
                                10.273
                            ],
                            "confidence": 1,
                            "span": {
                                "offset": 42,
                                "length": 1
                            }
                        }
                    ],
                    "lines": [
                        {
                            "content": "Sample 2",
                            "polygon": [
                                3.9465,
                                2.6969,
                                4.5528,
                                2.6969,
                                4.5528,
                                2.8414,
                                3.9465,
                                2.8414
                            ],
                            "spans": [
                                {
                                    "offset": 33,
                                    "length": 8
                                }
                            ]
                        },
                        {
                            "content": "2",
                            "polygon": [
                                7.4333,
                                10.1732,
                                7.4939,
                                10.1732,
                                7.4939,
                                10.273,
                                7.4333,
                                10.273
                            ],
                            "spans": [
                                {
                                    "offset": 42,
                                    "length": 1
                                }
                            ]
                        }
                    ],
                    "spans": [
                        {
                            "offset": 33,
                            "length": 10
                        }
                    ],
                    "kind": "document"
                }
            ],
            "tables": [],
            "paragraphs": [
                {
                    "spans": [
                        {
                            "offset": 0,
                            "length": 7
                        }
                    ],
                    "boundingRegions": [
                        {
                            "pageNumber": 1,
                            "polygon": [
                                3.7708,
                                1.0779,
                                4.7416,
                                1.0779,
                                4.7416,
                                1.3302,
                                3.7708,
                                1.3302
                            ]
                        }
                    ],
                    "role": "title",
                    "content": "Heading"
                },
                {
                    "spans": [
                        {
                            "offset": 8,
                            "length": 22
                        }
                    ],
                    "boundingRegions": [
                        {
                            "pageNumber": 1,
                            "polygon": [
                                3.419,
                                2.0336,
                                5.076,
                                2.0336,
                                5.076,
                                2.1781,
                                3.419,
                                2.1781
                            ]
                        }
                    ],
                    "content": "Welcome to my channel."
                },
                {
                    "spans": [
                        {
                            "offset": 31,
                            "length": 1
                        }
                    ],
                    "boundingRegions": [
                        {
                            "pageNumber": 1,
                            "polygon": [
                                7.4378,
                                10.1741,
                                7.4942,
                                10.1741,
                                7.4942,
                                10.273,
                                7.4378,
                                10.273
                            ]
                        }
                    ],
                    "role": "pageNumber",
                    "content": "1"
                },
                {
                    "spans": [
                        {
                            "offset": 33,
                            "length": 8
                        }
                    ],
                    "boundingRegions": [
                        {
                            "pageNumber": 2,
                            "polygon": [
                                3.9465,
                                2.6969,
                                4.5528,
                                2.6969,
                                4.5528,
                                2.8414,
                                3.9465,
                                2.8414
                            ]
                        }
                    ],
                    "role": "title",
                    "content": "Sample 2"
                },
                {
                    "spans": [
                        {
                            "offset": 42,
                            "length": 1
                        }
                    ],
                    "boundingRegions": [
                        {
                            "pageNumber": 2,
                            "polygon": [
                                7.4333,
                                10.1732,
                                7.4939,
                                10.1732,
                                7.4939,
                                10.273,
                                7.4333,
                                10.273
                            ]
                        }
                    ],
                    "role": "pageNumber",
                    "content": "2"
                }
            ],
            "keyValuePairs": [],
            "styles": [],
            "documents": [
                {
                    "docType": "invoice",
                    "boundingRegions": [
                        {
                            "pageNumber": 1,
                            "polygon": [
                                0,
                                0,
                                8.5,
                                0,
                                8.5,
                                11,
                                0,
                                11
                            ]
                        },
                        {
                            "pageNumber": 2,
                            "polygon": [
                                0,
                                0,
                                8.5,
                                0,
                                8.5,
                                11,
                                0,
                                11
                            ]
                        }
                    ],
                    "fields": {},
                    "confidence": 1,
                    "spans": [
                        {
                            "offset": 0,
                            "length": 43
                        }
                    ]
                }
            ]
        }
    }
    

    If we compare the JSON with the Python script, Form Recognizer is limited to a set of keywords that belong to the invoice model. Checking the page number may feel difficult in Python, but the JSON does identify it (see the paragraphs whose "role" is "pageNumber"). Check the screenshots below.

    enter image description here

    Text recognition was successful. In the image below, we can see that Form Recognizer identified the page number, which is highlighted in yellow.

    enter image description here