The following code will use Azure Cognitive Form Analyser to extract text from PDFs. However, I would like help modifying the code to show page numbers. Can someone help with that?
import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
# field_list = ["result.content"]
# AzureKeyCredential is used below but was not imported in this snippet.
from azure.core.credentials import AzureKeyCredential

# NOTE(review): `endpoint`, `key`, `container` and `container_url` are not
# defined in this snippet; they are assumed to be set up earlier in the script.
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

# One row per (blob, page) so every piece of text keeps its page number.
page_rows = []

for blob in container.list_blobs():
    blob_url = container_url + "/" + blob.name
    poller = document_analysis_client.begin_analyze_document_from_url(
        "prebuilt-read", blob_url)
    result = poller.result()
    print("Scanning " + blob.name + "...")
    print("document contains", result.content)
    mydf = result.content

    # `result.content` is the text of the whole document; the per-page
    # breakdown lives in `result.pages`, where each page carries its own
    # `page_number` and the lines of text found on that page.
    for page in result.pages:
        page_text = "\n".join(line.content for line in (page.lines or []))
        print("Page", page.page_number, "contains", page_text)
        page_rows.append(
            {
                "blob": blob.name,
                "page_number": page.page_number,
                "content": page_text,
            }
        )

# Tabular view of the extracted text: one row per page of every scanned blob.
pages_df = pd.DataFrame(page_rows, columns=["blob", "page_number", "content"])
Thanks
Technically, Form Recognizer does not expose an implicit field keyword such as pageNumber that you can code against to make it recognize page numbers. The following code block shows the fields that the prebuilt invoice model recognizes by default. The recognizer sticks to a small set of keywords that are pre-modeled.
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
endpoint = "YOUR_FORM_RECOGNIZER_ENDPOINT"
key = "YOUR_FORM_RECOGNIZER_SUBSCRIPTION_KEY"
formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/invoice_sample.jpg"

# Each entry is (field key in the analysis result, label used when printing).
# The tuples are ordered exactly as the fields are reported.
_INVOICE_FIELDS_BEFORE_ITEMS = (
    ("VendorName", "Vendor Name"),
    ("VendorAddress", "Vendor Address"),
    ("VendorAddressRecipient", "Vendor Address Recipient"),
    ("CustomerName", "Customer Name"),
    ("CustomerId", "Customer Id"),
    ("CustomerAddress", "Customer Address"),
    ("CustomerAddressRecipient", "Customer Address Recipient"),
    ("InvoiceId", "Invoice Id"),
    ("InvoiceDate", "Invoice Date"),
    ("InvoiceTotal", "Invoice Total"),
    ("DueDate", "Due Date"),
    ("PurchaseOrder", "Purchase Order"),
    ("BillingAddress", "Billing Address"),
    ("BillingAddressRecipient", "Billing Address Recipient"),
    ("ShippingAddress", "Shipping Address"),
    ("ShippingAddressRecipient", "Shipping Address Recipient"),
)

_ITEM_FIELDS = (
    ("Description", "Description"),
    ("Quantity", "Quantity"),
    ("Unit", "Unit"),
    ("UnitPrice", "Unit Price"),
    ("ProductCode", "Product Code"),
    ("Date", "Date"),
    ("Tax", "Tax"),
    ("Amount", "Amount"),
)

_INVOICE_FIELDS_AFTER_ITEMS = (
    ("SubTotal", "Subtotal"),
    ("TotalTax", "Total Tax"),
    ("PreviousUnpaidBalance", "Previous Unpaid Balance"),
    ("AmountDue", "Amount Due"),
    ("ServiceStartDate", "Service Start Date"),
    ("ServiceEndDate", "Service End Date"),
    ("ServiceAddress", "Service Address"),
    ("ServiceAddressRecipient", "Service Address Recipient"),
    ("RemittanceAddress", "Remittance Address"),
    ("RemittanceAddressRecipient", "Remittance Address Recipient"),
)


def _print_fields(fields, names, prefix=""):
    """Print the value and confidence of every listed field that is present.

    Args:
        fields: Mapping from field key to a field object exposing ``value``
            and ``confidence``.
        names: Iterable of ``(field_key, label)`` pairs, in print order.
        prefix: Text put in front of each printed line (used to indent the
            per-item fields).
    """
    for field_key, label in names:
        field = fields.get(field_key)
        # Fields the model did not find are simply absent; skip them.
        if field:
            print(
                "{}{}: {} has confidence: {}".format(
                    prefix, label, field.value, field.confidence
                )
            )


document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-invoice", formUrl)
invoices = poller.result()

for idx, invoice in enumerate(invoices.documents):
    print("--------Recognizing invoice #{}--------".format(idx + 1))
    _print_fields(invoice.fields, _INVOICE_FIELDS_BEFORE_ITEMS)

    print("Invoice items:")
    # "Items" is missing when the document has no line items (for example a
    # result whose "fields" is empty); treat that as an empty list rather
    # than calling .value on None.
    items_field = invoice.fields.get("Items")
    invoice_items = (items_field.value or []) if items_field else []
    # A separate index name keeps the outer invoice index `idx` intact.
    for item_idx, item in enumerate(invoice_items):
        print("...Item #{}".format(item_idx + 1))
        _print_fields(item.value, _ITEM_FIELDS, prefix="......")

    _print_fields(invoice.fields, _INVOICE_FIELDS_AFTER_ITEMS)
    print("----------------------------------------")
This is the pre-built structure used to recognize an invoice.
For reference, I created a sample PDF file with page numbers in it. It has 2 pages; I auto-generated the page numbers in Word, converted the document to PDF, and uploaded it to Form Recognizer to see whether the page number would be recognized. The page number was recognized successfully and it appears in the JSON output, but I could not get the same result from the Python code above.
So, technically, it is hard to recognize the page number in Python through a specific keyword such as "pageNumber".
Below is the JSON output of the repro I performed.
{
"status": "succeeded",
"createdDateTime": "2022-06-09T06:22:38Z",
"lastUpdatedDateTime": "2022-06-09T06:22:50Z",
"analyzeResult": {
"apiVersion": "2022-06-30-preview",
"modelId": "prebuilt-invoice",
"stringIndexType": "textElements",
"content": "Heading\nWelcome to my channel.\n1\nSample 2\n2",
"pages": [
{
"pageNumber": 1,
"angle": 0,
"width": 8.5,
"height": 11,
"unit": "inch",
"words": [
{
"content": "Heading",
"polygon": [
3.7708,
1.0779,
4.7416,
1.0779,
4.7416,
1.3302,
3.7708,
1.3302
],
"confidence": 1,
"span": {
"offset": 0,
"length": 7
}
},
{
"content": "Welcome",
"polygon": [
3.419,
2.0336,
4.0422,
2.0336,
4.0422,
2.1499,
3.419,
2.1499
],
"confidence": 1,
"span": {
"offset": 8,
"length": 7
}
},
{
"content": "to",
"polygon": [
4.0914,
2.0498,
4.2242,
2.0498,
4.2242,
2.1499,
4.0914,
2.1499
],
"confidence": 1,
"span": {
"offset": 16,
"length": 2
}
},
{
"content": "my",
"polygon": [
4.281,
2.0684,
4.4712,
2.0684,
4.4712,
2.1781,
4.281,
2.1781
],
"confidence": 1,
"span": {
"offset": 19,
"length": 2
}
},
{
"content": "channel.",
"polygon": [
4.519,
2.0336,
5.076,
2.0336,
5.076,
2.1499,
4.519,
2.1499
],
"confidence": 1,
"span": {
"offset": 22,
"length": 8
}
},
{
"content": "1",
"polygon": [
7.4378,
10.1741,
7.4942,
10.1741,
7.4942,
10.273,
7.4378,
10.273
],
"confidence": 1,
"span": {
"offset": 31,
"length": 1
}
}
],
"lines": [
{
"content": "Heading",
"polygon": [
3.7708,
1.0779,
4.7416,
1.0779,
4.7416,
1.3302,
3.7708,
1.3302
],
"spans": [
{
"offset": 0,
"length": 7
}
]
},
{
"content": "Welcome to my channel.",
"polygon": [
3.419,
2.0336,
5.076,
2.0336,
5.076,
2.1781,
3.419,
2.1781
],
"spans": [
{
"offset": 8,
"length": 22
}
]
},
{
"content": "1",
"polygon": [
7.4378,
10.1741,
7.4942,
10.1741,
7.4942,
10.273,
7.4378,
10.273
],
"spans": [
{
"offset": 31,
"length": 1
}
]
}
],
"spans": [
{
"offset": 0,
"length": 32
}
],
"kind": "document"
},
{
"pageNumber": 2,
"angle": 0,
"width": 8.5,
"height": 11,
"unit": "inch",
"words": [
{
"content": "Sample",
"polygon": [
3.9465,
2.6969,
4.4322,
2.6969,
4.4322,
2.8414,
3.9465,
2.8414
],
"confidence": 1,
"span": {
"offset": 33,
"length": 6
}
},
{
"content": "2",
"polygon": [
4.487,
2.7034,
4.5528,
2.7034,
4.5528,
2.8119,
4.487,
2.8119
],
"confidence": 1,
"span": {
"offset": 40,
"length": 1
}
},
{
"content": "2",
"polygon": [
7.4333,
10.1732,
7.4939,
10.1732,
7.4939,
10.273,
7.4333,
10.273
],
"confidence": 1,
"span": {
"offset": 42,
"length": 1
}
}
],
"lines": [
{
"content": "Sample 2",
"polygon": [
3.9465,
2.6969,
4.5528,
2.6969,
4.5528,
2.8414,
3.9465,
2.8414
],
"spans": [
{
"offset": 33,
"length": 8
}
]
},
{
"content": "2",
"polygon": [
7.4333,
10.1732,
7.4939,
10.1732,
7.4939,
10.273,
7.4333,
10.273
],
"spans": [
{
"offset": 42,
"length": 1
}
]
}
],
"spans": [
{
"offset": 33,
"length": 10
}
],
"kind": "document"
}
],
"tables": [],
"paragraphs": [
{
"spans": [
{
"offset": 0,
"length": 7
}
],
"boundingRegions": [
{
"pageNumber": 1,
"polygon": [
3.7708,
1.0779,
4.7416,
1.0779,
4.7416,
1.3302,
3.7708,
1.3302
]
}
],
"role": "title",
"content": "Heading"
},
{
"spans": [
{
"offset": 8,
"length": 22
}
],
"boundingRegions": [
{
"pageNumber": 1,
"polygon": [
3.419,
2.0336,
5.076,
2.0336,
5.076,
2.1781,
3.419,
2.1781
]
}
],
"content": "Welcome to my channel."
},
{
"spans": [
{
"offset": 31,
"length": 1
}
],
"boundingRegions": [
{
"pageNumber": 1,
"polygon": [
7.4378,
10.1741,
7.4942,
10.1741,
7.4942,
10.273,
7.4378,
10.273
]
}
],
"role": "pageNumber",
"content": "1"
},
{
"spans": [
{
"offset": 33,
"length": 8
}
],
"boundingRegions": [
{
"pageNumber": 2,
"polygon": [
3.9465,
2.6969,
4.5528,
2.6969,
4.5528,
2.8414,
3.9465,
2.8414
]
}
],
"role": "title",
"content": "Sample 2"
},
{
"spans": [
{
"offset": 42,
"length": 1
}
],
"boundingRegions": [
{
"pageNumber": 2,
"polygon": [
7.4333,
10.1732,
7.4939,
10.1732,
7.4939,
10.273,
7.4333,
10.273
]
}
],
"role": "pageNumber",
"content": "2"
}
],
"keyValuePairs": [],
"styles": [],
"documents": [
{
"docType": "invoice",
"boundingRegions": [
{
"pageNumber": 1,
"polygon": [
0,
0,
8.5,
0,
8.5,
11,
0,
11
]
},
{
"pageNumber": 2,
"polygon": [
0,
0,
8.5,
0,
8.5,
11,
0,
11
]
}
],
"fields": {},
"confidence": 1,
"spans": [
{
"offset": 0,
"length": 43
}
]
}
]
}
}
If we compare the JSON with the Python script, we can see that the Form Recognizer invoice model is limited to a fixed set of invoice-related keywords. Getting the page number may feel difficult from the Python script, but the JSON output does identify it (see the paragraphs whose "role" is "pageNumber"). Check the screenshots below.
Text recognition was successful. In the image below, we can see that Form Recognizer identified the page number, which is highlighted in yellow.