I am trying to read text from a PDF file with pdfminer, but it does not recognize upside-down text boxes correctly.
This is my code:
from io import BytesIO

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTContainer, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
def find_textboxes_recursively(layout_obj):
    """Collect every ``LTTextBox`` reachable from *layout_obj*.

    A text box is returned as-is (its own children are not searched).
    Any other ``LTContainer`` is walked depth-first, in iteration order.
    Objects that are neither yield an empty list.
    """
    # A text box is a leaf for our purposes: stop descending here.
    if isinstance(layout_obj, LTTextBox):
        return [layout_obj]

    # Non-container objects cannot hold text boxes.
    if not isinstance(layout_obj, LTContainer):
        return []

    # Flatten the results of each child, preserving child order.
    return [
        found
        for member in layout_obj
        for found in find_textboxes_recursively(member)
    ]
def inspect_character(doc_binary):
    """Print the stripped text of every text box in a PDF, page by page.

    Args:
        doc_binary: The raw bytes of the PDF document.

    For each page, the text boxes are collected with
    ``find_textboxes_recursively`` and printed in order of descending
    ``y1`` and then ascending ``x0``.
    """
    laparams = LAParams(detect_vertical=True)
    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=laparams)
    interpreter = PDFPageInterpreter(resource_manager, device)

    # Iterate the pages directly: wrapping the iterator in enumerate()
    # would hand (index, page) tuples to process_page(), which expects
    # the page object itself.
    for page in PDFPage.get_pages(BytesIO(doc_binary)):
        interpreter.process_page(page)
        layout = device.get_result()

        boxes = find_textboxes_recursively(layout)
        # Sort key: larger y1 first, ties broken by smaller x0.
        boxes.sort(key=lambda b: (-b.y1, b.x0))

        for box in boxes:
            text = box.get_text().strip()
            print(text)
The sample:
The result:
veniam aut totam! autem? Alias, quia eius! Aperiam Magni natus atque nam rem quibusdam adipisci magnam mollitia fuga. Hic, sit? adipisicing elit. Deserunt corrupti Lorem ipsum dolor sit amet consectetur
Lorem ipsum dolor sit amet consectetur adipisicing elit. Deserunt corrupti adipisci magnam mollitia fuga. Hic, sit? Magni natus atque nam rem quibusdam autem? Alias, quia eius! Aperiam veniam aut totam!
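The two paragraphs should come out the same, but the first one (the upside-down text box) was extracted with its lines in reverse order; it was not reversed back into reading order.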
The only object that holds direction information is LTChar, so you can inspect the characters of a paragraph to find out whether it is reversed:
def isReverseBox(box):
    """Return True if *box* looks reversed, judged by its first visible char.

    The box is iterated two levels deep (box -> children -> items). The
    first item that is an ``LTChar`` with non-whitespace text decides the
    result: the box counts as reversed when element 0, 2 or 3 of that
    character's ``matrix`` is negative.

    Args:
        box: An iterable layout object exposing ``get_text()`` whose
            children are themselves iterable (presumably an ``LTTextBox``
            holding text lines -- confirm against the caller).

    Returns:
        bool: True when the deciding character's matrix has a negative
        component at index 0, 2 or 3. False otherwise, including when the
        box has no visible text or contains no non-blank ``LTChar``.
    """
    # Nothing visible to inspect: treat as not reversed.
    if not box.get_text().strip():
        return False

    for text_line in box:
        for char in text_line:
            # Skip non-LTChar items and whitespace characters.
            if isinstance(char, LTChar) and char.get_text().strip():
                matrix = char.matrix
                # Only the first visible character is consulted.
                return matrix[0] < 0 or matrix[2] < 0 or matrix[3] < 0

    # No qualifying character found; be explicit instead of returning None.
    return False