I am trying to extract text from a PDF using borb, and I can see there is a clear example of extracting text by font name:
# Example: extract only the text that is rendered in one specific font.
# create a FontNameFilter that is given the font name "Helvetica"
l0: FontNameFilter = FontNameFilter("Helvetica")
# filtered text just gets passed to SimpleTextExtraction
l1: SimpleTextExtraction = SimpleTextExtraction()
l0.add_listener(l1)
# read the Document; the filter is handed to the loader as an event listener
doc: typing.Optional[Document] = None
with open("UMIR-01032023-EN_4.pdf", "rb") as in_file_handle:
    doc = PDF.loads(in_file_handle, [l0])
# check whether we have read a Document
assert doc is not None
# print the extracted text stored under index 0
# (presumably the first page of the document; confirm against the borb docs)
print(l1.get_text()[0])
I wanted to know if there is a way to extract the text by matching the font names against a regex. For example:
If the first font name is: ABCD-Font
and the second font name is: ABCD-Bold-Font
how can I extract the text for both?
As suggested by Tim Roberts in a comment above,
I ended up creating a custom FontNameFilter which compares the font name using startswith
instead of an exact match:
def _event_occurred(self, event: "Event") -> None:
    """Forward events to the listeners, filtering text chunks by font-name prefix.

    A ``ChunkOfTextRenderEvent`` is forwarded only when the name of its font
    starts with ``self._font_name``; every other kind of event is forwarded
    unconditionally.

    :param event: the event that was raised while processing the document
    :return: None
    """
    # filter ChunkOfTextRenderEvent
    if isinstance(event, ChunkOfTextRenderEvent):
        font_name: typing.Optional[str] = event.get_font().get_font_name()
        # The font name is annotated as Optional: a font without a name can
        # never match the prefix, so drop it instead of raising AttributeError.
        if font_name is not None and font_name.startswith(self._font_name):
            for listener in self._listeners:
                listener._event_occurred(event)
        # Always stop here for text chunks; falling through to the default
        # branch would forward non-matching text and defeat the filter.
        return
    # default: anything that is not a text chunk is passed through untouched
    for listener in self._listeners:
        listener._event_occurred(event)