I'm trying to extract text from a multi-page PDF and then highlight some parts of that PDF. For that, I need the coordinates of the text I extract.
I use tabula-py to extract the tables with:
import tabula
# Read the tables found on every page of the PDF.
# NOTE: with pages='all', recent tabula-py versions return a list of
# DataFrames (one per detected table) rather than a single DataFrame.
df = tabula.read_pdf("test.pdf", pages='all')
By using the parameter output_format='json', we can retrieve each text together with its coordinates. Here is an example of the DataFrame format (shown as a dict here, in case you want to load it) followed by the JSON format:
{0: {0: nan,
1: nan,
2: 'Disability',
3: nan,
4: 'Category',
5: nan,
6: nan,
7: 'Blind',
8: 'Low Vision',
9: nan,
10: 'Dexterity',
11: 'Mobility'},
1: {0: nan,
1: nan,
2: nan,
3: 'Participants',
4: nan,
5: nan,
6: nan,
7: '5',
8: '5',
9: nan,
10: '5',
11: '3'},
2: {0: nan,
1: nan,
2: 'Ballots',
3: nan,
4: 'Completed',
5: nan,
6: nan,
7: '1',
8: '2',
9: nan,
10: '4',
11: '3'},
3: {0: nan,
1: 'Ballots',
2: nan,
3: 'Incomplete/',
4: nan,
5: 'Terminated',
6: nan,
7: '4',
8: '3',
9: nan,
10: '1',
11: '0'},
4: {0: nan,
1: nan,
2: nan,
3: nan,
4: 'Accuracy',
5: nan,
6: nan,
7: '34.5%, n=1',
8: '98.3% n=2',
9: '(97.7%, n=3)',
10: '98.3%, n=4',
11: '95.4%, n=3'},
5: {0: 'Results',
1: nan,
2: nan,
3: nan,
4: 'Time to',
5: nan,
6: 'complete',
7: '1199 sec, n=1',
8: '1716 sec, n=3',
9: '(1934 sec, n=2)',
10: '1672.1 sec, n=4',
11: '1416 sec, n=3'}}
{'extraction_method': 'stream',
'top': 143.0,
'left': 83.0,
'width': 461.0,
'height': 202.0,
'right': 544.0,
'bottom': 345.0,
'data': [[{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 149.0,
'left': 448.2,
'width': 38.58997344970703,
'height': 6.880000114440918,
'text': 'Results'}],
[{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 156.56,
'left': 311.4,
'width': 36.66999053955078,
'height': 6.880000114440918,
'text': 'Ballots'},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''}],
[{'top': 163.88,
'left': 90.0,
'width': 49.150001525878906,
'height': 6.880000114440918,
'text': 'Disability'},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 163.88,
'left': 239.4,
'width': 36.66999053955078,
'height': 6.880000114440918,
'text': 'Ballots'},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''}],
[{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 171.2,
'left': 167.16,
'width': 62.10999298095703,
'height': 6.880000114440918,
'text': 'Participants'},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 171.2,
'left': 311.4,
'width': 64.3899917602539,
'height': 6.880000114440918,
'text': 'Incomplete/'},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''}],
[{'top': 178.52,
'left': 90.0,
'width': 47.230003356933594,
'height': 6.880000114440918,
'text': 'Category'},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 178.52,
'left': 239.4,
'width': 57.31000518798828,
'height': 6.880000114440918,
'text': 'Completed'},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 176.36,
'left': 400.68,
'width': 47.350013732910156,
'height': 6.880000114440918,
'text': 'Accuracy'},
{'top': 176.36,
'left': 483.12,
'width': 40.749977111816406,
'height': 6.880000114440918,
'text': 'Time to'}],
[{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 185.84,
'left': 311.4,
'width': 60.66999053955078,
'height': 6.880000114440918,
'text': 'Terminated'},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''}],
[{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 193.16,
'left': 478.68,
'width': 49.629981994628906,
'height': 6.880000114440918,
'text': 'complete'}],
[{'top': 220.11,
'left': 90.0,
'width': 25.05999755859375,
'height': 6.329999923706055,
'text': 'Blind'},
{'top': 220.11,
'left': 195.12,
'width': 8.020004272460938,
'height': 6.329999923706055,
'text': '5'},
{'top': 220.11,
'left': 267.24,
'width': 8.02001953125,
'height': 6.329999923706055,
'text': '1'},
{'top': 220.11,
'left': 343.68,
'width': 8.02001953125,
'height': 6.329999923706055,
'text': '4'},
{'top': 220.11,
'left': 398.28,
'width': 51.94000244140625,
'height': 6.329999923706055,
'text': '34.5%, n=1'},
{'top': 220.11,
'left': 471.48,
'width': 63.819976806640625,
'height': 6.329999923706055,
'text': '1199 sec, n=1'}],
[{'top': 246.03,
'left': 90.0,
'width': 50.5,
'height': 6.329999923706055,
'text': 'Low Vision'},
{'top': 246.03,
'left': 195.12,
'width': 8.020004272460938,
'height': 6.329999923706055,
'text': '5'},
{'top': 246.03,
'left': 267.25,
'width': 8.019989013671875,
'height': 6.329999923706055,
'text': '2'},
{'top': 246.03,
'left': 343.69,
'width': 8.019989013671875,
'height': 6.329999923706055,
'text': '3'},
{'top': 246.03,
'left': 399.61,
'width': 49.170013427734375,
'height': 6.329999923706055,
'text': '98.3% n=2'},
{'top': 246.03,
'left': 471.49,
'width': 63.80999755859375,
'height': 6.329999923706055,
'text': '1716 sec, n=3'}],
[{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
{'top': 271.47,
'left': 394.92,
'width': 58.66998291015625,
'height': 6.329999923706055,
'text': '(97.7%, n=3)'},
{'top': 271.47,
'left': 468.24,
'width': 70.41998291015625,
'height': 6.329999923706055,
'text': '(1934 sec, n=2)'}],
[{'top': 297.39,
'left': 90.0,
'width': 43.660003662109375,
'height': 6.329999923706055,
'text': 'Dexterity'},
{'top': 297.39,
'left': 195.11,
'width': 8.020004272460938,
'height': 6.329999923706055,
'text': '5'},
{'top': 297.39,
'left': 267.24,
'width': 8.02001953125,
'height': 6.329999923706055,
'text': '4'},
{'top': 297.39,
'left': 343.68,
'width': 8.02001953125,
'height': 6.329999923706055,
'text': '1'},
{'top': 297.39,
'left': 398.28,
'width': 51.94000244140625,
'height': 6.329999923706055,
'text': '98.3%, n=4'},
{'top': 297.39,
'left': 467.4,
'width': 72.10000610351562,
'height': 6.329999923706055,
'text': '1672.1 sec, n=4'}],
[{'top': 323.31,
'left': 90.0,
'width': 39.69999694824219,
'height': 6.329999923706055,
'text': 'Mobility'},
{'top': 323.31,
'left': 195.12,
'width': 8.020004272460938,
'height': 6.329999923706055,
'text': '3'},
{'top': 323.31,
'left': 267.25,
'width': 8.019989013671875,
'height': 6.329999923706055,
'text': '3'},
{'top': 323.31,
'left': 343.69,
'width': 8.019989013671875,
'height': 6.329999923706055,
'text': '0'},
{'top': 323.31,
'left': 398.29,
'width': 51.94000244140625,
'height': 6.329999923706055,
'text': '95.4%, n=3'},
{'top': 323.31,
'left': 471.49,
'width': 63.80999755859375,
'height': 6.329999923706055,
'text': '1416 sec, n=3'}]]}
Is there a way I could create a DataFrame like the first one, but with the coordinates of each text stored as, for example, a Rect()?
Here is a solution that might work for you. You populate a DataFrame with custom TextRect objects, each holding a text and its coordinates. Then you split the coordinates and the texts into two DataFrames:
from fitz.fitz import Rect # Rect coordinates
from tabula import read_pdf # to extract table dataframe from pdf
import numpy as np
import pandas as pd
class TextRect:
    """
    Store a piece of PDF text together with its bounding box as a Rect.

    Arguments:
    - top (float) : y coordinate of the top edge of the text
    - left (float) : x coordinate of the left edge of the text
    - width (float) : width of the text box
    - height (float) : height of the text box
    - text (string) : text of the PDF we are dealing with
    """

    def __init__(self, top, left, width, height, text):
        # Rect expects (x0, y0, x1, y1) with (x0, y0) the top-left corner and
        # (x1, y1) the bottom-right corner. In tabula's JSON the y axis grows
        # downwards (bottom == top + height, right == left + width), so the
        # box spans from (left, top) to (left + width, top + height).
        self._rect = Rect(left, top, left + width, top + height)
        self._text = text

    def get_rect(self):
        """
        Function to return the Rect object
        """
        return self._rect

    def get_text(self):
        """
        Function to return the string text
        """
        return self._text
def transform(x):
    """
    Build a TextRect object from one cell dictionary of the JSON table.

    Argument:
    - x (dict) : dictionary with keys {'top', 'left', 'width', 'height', 'text'}
    Return:
    - TextRect object holding the text and the coordinates of the text.
    """
    # The dictionary keys are passed straight through as keyword arguments.
    text_rect = TextRect(**x)
    return text_rect
def transform_text(textRect):
    """
    Extract the text stored in a TextRect object.

    Argument:
    - textRect (TextRect) : object exposing a get_text() method
    Return:
    - text (string) : whatever textRect.get_text() returns
    """
    text = textRect.get_text()
    return text
def transform_rect(x):
    """
    Extract the Rect coordinates stored in a TextRect object.

    Argument:
    - x (TextRect) : object exposing a get_rect() method
    Return:
    - rectangle coordinates (Rect) : whatever x.get_rect() returns
    """
    rect = x.get_rect()
    return rect
def get_text_df(json_df):
    """
    Build a dataframe that contains only the text of each cell.

    Arguments :
    - json_df (pandas.DataFrame(dict)) : dataframe built from the raw json
      table, where every cell is a dictionary accepted by transform()
    Return : pandas.DataFrame(String)
    """
    # np.vectorize lets the per-cell functions be applied to a whole column
    # at once; it is a convenience wrapper around a Python-level loop.
    to_text_rect = np.vectorize(transform)
    to_text = np.vectorize(transform_text)
    # First pass: cell dictionaries -> TextRect objects.
    text_rect_df = json_df.apply(to_text_rect)
    # Second pass: TextRect objects -> their text.
    return text_rect_df.apply(to_text)
def get_rect_df(json_df):
    """
    Build a dataframe that contains only the Rect coordinates of each cell.

    Arguments :
    - json_df (pandas.DataFrame(dict)) : dataframe built from the raw json
      table, where every cell is a dictionary accepted by transform()
    Return : pandas.DataFrame(Rect)
    """
    # First pass: cell dictionaries -> TextRect objects, column by column.
    text_rect_df = json_df.apply(np.vectorize(transform))
    # Second pass: TextRect objects -> Rect coordinates, cell by cell.
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map, so prefer map and fall back on older pandas versions.
    if hasattr(text_rect_df, "map"):
        return text_rect_df.map(transform_rect)
    return text_rect_df.applymap(transform_rect)
def extract_df_list(file_path):
    """
    Extract 2 lists from a PDF: one with dataframes containing the text of
    each table, one with dataframes containing the coordinates of each table.

    Argument:
    - file_path (string) : String path of the PDF file
    Returns :
    - list_df_txt (list of pandas.DataFrame) : one dataframe per table found
      in the document, holding the text of each cell.
    - list_df_rect (list of pandas.DataFrame) : one dataframe per table found
      in the document, holding the Rect coordinates of each cell.
    """
    # With output_format='json', read_pdf already returns a list holding one
    # dictionary per table found, so it is iterated directly. Appending it to
    # another list would nest it and make json_table['data'] fail below.
    json_tables = read_pdf(file_path, pages='all', output_format='json')
    if isinstance(json_tables, dict):
        # Tolerate a single table dictionary as well.
        json_tables = [json_tables]

    list_df_txt = []
    list_df_rect = []
    for json_table in json_tables:
        # 'data' is the list of rows; each row is a list of cell dictionaries.
        cells_df = pd.DataFrame(json_table['data'])
        list_df_txt.append(get_text_df(cells_df))
        list_df_rect.append(get_rect_df(cells_df))
    return list_df_txt, list_df_rect
# Get your 2 lists of dataframes (texts and coordinates) by passing the
# path of your PDF file in place of the "file_path_pdf" placeholder.
list_df_1_txt, list_df_1_rect = extract_df_list("file_path_pdf")