Extract table from PDF with coordinates

I'm trying to extract tables from a multi-page PDF so that I can then highlight some parts of the PDF. For that, I need the coordinates of the text I extract.

I use tabula-py to extract the tables:

import tabula

# Read the tables found on every page of the pdf
# NOTE(review): recent tabula-py versions return a list of DataFrames here
# (one per detected table) rather than a single DataFrame — confirm against
# the installed version
df = tabula.read_pdf("test.pdf", pages='all')

By using the parameter output_format='json', we can retrieve each text together with its coordinates. Here is an example of the dataframe format (the dataframe is shown as a dict, in case you want to load it), followed by the json format:

{0: {0: nan,
  1: nan,
  2: 'Disability',
  3: nan,
  4: 'Category',
  5: nan,
  6: nan,
  7: 'Blind',
  8: 'Low Vision',
  9: nan,
  10: 'Dexterity',
  11: 'Mobility'},
 1: {0: nan,
  1: nan,
  2: nan,
  3: 'Participants',
  4: nan,
  5: nan,
  6: nan,
  7: '5',
  8: '5',
  9: nan,
  10: '5',
  11: '3'},
 2: {0: nan,
  1: nan,
  2: 'Ballots',
  3: nan,
  4: 'Completed',
  5: nan,
  6: nan,
  7: '1',
  8: '2',
  9: nan,
  10: '4',
  11: '3'},
 3: {0: nan,
  1: 'Ballots',
  2: nan,
  3: 'Incomplete/',
  4: nan,
  5: 'Terminated',
  6: nan,
  7: '4',
  8: '3',
  9: nan,
  10: '1',
  11: '0'},
 4: {0: nan,
  1: nan,
  2: nan,
  3: nan,
  4: 'Accuracy',
  5: nan,
  6: nan,
  7: '34.5%, n=1',
  8: '98.3% n=2',
  9: '(97.7%, n=3)',
  10: '98.3%, n=4',
  11: '95.4%, n=3'},
 5: {0: 'Results',
  1: nan,
  2: nan,
  3: nan,
  4: 'Time to',
  5: nan,
  6: 'complete',
  7: '1199 sec, n=1',
  8: '1716 sec, n=3',
  9: '(1934 sec, n=2)',
  10: '1672.1 sec, n=4',
  11: '1416 sec, n=3'}}

{'extraction_method': 'stream',
 'top': 143.0,
 'left': 83.0,
 'width': 461.0,
 'height': 202.0,
 'right': 544.0,
 'bottom': 345.0,
 'data': [[{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 149.0,
    'left': 448.2,
    'width': 38.58997344970703,
    'height': 6.880000114440918,
    'text': 'Results'}],
  [{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 156.56,
    'left': 311.4,
    'width': 36.66999053955078,
    'height': 6.880000114440918,
    'text': 'Ballots'},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''}],
  [{'top': 163.88,
    'left': 90.0,
    'width': 49.150001525878906,
    'height': 6.880000114440918,
    'text': 'Disability'},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 163.88,
    'left': 239.4,
    'width': 36.66999053955078,
    'height': 6.880000114440918,
    'text': 'Ballots'},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''}],
  [{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 171.2,
    'left': 167.16,
    'width': 62.10999298095703,
    'height': 6.880000114440918,
    'text': 'Participants'},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 171.2,
    'left': 311.4,
    'width': 64.3899917602539,
    'height': 6.880000114440918,
    'text': 'Incomplete/'},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''}],
  [{'top': 178.52,
    'left': 90.0,
    'width': 47.230003356933594,
    'height': 6.880000114440918,
    'text': 'Category'},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 178.52,
    'left': 239.4,
    'width': 57.31000518798828,
    'height': 6.880000114440918,
    'text': 'Completed'},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 176.36,
    'left': 400.68,
    'width': 47.350013732910156,
    'height': 6.880000114440918,
    'text': 'Accuracy'},
   {'top': 176.36,
    'left': 483.12,
    'width': 40.749977111816406,
    'height': 6.880000114440918,
    'text': 'Time to'}],
  [{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 185.84,
    'left': 311.4,
    'width': 60.66999053955078,
    'height': 6.880000114440918,
    'text': 'Terminated'},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''}],
  [{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 193.16,
    'left': 478.68,
    'width': 49.629981994628906,
    'height': 6.880000114440918,
    'text': 'complete'}],
  [{'top': 220.11,
    'left': 90.0,
    'width': 25.05999755859375,
    'height': 6.329999923706055,
    'text': 'Blind'},
   {'top': 220.11,
    'left': 195.12,
    'width': 8.020004272460938,
    'height': 6.329999923706055,
    'text': '5'},
   {'top': 220.11,
    'left': 267.24,
    'width': 8.02001953125,
    'height': 6.329999923706055,
    'text': '1'},
   {'top': 220.11,
    'left': 343.68,
    'width': 8.02001953125,
    'height': 6.329999923706055,
    'text': '4'},
   {'top': 220.11,
    'left': 398.28,
    'width': 51.94000244140625,
    'height': 6.329999923706055,
    'text': '34.5%, n=1'},
   {'top': 220.11,
    'left': 471.48,
    'width': 63.819976806640625,
    'height': 6.329999923706055,
    'text': '1199 sec, n=1'}],
  [{'top': 246.03,
    'left': 90.0,
    'width': 50.5,
    'height': 6.329999923706055,
    'text': 'Low Vision'},
   {'top': 246.03,
    'left': 195.12,
    'width': 8.020004272460938,
    'height': 6.329999923706055,
    'text': '5'},
   {'top': 246.03,
    'left': 267.25,
    'width': 8.019989013671875,
    'height': 6.329999923706055,
    'text': '2'},
   {'top': 246.03,
    'left': 343.69,
    'width': 8.019989013671875,
    'height': 6.329999923706055,
    'text': '3'},
   {'top': 246.03,
    'left': 399.61,
    'width': 49.170013427734375,
    'height': 6.329999923706055,
    'text': '98.3% n=2'},
   {'top': 246.03,
    'left': 471.49,
    'width': 63.80999755859375,
    'height': 6.329999923706055,
    'text': '1716 sec, n=3'}],
  [{'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 0.0, 'left': 0.0, 'width': 0.0, 'height': 0.0, 'text': ''},
   {'top': 271.47,
    'left': 394.92,
    'width': 58.66998291015625,
    'height': 6.329999923706055,
    'text': '(97.7%, n=3)'},
   {'top': 271.47,
    'left': 468.24,
    'width': 70.41998291015625,
    'height': 6.329999923706055,
    'text': '(1934 sec, n=2)'}],
  [{'top': 297.39,
    'left': 90.0,
    'width': 43.660003662109375,
    'height': 6.329999923706055,
    'text': 'Dexterity'},
   {'top': 297.39,
    'left': 195.11,
    'width': 8.020004272460938,
    'height': 6.329999923706055,
    'text': '5'},
   {'top': 297.39,
    'left': 267.24,
    'width': 8.02001953125,
    'height': 6.329999923706055,
    'text': '4'},
   {'top': 297.39,
    'left': 343.68,
    'width': 8.02001953125,
    'height': 6.329999923706055,
    'text': '1'},
   {'top': 297.39,
    'left': 398.28,
    'width': 51.94000244140625,
    'height': 6.329999923706055,
    'text': '98.3%, n=4'},
   {'top': 297.39,
    'left': 467.4,
    'width': 72.10000610351562,
    'height': 6.329999923706055,
    'text': '1672.1 sec, n=4'}],
  [{'top': 323.31,
    'left': 90.0,
    'width': 39.69999694824219,
    'height': 6.329999923706055,
    'text': 'Mobility'},
   {'top': 323.31,
    'left': 195.12,
    'width': 8.020004272460938,
    'height': 6.329999923706055,
    'text': '3'},
   {'top': 323.31,
    'left': 267.25,
    'width': 8.019989013671875,
    'height': 6.329999923706055,
    'text': '3'},
   {'top': 323.31,
    'left': 343.69,
    'width': 8.019989013671875,
    'height': 6.329999923706055,
    'text': '0'},
   {'top': 323.31,
    'left': 398.29,
    'width': 51.94000244140625,
    'height': 6.329999923706055,
    'text': '95.4%, n=3'},
   {'top': 323.31,
    'left': 471.49,
    'width': 63.80999755859375,
    'height': 6.329999923706055,
    'text': '1416 sec, n=3'}]]}

Is there a way I could create a dataframe like the first one, but with the coordinates of each text stored as, for example, a Rect()?

Solution

Here is a solution that might work for you. You populate a dataframe with custom TextRect objects, each holding a text and its coordinates. Then you separate the coordinates and the texts into 2 dataframes:

from fitz.fitz import Rect  # Rect coordinates
from tabula import read_pdf  # to extract table dataframe from pdf
import numpy as np
import pandas as pd

class TextRect:
    """
    Class TextRect stores a text together with its bounding box as a Rect object.

    The arguments use the same keys as one cell of the raw json table:
    (left, top) is the top-left corner of the text and (width, height) is
    its extent, so the bounding box runs from (left, top) to
    (left + width, top + height).

    Arguments:
        - top (float) : y coordinate of the top edge of the text
        - left (float) : x coordinate of the left edge of the text
        - width (float) : width of the text
        - height (float) : height of the text
        - text (string) : text of the PDF we are dealing with
    """

    def __init__(self, top, left, width, height, text):
        # Rect expects its corners as (x0, y0, x1, y1) = top-left then
        # bottom-right. Passing them the other way round (right edge first,
        # and top - height) yields an inverted box located above the text.
        # NOTE(review): assumes the json coordinates and the Rect share the
        # same top-left page origin — confirm by highlighting a known cell.
        self._rect = Rect(left, top, left + width, top + height)
        self._text = text

    def get_rect(self):
        """
        Function to return the Rect object (bounding box of the text)
        """
        return self._rect

    def get_text(self):
        """
        Function to return the string text
        """
        return self._text


def transform(x):
    """
    Build a TextRect object out of one cell dictionary of the raw json table.
    Argument:
        - x (dict) : dictionary with keys {'top', 'left', 'width', 'height', 'text'}
    Return:
        - TextRect object holding the text and its coordinates.
    """
    # The dictionary keys match the TextRect constructor arguments one to one
    text_rect = TextRect(**x)
    return text_rect


def transform_text(textRect):
    """
    Read the text stored in a TextRect object.
    Argument:
        - textRect (TextRect) : object exposing get_text()
    Return:
        - text (string)
    """
    text = textRect.get_text()
    return text


def transform_rect(x):
    """
    Read the Rect coordinates stored in a TextRect object.
    Argument:
        - x (TextRect) : object exposing get_rect()
    Return:
        - rectangle coordinates (Rect)
    """
    rect = x.get_rect()
    return rect


def get_text_df(json_df):
    """
    Build the text-only dataframe of a raw json table.
    Arguments :
        - json_df (pandas.DataFrame(dict)) : dataframe built from a raw json
        table; every cell is a dictionary accepted by transform()
    Return : pandas.DataFrame(String)
    """
    # Element-wise wrappers, so that each step can be handed a whole column
    to_text_rect = np.vectorize(transform)
    to_text = np.vectorize(transform_text)

    # First pass : cell dictionaries -> TextRect objects
    text_rect_df = json_df.apply(to_text_rect)

    # Second pass : TextRect objects -> their string text
    return text_rect_df.apply(to_text)


def get_rect_df(json_df):
    """
    Build the coordinates-only dataframe of a raw json table.
    Arguments :
        - json_df (pandas.DataFrame(dict)) : dataframe built from a raw json
        table; every cell is a dictionary accepted by transform()
    Return : pandas.DataFrame(Rect)
    """
    # Element-wise wrapper, so that transform() can be handed a whole column
    to_text_rect = np.vectorize(transform)

    # First pass : cell dictionaries -> TextRect objects
    text_rect_df = json_df.apply(to_text_rect)

    # Second pass, cell by cell : TextRect objects -> their Rect coordinates
    return text_rect_df.applymap(transform_rect)

def extract_df_list(file_path):
    """
    Method to extract 2 lists, one with dataframes containing text tables,
    one with dataframes containing coordinates tables.
    Argument:
        - file_path (string) : String path of the PDF file
    Returns :
        - list_df_txt (list of pandas.DataFrame) : List of DataFrame. Each dataframe holds
        the text of one table found in the PDF.
        - list_df_rect (list of pandas.DataFrame) : List of DataFrame. Each dataframe holds
        the text coordinates of one table found in the PDF.
        Both lists follow the same order, so list_df_txt[i] and list_df_rect[i]
        describe the same table.
    """
    # With output_format='json', read_pdf already returns a list holding one
    # dictionary per table found, so it is iterated directly. Appending that
    # list to another list would make the loop below index a list with
    # 'data' and fail with a TypeError.
    json_tables = read_pdf(file_path, pages='all', output_format='json')

    # Extract txt and coordinates from json
    list_df_txt = []
    list_df_rect = []

    # For every json table found, we extract text and coordinates
    for json_table in json_tables:
        # Build the raw dataframe once; both views are derived from it
        # without modifying it
        raw_df = pd.DataFrame(json_table['data'])
        list_df_txt.append(get_text_df(raw_df))
        list_df_rect.append(get_rect_df(raw_df))

    return list_df_txt, list_df_rect

# Get your 2 lists of dataframes : the text tables and the matching
# coordinates tables ("file_path_pdf" stands for the path of your PDF file)
list_df_1_txt, list_df_1_rect = extract_df_list("file_path_pdf")