Search code examples
python pdf pypdf python-camelot

Dealing with PDFs containing both tables and non-tabular data using Camelot PDF parser


I am using the Camelot PDF parsing library to extract data from PDF files, but I am facing an issue when the PDFs contain both tables and non-tabular data. Camelot seems to extract only the table data and ignore the non-tabular content. Here is the code snippet I am using:

from langchain.document_loaders.csv_loader import CSVLoader

import camelot
import uuid
from camelot.core import TableList

def export_tables_as_csv(filepath):
    """Write every table Camelot finds in a PDF to its own CSV file.

    Files are named ``table_1.csv``, ``table_2.csv``, ... (relative to the
    current working directory), numbered in the order Camelot returns the
    tables.

    Args:
        filepath: Path to the PDF, passed straight to ``camelot.read_pdf``.
    """
    tables = camelot.read_pdf(filepath, backend="ghostscript")
    for number, table in enumerate(tables, start=1):
        # Export only the current table. Calling ``tables.export`` here
        # would re-export the whole list on every iteration.
        table.to_csv(f'table_{number}.csv')

def generate_random_filename():
    """Return a newly generated random UUID (``uuid.uuid4``) as a string."""
    random_id = uuid.uuid4()
    return str(random_id)


from collections import namedtuple

# Lightweight immutable record with two fields: ``page_content`` and
# ``metadata``.
Document = namedtuple('Document', ['page_content', 'metadata'])

def formChunksForTable(filepath=None, file_type=None, url=None):
    """Turn each row of each table found in a PDF into a ``Document``.

    Tables are read with Camelot's ``stream`` flavor. Every row of a
    non-empty table becomes one ``Document`` whose ``page_content`` is the
    row's cells joined by single spaces and whose ``metadata`` holds a
    ``'source'`` label built from the 1-based table index and row index.

    Args:
        filepath: Path to the PDF. When falsy, an error is printed and an
            empty list is returned.
        file_type: Accepted but not used by this function.
        url: Accepted but not used by this function.

    Returns:
        A list of ``Document`` tuples; empty when no filepath is given, no
        rows are found, or any exception is raised (the error is printed).
    """
    try:
        if not filepath:
            print("Error: Filepath is missing.")
            return []

        extracted = camelot.read_pdf(filepath, backend="ghostscript", flavor='stream')
        if not isinstance(extracted, TableList):
            print("No tables found in the PDF.")
            return []

        all_docs = []
        for table_no, table in enumerate(extracted, start=1):
            frame = table.df
            if frame is None or frame.empty:
                print(f"Warning: Table {table_no} is empty.")
                continue
            # One Document per row; the label uses the table's position in
            # the result list and the row's position in the table.
            all_docs.extend(
                Document(
                    ' '.join(cells),
                    {'source': f'table-page-{table_no}-row-{row_no}'},
                )
                for row_no, cells in enumerate(frame.values, start=1)
            )

        if not all_docs:
            print("No valid tables found in the PDF.")
        else:
            print("Documents:", all_docs)
        return all_docs
    except Exception as e:
        print(f"Error: {e}")
        return []


Solution

  • Libraries like Camelot and Tabula are designed to extract tables from PDFs. If you want to extract text, images, annotations, etc., I would suggest using PyMuPDF.

    Personally, I got very good results from a pipeline that combines Camelot and PyMuPDF, with functions that let me specify the particular pages the data comes from (see the code below):

    # Libraries to extract data from PDF:
    import fitz # raw text from PyMuPDF
    from camelot import read_pdf # tables
    
    class DataExtractor:
        """Pull raw text, page dimensions and tables out of a single PDF.

        Text and page geometry come from PyMuPDF (``fitz``); tables come from
        Camelot's ``read_pdf``. Every method reads the PDF path from
        ``self.file``.
        """

        # __init__ and other functions here...

        def get_text(self, page_no):
            """Return the text blocks of one page (using Fitz).

            Args:
                page_no: Page index passed to ``Document.load_page``.

            Returns:
                The result of ``page.get_text("blocks", sort=False)``.
            """
            # The context manager closes the document even when loading the
            # page or extracting its text raises.
            with fitz.open(self.file) as doc:
                return doc.load_page(page_no).get_text("blocks", sort=False)

        def get_page_size(self, page_no):
            """Return the ``(width, height)`` of one page (using Fitz).

            Args:
                page_no: Page index used to subscript the opened document.

            Returns:
                A ``(width, height)`` tuple taken from ``page.rect``.
            """
            with fitz.open(self.file) as doc:
                page = doc[page_no]
                # Use rect rather than mediabox because some documents have
                # rotated pages.
                return page.rect.width, page.rect.height

        def get_tables(self, page_no):
            """Return the tables found on the requested page(s) (using Camelot).

            Args:
                page_no: Page selection for Camelot. It is converted with
                    ``str`` so both ``3`` and ``"3"`` are accepted.

            Returns:
                Whatever ``read_pdf`` returns, or ``None`` if it raised.
            """
            # NOTE(review): Fitz page indices are 0-based while Camelot page
            # numbers appear to be 1-based - confirm callers pass the right one.
            # NOTE(review): ``multiple_tables`` is kept from the original call;
            # confirm it is an option Camelot actually honours.
            try:
                return read_pdf(self.file, pages=str(page_no), multiple_tables=True)
            except Exception:
                # Best-effort extraction: a failure means "no tables", but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                return None