Search code examples
regexpdfextractpdfplumber

Problems with table extraction using pdfplumber - empty fields in large tables


The script below extracts a PDF table using pdfpumbler. It is a table that always comes after the second appearance of the expression "Quadro de Definições" - and a table that is 12 pages long. The script works, but some fields are missing content, such as: “Custodiante”, “Fundo”, “Gestora”, “Escriturador”...

Here is the original PDF. It is in Brazilian Portuguese And here is the CSV I generated

Please, does anyone know what strategy I could use to capture all the information?

import logging
import pdfplumber
import pandas as pd
import os
import re

# Configure logging to display informational messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the fixed path for the PDF file
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'

def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from the specified PDF file.
    
    Args:
    pdf_path (str): Path to the PDF file.
    
    Returns:
    list: A list of extracted tables, or None if extraction fails.
    """
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            logging.info(f"PDF loaded successfully. Total pages: {len(pdf.pages)}")

            # Find the second occurrence of "Quadro de Definições"
            page_with_second_occurrence = None
            occurrences = 0
            for page_num, page in enumerate(pdf.pages, start=1):
                if "Quadro de Definições" in page.extract_text():
                    occurrences += 1
                    if occurrences == 2:
                        page_with_second_occurrence = page_num
                        break

            # Check if we found the second occurrence
            if page_with_second_occurrence is None:
                logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                return None
            
            logging.info(f"The second 'Quadro de Definições' appears on page: {page_with_second_occurrence}")

            # Define the range of pages to extract (12 pages after the second occurrence)
            start_page = page_with_second_occurrence
            end_page = min(start_page + 12, len(pdf.pages))

            logging.info(f"Extracting tables from pages {start_page} to {end_page}")

            # Extract tables from the specified range of pages
            tables = []
            for page in pdf.pages[start_page-1:end_page]:
                page_tables = page.extract_tables()
                if page_tables:
                    tables.extend(page_tables)

            logging.info(f"Number of tables extracted: {len(tables)}")

            return tables

    except Exception as e:
        logging.error(f"An error occurred while processing the PDF: {str(e)}")
        return None

def safe_strip(cell):
    """
    Safely strip whitespace from a cell, handling None values.
    
    Args:
    cell: The cell content to strip.
    
    Returns:
    str: The stripped string or an empty string if cell is None.
    """
    if cell is None:
        return ''
    return str(cell).strip()

def process_and_combine_tables(tables):
    """
    Process and combine all extracted tables into a single DataFrame.
    
    Args:
    tables (list): List of tables extracted from the PDF.
    
    Returns:
    pandas.DataFrame: A DataFrame containing all processed and combined table data.
    """
    processed_tables = []
    for table_index, table in enumerate(tables):
        # Remove empty rows
        table = [row for row in table if row and any(safe_strip(cell) != '' for cell in row)]
        
        # Process each row
        processed_rows = []
        for row in table:
            if len(row) == 1:
                # If the row has only one column, split it into two at the first double space
                content = safe_strip(row[0])
                split_row = re.split(r'\s{2,}', content, maxsplit=1)
                processed_rows.append(split_row if len(split_row) == 2 else [split_row[0], ''])
            else:
                # If the row has multiple columns, take the first two
                processed_rows.append([safe_strip(cell) for cell in row[:2]])
        
        processed_tables.extend(processed_rows)
    
    # Create a DataFrame with all processed rows
    df = pd.DataFrame(processed_tables, columns=['Term', 'Definition'])
    
    # Remove rows where the Term is empty
    df = df[df['Term'] != '']
    
    # Remove duplicates, keeping the first occurrence
    df = df.drop_duplicates(subset='Term', keep='first')
    
    # Set 'Term' as the index of the DataFrame
    df.set_index('Term', inplace=True)
    
    return df

if __name__ == "__main__":
    # Log the start of the table extraction process
    logging.info(f"Starting table extraction from file: {PDF_PATH}")
    
    # Extract tables from the PDF
    extracted_tables = extract_tables_from_pdf(PDF_PATH)
    
    if extracted_tables:
        try:
            # Process and combine the extracted tables
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)
            
            # Save the combined DataFrame to a CSV file
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as e:
            logging.error(f"Error processing and combining tables: {str(e)}")
    else:
        logging.warning("Could not extract tables from the PDF.")
    
    # Log the completion of the extraction and combination process
    logging.info("Table extraction and combination process completed.")

Solution

  • The solution is to improve the table extraction by playing with its settings.
    Here is a revised version of you script:

    import logging
    import pdfplumber
    import pandas as pd
    import os
    
    # Configure logging to display informational messages
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    # Set the fixed path for the PDF file
    PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'
    
    # default settings for table extraction from: https://pypi.org/project/pdfplumber/#Table-extraction%20settings
    # settings = {
    #     "vertical_strategy": "lines", 
    #     "horizontal_strategy": "lines",
    #     "explicit_vertical_lines": [],
    #     "explicit_horizontal_lines": [],
    #     "snap_tolerance": 3,
    #     "snap_x_tolerance": 3,
    #     "snap_y_tolerance": 3,
    #     "join_tolerance": 3,
    #     "join_x_tolerance": 3,
    #     "join_y_tolerance": 3,
    #     "edge_min_length": 3,
    #     "min_words_vertical": 3,
    #     "min_words_horizontal": 3,
    #     "intersection_tolerance": 3,
    #     "intersection_x_tolerance": 3,
    #     "intersection_y_tolerance": 3,
    #     "text_tolerance": 3,
    #     "text_x_tolerance": 3,
    #     "text_y_tolerance": 3
    # }
    
    # Custom settings for table extraction 
    settings = {
        "vertical_strategy": "lines", 
        "horizontal_strategy": "lines",
        "snap_x_tolerance": 20,          # impacts number of columns
        "snap_y_tolerance": 2.5,         # impacts rows alignment and number or rows
        "intersection_x_tolerance": 1    # impacts row sep
    }
    
    def extract_tables_from_pdf(pdf_path):
        """
        Extract tables from the specified PDF file.
        
        Args:
        pdf_path (str): Path to the PDF file.
        
        Returns:
        list: A list of extracted tables, or None if extraction fails.
        """
        # Check if the PDF file exists
        if not os.path.exists(pdf_path):
            logging.error(f"The file {pdf_path} does not exist.")
            return None
    
        try:
            # Open the PDF file using pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                logging.info(f"PDF loaded successfully. Total pages: {len(pdf.pages)}")
    
                # Find the second occurrence of "Quadro de Definições"
                page_with_second_occurrence = None
                occurrences = 0
                for page_num, page in enumerate(pdf.pages, start=1):
                    if "Quadro de Definições" in page.extract_text():
                        occurrences += 1
                        if occurrences == 2:
                            page_with_second_occurrence = page_num
                            break
    
                # Check if we found the second occurrence
                if page_with_second_occurrence is None:
                    logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                    return None
                
                logging.info(f"The second 'Quadro de Definições' appears on page: {page_with_second_occurrence}")
    
                # Define the range of pages to extract (12 pages after the second occurrence)
                start_page = page_with_second_occurrence
                end_page = min(start_page + 12, len(pdf.pages))
    
                logging.info(f"Extracting tables from pages {start_page} to {end_page}")
    
                # Extract tables from the specified range of pages
                tables = []
                for page in pdf.pages[start_page-1:end_page]:
                    page_tables = page.extract_tables(table_settings=settings)
                    if page_tables:
                        tables.extend(page_tables)
    
                logging.info(f"Number of tables extracted: {len(tables)}")
    
                return tables
    
        except Exception as e:
            logging.error(f"An error occurred while processing the PDF: {str(e)}")
            return None
    
    
    def process_and_combine_tables(tables):
        """
        Process and combine all extracted tables into a single DataFrame.
        
        Args:
        tables (list): List of tables extracted from the PDF.
        
        Returns:
        pandas.DataFrame: A DataFrame containing all processed and combined table data.
        """
        processed_tables = []
    
        # regroup tables
        regr_tables = []
        for table in tables:
            for row in table:
                regr_tables.append(row)
        table = regr_tables
            
        # regroup definition splitted by page break
        for i in range(len(table)-2, -1, -1):        # iterate from last to first to avoid index offset when deleting list'elements.
            if table[i+1][0]=='':
                table[i][1]+= f"\n{table[i+1][1]}"
                del table[i+1]                        # deleting list's element
    
        processed_tables = table
        
        # Create a DataFrame with all processed rows
        df = pd.DataFrame(processed_tables, columns=['Term', 'Definition'])
        
        # Set 'Term' as the index of the DataFrame
        df.set_index('Term', inplace=True)
        
        return df
    
    
    if __name__ == "__main__":
        # Log the start of the table extraction process
        logging.info(f"Starting table extraction from file: {PDF_PATH}")
        
        # Extract tables from the PDF
        extracted_tables = extract_tables_from_pdf(PDF_PATH)
        
        if extracted_tables:
            try:
                # Process and combine the extracted tables
                df_combined = process_and_combine_tables(extracted_tables)
                print(df_combined)
                
                # Save the combined DataFrame to a CSV file
                csv_path = 'combined definitions framework.csv'
                df_combined.to_csv(csv_path)
                logging.info(f"Combined DataFrame saved to '{csv_path}'")
            except Exception as e:
                logging.error(f"Error processing and combining tables: {str(e)}")
        else:
            logging.warning("Could not extract tables from the PDF.")
        
        # Log the completion of the extraction and combination process
        logging.info("Table extraction and combination process completed.")