Problems with table extraction using pdfplumber - empty fields in large tables

The script below extracts a PDF table using pdfpumbler. It is a table that always comes after the second appearance of the expression "Quadro de Definições" - and a table that is 12 pages long. The script works, but some fields are missing content, such as: “Custodiante”, “Fundo”, “Gestora”, “Escriturador”...

Here is the original PDF. It is in Brazilian Portuguese And here is the CSV I generated

Please, does anyone know what strategy I could use to capture all the information?

import logging
import pdfplumber
import pandas as pd
import os
import re

# Configure logging to display informational messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the fixed path for the PDF file
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'

def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from the specified PDF file.
    
    Args:
    pdf_path (str): Path to the PDF file.
    
    Returns:
    list: A list of extracted tables, or None if extraction fails.
    """
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            logging.info(f"PDF loaded successfully. Total pages: {len(pdf.pages)}")

            # Find the second occurrence of "Quadro de Definições"
            page_with_second_occurrence = None
            occurrences = 0
            for page_num, page in enumerate(pdf.pages, start=1):
                if "Quadro de Definições" in page.extract_text():
                    occurrences += 1
                    if occurrences == 2:
                        page_with_second_occurrence = page_num
                        break

            # Check if we found the second occurrence
            if page_with_second_occurrence is None:
                logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                return None
            
            logging.info(f"The second 'Quadro de Definições' appears on page: {page_with_second_occurrence}")

            # Define the range of pages to extract (12 pages after the second occurrence)
            start_page = page_with_second_occurrence
            end_page = min(start_page + 12, len(pdf.pages))

            logging.info(f"Extracting tables from pages {start_page} to {end_page}")

            # Extract tables from the specified range of pages
            tables = []
            for page in pdf.pages[start_page-1:end_page]:
                page_tables = page.extract_tables()
                if page_tables:
                    tables.extend(page_tables)

            logging.info(f"Number of tables extracted: {len(tables)}")

            return tables

    except Exception as e:
        logging.error(f"An error occurred while processing the PDF: {str(e)}")
        return None

def safe_strip(cell):
    """
    Safely strip whitespace from a cell, handling None values.
    
    Args:
    cell: The cell content to strip.
    
    Returns:
    str: The stripped string or an empty string if cell is None.
    """
    if cell is None:
        return ''
    return str(cell).strip()

def process_and_combine_tables(tables):
    """
    Process and combine all extracted tables into a single DataFrame.
    
    Args:
    tables (list): List of tables extracted from the PDF.
    
    Returns:
    pandas.DataFrame: A DataFrame containing all processed and combined table data.
    """
    processed_tables = []
    for table_index, table in enumerate(tables):
        # Remove empty rows
        table = [row for row in table if row and any(safe_strip(cell) != '' for cell in row)]
        
        # Process each row
        processed_rows = []
        for row in table:
            if len(row) == 1:
                # If the row has only one column, split it into two at the first double space
                content = safe_strip(row[0])
                split_row = re.split(r'\s{2,}', content, maxsplit=1)
                processed_rows.append(split_row if len(split_row) == 2 else [split_row[0], ''])
            else:
                # If the row has multiple columns, take the first two
                processed_rows.append([safe_strip(cell) for cell in row[:2]])
        
        processed_tables.extend(processed_rows)
    
    # Create a DataFrame with all processed rows
    df = pd.DataFrame(processed_tables, columns=['Term', 'Definition'])
    
    # Remove rows where the Term is empty
    df = df[df['Term'] != '']
    
    # Remove duplicates, keeping the first occurrence
    df = df.drop_duplicates(subset='Term', keep='first')
    
    # Set 'Term' as the index of the DataFrame
    df.set_index('Term', inplace=True)
    
    return df

if __name__ == "__main__":
    # Log the start of the table extraction process
    logging.info(f"Starting table extraction from file: {PDF_PATH}")
    
    # Extract tables from the PDF
    extracted_tables = extract_tables_from_pdf(PDF_PATH)
    
    if extracted_tables:
        try:
            # Process and combine the extracted tables
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)
            
            # Save the combined DataFrame to a CSV file
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as e:
            logging.error(f"Error processing and combining tables: {str(e)}")
    else:
        logging.warning("Could not extract tables from the PDF.")
    
    # Log the completion of the extraction and combination process
    logging.info("Table extraction and combination process completed.")

Solution

The solution is to improve the table extraction by playing with its settings.
Here is a revised version of you script:

import logging
import pdfplumber
import pandas as pd
import os

# Configure logging to display informational messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the fixed path for the PDF file
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'

# default settings for table extraction from: https://pypi.org/project/pdfplumber/#Table-extraction%20settings
# settings = {
#     "vertical_strategy": "lines", 
#     "horizontal_strategy": "lines",
#     "explicit_vertical_lines": [],
#     "explicit_horizontal_lines": [],
#     "snap_tolerance": 3,
#     "snap_x_tolerance": 3,
#     "snap_y_tolerance": 3,
#     "join_tolerance": 3,
#     "join_x_tolerance": 3,
#     "join_y_tolerance": 3,
#     "edge_min_length": 3,
#     "min_words_vertical": 3,
#     "min_words_horizontal": 3,
#     "intersection_tolerance": 3,
#     "intersection_x_tolerance": 3,
#     "intersection_y_tolerance": 3,
#     "text_tolerance": 3,
#     "text_x_tolerance": 3,
#     "text_y_tolerance": 3
# }

# Custom settings for table extraction 
settings = {
    "vertical_strategy": "lines", 
    "horizontal_strategy": "lines",
    "snap_x_tolerance": 20,          # impacts number of columns
    "snap_y_tolerance": 2.5,         # impacts rows alignment and number or rows
    "intersection_x_tolerance": 1    # impacts row sep
}

def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from the specified PDF file.
    
    Args:
    pdf_path (str): Path to the PDF file.
    
    Returns:
    list: A list of extracted tables, or None if extraction fails.
    """
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            logging.info(f"PDF loaded successfully. Total pages: {len(pdf.pages)}")

            # Find the second occurrence of "Quadro de Definições"
            page_with_second_occurrence = None
            occurrences = 0
            for page_num, page in enumerate(pdf.pages, start=1):
                if "Quadro de Definições" in page.extract_text():
                    occurrences += 1
                    if occurrences == 2:
                        page_with_second_occurrence = page_num
                        break

            # Check if we found the second occurrence
            if page_with_second_occurrence is None:
                logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                return None
            
            logging.info(f"The second 'Quadro de Definições' appears on page: {page_with_second_occurrence}")

            # Define the range of pages to extract (12 pages after the second occurrence)
            start_page = page_with_second_occurrence
            end_page = min(start_page + 12, len(pdf.pages))

            logging.info(f"Extracting tables from pages {start_page} to {end_page}")

            # Extract tables from the specified range of pages
            tables = []
            for page in pdf.pages[start_page-1:end_page]:
                page_tables = page.extract_tables(table_settings=settings)
                if page_tables:
                    tables.extend(page_tables)

            logging.info(f"Number of tables extracted: {len(tables)}")

            return tables

    except Exception as e:
        logging.error(f"An error occurred while processing the PDF: {str(e)}")
        return None


def process_and_combine_tables(tables):
    """
    Process and combine all extracted tables into a single DataFrame.
    
    Args:
    tables (list): List of tables extracted from the PDF.
    
    Returns:
    pandas.DataFrame: A DataFrame containing all processed and combined table data.
    """
    processed_tables = []

    # regroup tables
    regr_tables = []
    for table in tables:
        for row in table:
            regr_tables.append(row)
    table = regr_tables
        
    # regroup definition splitted by page break
    for i in range(len(table)-2, -1, -1):        # iterate from last to first to avoid index offset when deleting list'elements.
        if table[i+1][0]=='':
            table[i][1]+= f"\n{table[i+1][1]}"
            del table[i+1]                        # deleting list's element

    processed_tables = table
    
    # Create a DataFrame with all processed rows
    df = pd.DataFrame(processed_tables, columns=['Term', 'Definition'])
    
    # Set 'Term' as the index of the DataFrame
    df.set_index('Term', inplace=True)
    
    return df


if __name__ == "__main__":
    # Log the start of the table extraction process
    logging.info(f"Starting table extraction from file: {PDF_PATH}")
    
    # Extract tables from the PDF
    extracted_tables = extract_tables_from_pdf(PDF_PATH)
    
    if extracted_tables:
        try:
            # Process and combine the extracted tables
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)
            
            # Save the combined DataFrame to a CSV file
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as e:
            logging.error(f"Error processing and combining tables: {str(e)}")
    else:
        logging.warning("Could not extract tables from the PDF.")
    
    # Log the completion of the extraction and combination process
    logging.info("Table extraction and combination process completed.")