Search code examples
python python-3.x pandas pip python-docx

Extract a Word table from multiple docx files using python docx


I have quite a few Word files that share the same table structure. I need to extract those tables and save them to CSV/Excel, with a separate sheet (in .xls) for each Word .docx file.

The code below only extracts the first table and doesn't loop through the whole .docx. Is there a way to loop through the entire document and through all the files in the folder?

import os
from docx import Document
import pandas as pd
# Folder that is scanned for Word documents.
folder = 'C:/Users/trans/downloads/test'
# Keep only the directory entries whose name ends with ".docx" ...
file_names = [f for f in os.listdir(folder) if f.endswith(".docx") ]
# ... and turn each bare file name into a path inside `folder`.
file_names = [os.path.join(folder, file) for file in file_names]
print(file_names)
# Accumulates one DataFrame per append, across all files.
tables = []
for file in file_names:
    document = Document(file)
    for table in document.tables:
     # Pre-fill a rows x columns grid of empty strings, then copy in the
     # text of every non-empty cell.
     df = [['' for i in range(len(table.columns))] for j in range(len(table.rows))]
     for i, row in enumerate(table.rows):
         for j, cell in enumerate(row.cells):
            if cell.text:
                df[i][j] = cell.text
    # NOTE: this append is indented at the level of the file loop, not the
    # table loop, so it runs once per document, after the table loop has
    # finished. Only the `df` built for the last table of the document is
    # stored; `df` is unassigned here if the first document has no tables.
    tables.append(pd.DataFrame(df))
    print(df)
    # NOTE: this loop is also inside the file loop, so every DataFrame
    # collected so far is written again after each document.
    # The two adjacent string literals are concatenated by Python into
    # 'C:/Users/trans/downloads/test/table_'.
    for nr, i in enumerate(tables):
        i.to_csv('C:/Users/trans/downloads/test/'"table_" + str(nr) + ".csv")

Solution

  • Python Code:

    import glob
    from docx import Document
    import pandas as pd
    # Folder that holds the input .docx files; the CSV files are written
    # into the same folder.
    folder = 'C:/Users/trans/downloads/test'
    # glob returns matches in arbitrary order; sort so that the table
    # numbering is the same on every run.
    file_names = sorted(glob.glob(folder + '/*.docx'))

    # Collect every table of every document as its own DataFrame.
    tables = []
    for file in file_names:
        document = Document(file)
        for table in document.tables:
            # One list of cell texts per table row; an empty cell yields ''.
            rows = [[cell.text for cell in row.cells] for row in table.rows]
            # Append inside the table loop so that no table is skipped.
            tables.append(pd.DataFrame(rows))

    # Write one CSV per table, numbered in the order the tables were found.
    for index, table in enumerate(tables):
        table.to_csv(f'{folder}/table_{index}.csv')
    

  • Java Code:

    This code uses Apache POI to handle the .docx files and JOpenDocument to handle the CSV conversion. Make sure to include the necessary dependencies in your project.

    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.apache.poi.xwpf.usermodel.XWPFTable;
    import org.apache.poi.xwpf.usermodel.XWPFTableCell;
    import org.apache.poi.xwpf.usermodel.XWPFTableRow;
    import org.jopendocument.dom.spreadsheet.Sheet;
    import org.jopendocument.dom.spreadsheet.SpreadSheet;
    
    /**
     * Extracts every table from every .docx file in a folder and writes each
     * table to its own CSV file (table_0.csv, table_1.csv, ...) in that folder.
     *
     * <p>Apache POI reads the documents. The CSV files are written with the
     * Java standard library, so no spreadsheet library is needed for output.
     */
    public class DocxToCsv {
        /**
         * Scans the hard-coded folder for .docx files and exports their tables.
         *
         * @param args unused
         * @throws IOException if the folder cannot be listed, a document cannot
         *                     be read, or a CSV file cannot be written
         */
        public static void main(String[] args) throws IOException {
            String folder = "C:/Users/trans/downloads/test";
            File[] fileNames = new File(folder).listFiles((dir, name) -> name.endsWith(".docx"));
            // listFiles returns null when the path is not a readable directory.
            if (fileNames == null) {
                throw new IOException("Cannot list folder: " + folder);
            }
            // listFiles gives no ordering guarantee; sort for stable numbering.
            Arrays.sort(fileNames);

            int tableIndex = 0;
            for (File file : fileNames) {
                // XWPFDocument has no File constructor; open a stream instead.
                // try-with-resources closes both the stream and the document.
                try (InputStream in = new FileInputStream(file);
                     XWPFDocument document = new XWPFDocument(in)) {
                    for (XWPFTable table : document.getTables()) {
                        Path target = Paths.get(folder, "table_" + tableIndex + ".csv");
                        writeCsv(table, target);
                        tableIndex++;
                    }
                }
            }
        }

        /** Writes one table to {@code target} as UTF-8 CSV, one line per table row. */
        private static void writeCsv(XWPFTable table, Path target) throws IOException {
            try (BufferedWriter out = Files.newBufferedWriter(target, StandardCharsets.UTF_8)) {
                for (XWPFTableRow row : table.getRows()) {
                    List<String> fields = new ArrayList<>();
                    for (XWPFTableCell cell : row.getTableCells()) {
                        fields.add(escapeCsv(cell.getText()));
                    }
                    out.write(String.join(",", fields));
                    out.newLine();
                }
            }
        }

        /** Quotes a field when it contains a comma, a quote or a line break. */
        private static String escapeCsv(String value) {
            if (value == null) {
                return "";
            }
            boolean needsQuoting = value.contains(",") || value.contains("\"")
                    || value.contains("\n") || value.contains("\r");
            if (!needsQuoting) {
                return value;
            }
            // Embedded quotes are doubled inside a quoted field.
            return "\"" + value.replace("\"", "\"\"") + "\"";
        }
    }