Search code examples
python, python-3.x, excel, python-docx, doc

How to get the number of characters in each .docx and .doc file in a directory, divide each file's count by 65, and save the results to an xlsx file


I have a folder containing many Word documents whose names end with .doc and .docx.

This code works only for .docx files; I want it to work for .doc files as well.

import docx
import os

# Map every .docx file name in the current directory to its paragraph
# character count divided by 65.
charCounts = {}
directory = os.fsencode('.')
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".docx"):
        # Anything that is not a .docx file is ignored.
        continue
    doc = docx.Document(filename)
    chars = 0
    for p in doc.paragraphs:
        chars += len(p.text)
    charCounts[filename] = chars / 65

# uses openpyxl package
from openpyxl import Workbook
wb = Workbook()
ws = wb.active

# Header row: file names go in column 2 (B), scaled counts in column 4 (D).
ws.cell(row=1, column=2, value='File Name')
ws.cell(row=1, column=4, value='chars/65')
# Data rows start at row 3, leaving row 2 empty below the headers.
for i, x in enumerate(charCounts):
    ws.cell(row=i + 3, column=2, value=x)
    ws.cell(row=i + 3, column=4, value=charCounts[x])
# The grand total goes on the row right after the last data row. It is written
# once, after the loop: it does not depend on the loop variable, and writing it
# here also produces a total (0) when no files were found.
ws.cell(row=len(charCounts) + 3, column=4, value=sum(charCounts.values()))
path = './charCounts.xlsx'
wb.save(path)

Images:-

I have files like these. enter image description here

I want the output to look like this:

enter image description here

Notice two things here.

First, the file names in the Excel sheet are arranged in numeric order.

Second, the file extensions have been removed from the names in the Excel sheet. I want the output to look like that.


Solution

  • Here is an update to the code in your question which will do what I believe you have asked:

    # uses python-docx package
    import docx
    import os
    
    # uses pywin32 package
    import win32com.client as win32
    from win32com.client import constants
    app = win32.gencache.EnsureDispatch('Word.Application')

    def _convert_to_docx(word, src_path, dest_path):
        """Have Word save the document at src_path as a .docx file at dest_path."""
        word_doc = word.Documents.Open(src_path)
        try:
            # Save through the document object that was just opened instead of
            # relying on it being Word's active document.
            word_doc.SaveAs(dest_path, FileFormat=constants.wdFormatXMLDocument)
        finally:
            # Close without saving changes, even when the conversion failed,
            # so the source file is not left open in Word.
            word_doc.Close(False)

    def _count_chars(docx_path):
        """Return the total text length of body, header and footer paragraphs."""
        document = docx.Document(docx_path)
        body = sum(len(p.text) for p in document.paragraphs)
        margins = sum(
            len(p.text)
            for section in document.sections
            for hf in (section.header, section.footer)
            for p in hf.paragraphs
        )
        return body + margins

    # Maps each original file name (.doc or .docx) to its character count / 65.
    charCounts = {}
    fileDir = '.' # Put the path of the directory to be searched here
    os.chdir(fileDir)
    cwd = os.getcwd()
    directory = os.fsencode(cwd)
    try:
        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            # Skip converted copies left over from an interrupted earlier run and
            # the "~$..." owner files Word keeps next to an open document: both
            # carry a Word extension but must not be counted as documents.
            if filename.startswith(('TEMP_CONVERTED_WORD_FILE_', '~$')):
                continue
            filenameOrig = None
            tempDocxPath = None
            try:
                if filename.endswith(".doc"):
                    # The .doc file is first converted by Word into a temporary
                    # .docx copy, which is then read like any other .docx file.
                    filenameOrig = filename
                    src_path_norm = os.path.normpath(os.path.join(cwd, filename))
                    docxPath = 'TEMP_CONVERTED_WORD_FILE_' + filename[:-4] + ".docx"
                    tempDocxPath = os.path.normpath(os.path.join(cwd, docxPath))
                    _convert_to_docx(app, src_path_norm, tempDocxPath)
                    filename = docxPath
                if filename.endswith(".docx"):
                    src_path_norm = os.path.normpath(os.path.join(cwd, filename))
                    chars = _count_chars(src_path_norm)
                    # Results are keyed by the name the file has on disk, never
                    # by the name of the temporary converted copy.
                    charCounts[filenameOrig if filenameOrig else filename] = chars / 65
            finally:
                # Do not leave the temporary converted copy in the directory.
                if tempDocxPath is not None and os.path.exists(tempDocxPath):
                    os.remove(tempDocxPath)
    finally:
        # Shut Word down so no background WINWORD process is left running.
        # The dispatch call may have attached to a Word instance the user
        # already had open, so only quit when no documents remain open in it.
        if app.Documents.Count == 0:
            app.Quit()
    import re

    def _natural_key(name):
        """Return a sort key that orders digit runs by numeric value.

        With this key '2.docx' sorts before '10.docx', which plain string
        comparison gets the other way round.
        """
        # Splitting on a captured digit run always yields text at even indices
        # and digit runs at odd indices, so two keys never compare an int with
        # a str at the same position.
        parts = re.split(r'(\d+)', name)
        return [int(part) if i % 2 else part.lower() for i, part in enumerate(parts)]

    # Rebuild the dict so that its insertion order is the natural (number-wise)
    # order of the file names; the raw name breaks ties between names that
    # differ only in letter case.
    charCounts = {
        k: charCounts[k]
        for k in sorted(charCounts, key=lambda k: (_natural_key(k), k))
    }
    
    # uses openpyxl package
    from openpyxl import Workbook
    wb = Workbook()
    ws = wb.active

    # Header row: file names in column 2 (B), scaled counts in column 4 (D).
    ws.cell(row=1, column=2, value='File Name')
    ws.cell(row=1, column=4, value='chars/65')
    # One row per file, starting at row 3 (row 2 stays empty).
    for row, (name, scaled) in enumerate(charCounts.items(), start=3):
        # Drop the extension: 4 characters for '.doc', otherwise 5 ('.docx').
        if name.endswith('.doc'):
            stem = name[:-4]
        else:
            stem = name[:-5]
        ws.cell(row=row, column=2, value=stem)
        ws.cell(row=row, column=4, value=scaled)
    # The labelled grand total sits on the row right after the last file.
    total_row = len(charCounts) + 3
    ws.cell(row=total_row, column=3, value='Total')
    ws.cell(row=total_row, column=4, value=sum(charCounts.values()))
    path = './charCounts.xlsx'
    wb.save(path)
    

    Explanation:

    • For every file whose name ends in .docx, except those starting with TEMP_CONVERTED_WORD_FILE_, store its character count (divided by 65) in a dictionary charCounts, using the filename as the key
    • For every file ending in .doc, use the pywin32 package of Win32 extensions to convert it to a .docx file with TEMP_CONVERTED_WORD_FILE_ prepended to the filename, then store character count (divided by 65) by its original filename as key in the same dictionary as above
    • Replace the charCounts dictionary with a new one whose entries are inserted in sorted order of the filename key, so that iterating over it yields the files in sorted order
    • Iterate through charCounts storing the contents in an Excel file, taking care to truncate the .doc or .docx suffix from the filename key.