I have a folder of many word document files ending with .doc and .docx.
This code works only for .docx files; I want it to handle .doc files as well:
import docx
import os
# Map each .docx file name in the current directory to the total length of
# its paragraph text divided by 65. Only names ending in ".docx" are handled.
charCounts = {}
directory = os.fsencode('.')
for entry in os.listdir(directory):
    filename = os.fsdecode(entry)
    if not filename.endswith(".docx"):
        continue
    doc = docx.Document(filename)
    chars = 0
    for paragraph in doc.paragraphs:
        chars += len(paragraph.text)
    charCounts[filename] = chars / 65

# uses openpyxl package
from openpyxl import Workbook

# Write the results to a spreadsheet: headers in row 1, one row per file
# starting at row 3, and the grand total in the row after the last file.
wb = Workbook()
ws = wb.active
ws.cell(row=1, column=2, value='File Name')
ws.cell(row=1, column=4, value='chars/65')
for row, (name, count) in enumerate(charCounts.items(), start=3):
    ws.cell(row=row, column=2, value=name)
    ws.cell(row=row, column=4, value=count)
ws.cell(row=len(charCounts) + 3, column=4, value=sum(charCounts.values()))
path = './charCounts.xlsx'
wb.save(path)
Images:
I want the output to look like this:
Notice two things here.
First, the file names in the Excel sheet are sorted numerically.
Second, the file extensions have been removed in the Excel sheet. I want the output to look like that.
Here is an update to the code in your question which will do what I believe you have asked:
# uses python-docx package
import docx
import os
import re
# uses pywin32 package
import win32com.client as win32
from win32com.client import constants
# uses openpyxl package
from openpyxl import Workbook

# Prefix given to the temporary .docx copies produced from .doc files; files
# carrying it are skipped when scanning so they are never counted twice.
TEMP_PREFIX = 'TEMP_CONVERTED_WORD_FILE_'


def _natural_key(name):
    """Return a sort key that orders embedded numbers by value.

    With this key '2.docx' sorts before '10.docx', whereas plain string
    ordering would put '10.docx' first.
    """
    # re.split with a capturing group alternates text, digits, text, ...,
    # so every odd index holds a digit run and the keys stay comparable.
    parts = re.split(r'(\d+)', name)
    return [int(part) if index % 2 else part.casefold()
            for index, part in enumerate(parts)]


def _count_chars(docx_path):
    """Return the number of characters in the body, header and footer paragraphs."""
    document = docx.Document(docx_path)
    body = sum(len(p.text) for p in document.paragraphs)
    header_footer = sum(
        len(p.text)
        for section in document.sections
        for part in (section.header, section.footer)
        for p in part.paragraphs
    )
    return body + header_footer


charCounts = {}
fileDir = '.'  # Put the path of the directory to be searched here
os.chdir(fileDir)
cwd = os.getcwd()

app = win32.gencache.EnsureDispatch('Word.Application')
try:
    for filename in os.listdir(cwd):
        if filename.startswith(TEMP_PREFIX):
            continue
        stem, ext = os.path.splitext(filename)
        ext = ext.lower()  # accept .DOC / .DOCX as well
        if ext == '.doc':
            # python-docx cannot read the legacy format, so let Word save a
            # temporary .docx copy and count the characters in that copy.
            src_path = os.path.normpath(os.path.join(cwd, filename))
            docx_path = os.path.normpath(
                os.path.join(cwd, TEMP_PREFIX + stem + '.docx'))
            doc = app.Documents.Open(src_path)
            try:
                doc.Activate()
                app.ActiveDocument.SaveAs(
                    docx_path, FileFormat=constants.wdFormatXMLDocument)
            finally:
                # Close without saving changes, even if the conversion failed.
                doc.Close(False)
        elif ext == '.docx':
            docx_path = os.path.normpath(os.path.join(cwd, filename))
        else:
            continue
        # Results are always keyed by the original file name.
        charCounts[filename] = _count_chars(docx_path) / 65
finally:
    # Shut down the Word automation instance so that no Word process is
    # left running after the script ends.
    app.Quit()

# Rebuild the dictionary so its insertion order is the numeric-aware order
# of the file names.
charCounts = {k: charCounts[k] for k in sorted(charCounts, key=_natural_key)}

# Write the results: headers in row 1, one row per file starting at row 3
# (file name without extension), and a total row after the last file.
wb = Workbook()
ws = wb.active
ws.cell(row=1, column=2, value='File Name')
ws.cell(row=1, column=4, value='chars/65')
for row, (name, count) in enumerate(charCounts.items(), start=3):
    ws.cell(row=row, column=2, value=os.path.splitext(name)[0])
    ws.cell(row=row, column=4, value=count)
total_row = len(charCounts) + 3
ws.cell(row=total_row, column=3, value='Total')
ws.cell(row=total_row, column=4, value=sum(charCounts.values()))
path = './charCounts.xlsx'
wb.save(path)
Explanation:
- For each file ending in .docx (except those starting with TEMP_CONVERTED_WORD_FILE_), store the character count (divided by 65) in a dictionary charCounts, using the filename as the key.
- For each file ending in .doc, use the pywin32 package of Win32 extensions to convert it to a .docx file with TEMP_CONVERTED_WORD_FILE_ prepended to the filename, then store the character count (divided by 65) in the same charCounts dictionary, using the original filename as the key.
- Replace the charCounts dictionary with one whose insertion order is sorted by the filename key.
- Iterate over charCounts, storing the contents in an Excel file, taking care to truncate the .doc or .docx suffix from the filename key.