Search code examples
pythonexcelpython-3.xxlsx

Open multiple excel files, open every sheet on each file, and save the image


I have multiple excel files with images inside, the images are located on different excel sheets. My objective is to save the images to my computer. These images will be used for face recognition later on.

I have constructed some code to open the excel file and grab the images. However it only take from one sheet rather than all the sheet.

import face_recognition
import pandas as pd
import win32com.client as win32
from PIL import ImageGrab
import os

#Read working directory
print(os.getcwd()) #get current working directory
os.chdir("E:/DATA/Master data") #set working directory
print(os.getcwd())#check updated working directory

#Reading xlsx file in a folder
path1="E:/DATA/Master data"
files= os.listdir(path1)
print(files)
listlength = len(files)

#Extracting data from each xlsx file
for f in files:
    excel = win32.gencache.EnsureDispatch('Excel.Application')
    count=0
while (count<listlength):
    a = files.pop(count)
    path_name = path1 + "/" + a
    workbook = excel.Workbooks.Open(path_name)
    wb_folder = workbook.Path
    wb_name = workbook.Name
    wb_path = os.path.join(wb_folder, wb_name)
    for sheet in workbook.Worksheets:
        for i, shape in enumerate(sheet.Shapes):
            if shape.Name.startswith('Picture'):
                shape.Copy()
                image = ImageGrab.grabclipboard()
                image.save('{}.jpg'.format(i+1), 'jpeg')

I expect to get all images from every sheet on multiple excel files.


Solution

  • The variable i is being reset for each sheet so your file name is the same and thus the file is being overwritten. Add a second variable which gets incremented for each sheet so filename includes that as well.

    This is tested as working, I added excel.Visible so you can see the sheets popup :) Also logging so you can see what's happening. Instead of using a global count variable I just concatenated the workbook name to the sheet name then used the "n" variable from each sheets image.

    import win32com.client as win32
    from PIL import ImageGrab
    import os
    
    def ensureDirExists(filePath):
        if not os.path.exists(filePath):
            os.makedirs(filePath)
    
    def absoluteListDir(directory):
       for dirpath,_,filenames in os.walk(directory):
           for f in filenames:
               yield os.path.abspath(os.path.join(dirpath, f))
    
    dataDirectory = "data"
    outputDirectory = "images"
    
    ensureDirExists(dataDirectory)
    ensureDirExists(outputDirectory)
    
    excel = win32.gencache.EnsureDispatch('Excel.Application')
    excel.Visible = True
    
    files = absoluteListDir(dataDirectory)
    
    for file in files:
        print("=" * 20)
        print("Opening Workbook: ", file)
        workbook = excel.Workbooks.Open(file)
    
        for sheet in workbook.Sheets:
            print("Scraping Sheet: ", sheet.Name)
            for n, shape in enumerate(sheet.Shapes):
                if shape.Name.startswith("Picture"):
                    shape.Copy()
                    image = ImageGrab.grabclipboard()
                    outputFile = "{}/{}_{}_{}.jpg".format(outputDirectory, workbook.Name, sheet.Name, n)
                    print("Saving Image File: ", outputFile)
                    image.save(outputFile, "jpeg")
    
        print("Closing Workbook")
        workbook.Close(True)