How do I search for a string/pattern in all the sheets of a workbook and return the numbers of all the matching sheets?
I can traverse all the sheets in an Excel workbook one by one and search for the string in each sheet (a linear search), but that is inefficient and takes a long time, and I have to process hundreds of workbooks or even more.
Update 1: Sample code
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
def searchSheets(fnames):
    # Placeholder: the actual search logic is not shown in this post.
    # Intended outline:
    #   1. Loop over each sheet.
    #   2. Search each sheet for the string 'Balance'.
    #   3. Return the matching sheet number.
    # NOTE(review): Pool.map() calls the worker once per element of the
    # iterable it is given, so `fnames` is presumably a single path string
    # here, not a list. Iterating over that string would yield characters,
    # which matches the "No such file or directory: 'C'" error in the
    # traceback under Update 2 -- confirm against the real body.
if __name__ == '__main__':
# NOTE(review): indentation was lost when this sample was pasted; the
# statements below belong inside the __main__ guard.
# NOTE(review): presumably a workaround for consoles where __main__ has no
# __spec__ attribute -- confirm it is still needed.
__spec__ = None
folder = "C://AB//"
if os.path.exists(folder):
# Collect every .xlsx workbook directly inside `folder`.
files = glob.glob(folder + "*.xlsx")
# Worker pool. `Pool` is multiprocessing.Pool (processes); the thread-based
# pool is imported under the separate alias ThreadPool.
pool = Pool()
# Suggested by @Dan D. The call text was truncated in this paste (only
# ",files)" survives); presumably pool.map(searchSheets, files).
# It did not work -- see the traceback under Update 2.
Update 2: Error
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 44, in mapstar
return list(map(*args))
File "C:\", line 36, in searchSheet
wb = xl_wb(f)
File "C:\ProgramData\Anaconda3\lib\site-packages\xlrd\__init__.py", line 116, in open_workbook
with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'C'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\", line 167, in <module>,files)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 266, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 644, in get
raise self._value
FileNotFoundError: [Errno 2] No such file or directory: 'C'
import glob
import multiprocessing
import os
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
def searchSheets(fnames):
    # Placeholder: the actual search logic is not shown in this post.
    # Intended outline:
    #   1. Loop over each sheet.
    #   2. Search each sheet for the string 'Balance'.
    #   3. Return the matching sheet number.
    # NOTE(review): Pool.map() calls the worker once per element of the
    # iterable it is given, so `fnames` is presumably a single path string
    # here, not a list. Iterating over that string would yield characters,
    # which matches the "No such file or directory: 'C'" error in the
    # traceback above -- confirm against the real body.
if __name__ == '__main__':
    # Entry point: find every .xlsx workbook in `folder` and search each one
    # in a separate worker process.

    # freeze_support() has to run straight after the __main__ guard, before
    # any pool is created. It only matters for frozen Windows executables and
    # has no effect otherwise.
    multiprocessing.freeze_support()

    # NOTE(review): presumably a workaround for consoles where __main__ has no
    # __spec__ attribute -- confirm it is still needed.
    __spec__ = None

    folder = "C://AB//"
    if os.path.exists(folder):
        files = glob.glob(folder + "*.xlsx")

        # `Pool` is multiprocessing.Pool, i.e. worker processes rather than
        # threads. The with-block terminates the workers on exit; map()
        # blocks until every result is in, so no explicit join() is required.
        with Pool() as pool:
            # map() calls searchSheets once per workbook path (suggested by
            # @Dan D). `files` is already a list, so it is passed directly
            # instead of being copied with a list comprehension.
            # `results` holds one entry per workbook, in the order of `files`.
            results = pool.map(searchSheets, files)