I wrote code that successfully parses thousands of different kinds of PDFs.
However, with this PDF I get an error. Here is a very simple test code sample that reproduces the error. My original code is too long to share here.
# Minimal reproduction: feed every page of the PDF through pdfminer's
# layout analysis.
# NOTE(review): PDFResourceManager, PDFPageAggregator, LAParams,
# PDFPageInterpreter and PDFPage are presumably imported from pdfminer;
# the import lines are not part of this sample.
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Open in binary mode; the with-block closes the handle even when a page
# fails to process.
with open('C:/Users/username/file.pdf', 'rb') as file:
    pages = PDFPage.get_pages(file)
    for page in pages:
        # For the problematic PDF this call raises the TypeError.
        interpreter.process_page(page)
        layout = device.get_result()
The PDF file that triggers the error: https://filetransfer.io/data-package/dWnZbcWl#link
Here is the full error message:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15652/28568702.py in <module>
7 for page in pages:
----> 8 interpreter.process_page(page)
9 layout = device.get_result()
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in process_page(self, page)
839 ctm = (1, 0, 0, 1, -x0, -y0)
840 self.device.begin_page(page, ctm)
--> 841 self.render_contents(page.resources, page.contents, ctm=ctm)
842 self.device.end_page(page)
843 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
852 self.init_resources(resources)
853 self.init_state(ctm)
--> 854 self.execute(list_value(streams))
855 return
856
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in execute(self, streams)
857 def execute(self, streams):
858 try:
--> 859 parser = PDFContentParser(streams)
860 except PSEOF:
861 # empty page
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in __init__(self, streams)
219 self.streams = streams
220 self.istream = 0
--> 221 PSStackParser.__init__(self, None)
222 return
223
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\psparser.py in __init__(self, fp)
513
514 def __init__(self, fp):
--> 515 PSBaseParser.__init__(self, fp)
516 self.reset()
517 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\psparser.py in __init__(self, fp)
167 def __init__(self, fp):
168 self.fp = fp
--> 169 self.seek(0)
170 return
171
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in seek(self, pos)
233
234 def seek(self, pos):
--> 235 self.fillfp()
236 PSStackParser.seek(self, pos)
237 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in fillfp(self)
229 else:
230 raise PSEOF('Unexpected EOF, file truncated?')
--> 231 self.fp = BytesIO(strm.get_data())
232 return
233
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdftypes.py in get_data(self)
290 def get_data(self):
291 if self.data is None:
--> 292 self.decode()
293 return self.data
294
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdftypes.py in decode(self)
271 raise PDFNotImplementedError('Unsupported filter: %r' % f)
272 # apply predictors
--> 273 if 'Predictor' in params:
274 pred = int_value(params['Predictor'])
275 if pred == 1:
TypeError: argument of type 'PDFObjRef' is not iterable
Can somebody try to load this PDF into memory and, if successful, tell me how they did it?
Package versions used
conda 4.11.0 py39hcbf5309_0 conda-forge
ipython 7.28.0 py39h832f523_0 conda-forge
notebook 6.4.4 pyha770c72_0 conda-forge
pdfminer 20191125 pyhd8ed1ab_1 conda-forge
pillow 8.3.2 py39h916092e_0 conda-forge
pyparsing 2.4.7 pyh9f0ad1d_0 conda-forge
pytesseract 0.3.8 pyhd8ed1ab_0 conda-forge
python 3.9.7 h7840368_3_cpython conda-forge
wcwidth 0.2.5 pyh9f0ad1d_2 conda-forge
wheel 0.37.0 pyhd8ed1ab_1 conda-forge
I checked for problems with the metadata, but it is fine. I checked for encryption, but that is not the problem either. The PDF having multiple pages is not the problem either.
When I change
if 'Predictor' in params:
to:
if isinstance(params, dict) and 'Predictor' in params:
in the file pdftypes.py (line 273), I don't get the error any more.
See: https://github.com/pdfminer/pdfminer.six/pull/471
The fix from PR 471 is not included in version 20191125.