Search code examples
pythonpdfpython-3.xweb-scrapingpdf-parsing

How to open and read a PDF (originally .html) file using Python 3


I need to open this file in python3:

http://www.arch.gob.ec/index.php/descargas/doc_download/478-historial-de-produccion-nacional-de-crudo-2011.html

I then have to read it and extract the data tables. I have searched for several hours, but nothing seems to work. I am new to scraping/parsing, and this is the first time I have looked into handling PDF files.

Thanks for any kind of help!


Solution

  • Found a way that works for me.

    # Remote document to fetch; per the question, the link ends in .html but
    # the content is expected to be a PDF.
    url = 'http://www.arch.gob.ec/index.php/descargas/doc_download/478-historial-de-produccion-nacional-de-crudo-2011.html'

    # With no target filename, urlretrieve saves the download to a temporary
    # local file and returns (local_filename, response_headers).
    pdfFile, headers = urllib.request.urlretrieve(url)

    # Resolve the absolute location once, show it, then hand it to the
    # converter to obtain the document's text.
    local_pdf_path = os.path.abspath(pdfFile)
    print(local_pdf_path)
    s = pdf_convert(str(local_pdf_path))
    

    where pdf_convert is:

    def pdf_convert(path):
        """Extract the text of the PDF file at *path* and return it as a string.

        The PDF is run through pdfminer's ``TextConverter`` and the converted
        text is collected in an in-memory buffer, so no intermediate file is
        written and the result does not depend on a user-specific directory.

        Args:
            path: Filesystem path of the PDF file to read.

        Returns:
            The text that the converter wrote for the document, as ``str``.

        Raises:
            OSError: If *path* cannot be opened for reading.
            Any exception raised by pdfminer's ``process_pdf`` is propagated
            unchanged; the input file and the converter device are still
            closed in that case.
        """
        # Function-scope import: this snippet has no visible module-level
        # import block to extend.
        import io

        # Keep pdfminer's class-level debug flags switched off.
        debug = 0
        CMapDB.debug = debug
        PDFResourceManager.debug = debug
        PDFPageInterpreter.debug = debug
        PDFDevice.debug = debug

        # Input options: empty password, no page filter, no page cap.
        # NOTE(review): an empty ``pagenos`` and ``maxpages == 0`` presumably
        # mean "all pages" (pdf2txt.py-style defaults) -- confirm against the
        # installed pdfminer version.
        password = ''
        pagenos = set()
        maxpages = 0

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        # Collect the converter output in memory. Previously the text went to
        # one hard-coded Desktop file and was read back from a *different*
        # hard-coded path, so the returned data could not be the fresh output.
        outfp = io.StringIO()
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, check_extractable=True)
        finally:
            # Always release the device, even when conversion fails.
            device.close()

        # Read the buffer only after the device is closed, mirroring the
        # original close-then-read order.
        return outfp.getvalue()