Search code examples
pythonexcelpdfopenpyxlpypdf

Problem with set values on excel file with PyPDF2 and Openpyxl


I have the next code:

import PyPDF2
import openpyxl
from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.worksheet.cell_range import CellRange

#Definir Archivo Excel
file = 'C:\\Users\\Desktop\\PYTHON_PDF\\PRUEBA_PDF.xlsx'
wb = openpyxl.load_workbook(file)#define workook
ws = wb['Hoja1']#define sheet1
#define max values for columns
max_row_for_a = max((a.row for a in ws['A'] if a.value is not None))
max_row_for_b = max((b.row for b in ws['B'] if b.value is not None))

#LOOP
for row in ws.iter_rows(min_row=2, max_col=1, max_row=max_row_for_a):
    pdf=  row[0].value
    print (pdf)
    for row2 in ws.iter_rows(min_row=2, max_col=2, max_row=max_row_for_b, min_col=2):
        extracto = row2[0].value
        pdfselect=open(pdf,"rb")
        leer = PyPDF2.PdfFileReader(pdfselect)
        pagina = leer.getPage(0)  
        extracto =  pagina.extractText()
        print(extracto)
        wb.save("PRUEBA_PDF2.xlsx")
        wb.close

The idea is read the excel's column A with the pdf's names and write in the column B the pdf extracts, but when i execute the code don't show anything, even dont show error. I try with a minimal case like this and i dont have problems:

#pdfselect=open("ejemplo.pdf","rb")
#leer = PyPDF2.PdfFileReader(pdfselect)
#pagina = leer.getPage(0)   
#sheet =   wb.active 
#ws['B2'] = pagina.extractText()
#wb.save("PRUEBA_PDF2.xlsx")
#wb.close

what i'm doing wrong? Greetings!


Solution

  • I found the problem. The correct code it must like this:

    import PyPDF2
    import openpyxl
    from openpyxl import Workbook
    from openpyxl import load_workbook
    from openpyxl.worksheet.cell_range import CellRange
    
    #Definir Archivo Excel
    
    file = 'C:\\Users\\lorrego\\Desktop\\PYTHON_PDF\\PRUEBA_PDF.xlsx'
    wb = openpyxl.load_workbook(file)#define workook
    ws = wb['Hoja1']#define la hoja 1
    max_row_for_a = max((a.row for a in ws['A'] if a.value is not None))
    
    
    
    for row in ws.iter_rows(min_row=2, max_col=2, max_row=max_row_for_a):
    
        pdf=  row[0].value  
        pdfselected=open(pdf,"rb")
        leer = PyPDF2.PdfFileReader(pdfselected)
        pagina = leer.getPage(0)  
        ws.cell(row=row[0].row, column=2).value = pagina.extractText()
        wb.save("C:\\Users\\Desktop\\PYTHON_PDF\\PRUEBA_PDF.xlsx")
        wb.close