Tags: python, excel, beautifulsoup, xlwt

Writing results to .xls (submitting 2 queries to a webpage and storing the different results in one .xls file)


I am using Python 2.7.6 to submit queries to an .aspx webpage, pick up the results with BeautifulSoup, and store them in an Excel spreadsheet.

import mechanize
import re
import xlwt
from bs4 import BeautifulSoup
import urllib2

book = xlwt.Workbook(encoding='utf-8', style_compression = 0)
sheet = book.add_sheet('Legi', cell_overwrite_ok = True)

for items in ['university student', 'high school student']:

    url = r'http://legistar.council.nyc.gov/Legislation.aspx'
    request = mechanize.Request(url)
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    response.close()

    form['ctl00$ContentPlaceHolder1$txtSearch'] = items
    
    submit_page = mechanize.urlopen(form.click())
    soup = BeautifulSoup(submit_page.read())
    aa = soup.find_all(href=re.compile('LegislationDetail'))
    for bb in aa:
        cc = bb.text

        #print cc
        results = []
        results.append(cc)

    for row, legi_no in enumerate(results):
        sheet.write(row, 0, legi_no)

book.save("C:\\legi results.xls")

It finds and picks up the results (I can see them when I print the variable 'cc'), but writing them to the Excel spreadsheet does not work: only the first cell gets written.


Solution

  • You create the results variable inside the for bb in aa loop.

    This means results gets re-initialized to [] for every value in aa, so at the end results contains only one element (the last one), which is of course not what you intended.

    Move the initialization of results outside that loop and it should work fine, as shown below.

    import mechanize
    import re
    import xlwt
    from bs4 import BeautifulSoup
    import urllib2
    
    book = xlwt.Workbook(encoding='utf-8', style_compression = 0)
    sheet = book.add_sheet('Legi', cell_overwrite_ok = True)
    
    for items in ['university student', 'high school student']:
    
        url = r'http://legistar.council.nyc.gov/Legislation.aspx'
        request = mechanize.Request(url)
        response = mechanize.urlopen(request)
        forms = mechanize.ParseResponse(response, backwards_compat=False)
        form = forms[0]
        response.close()
    
        form['ctl00$ContentPlaceHolder1$txtSearch'] = items
    
        submit_page = mechanize.urlopen(form.click())
        soup = BeautifulSoup(submit_page.read())
        aa = soup.find_all(href=re.compile('LegislationDetail'))
    
        results = [] # Initialize results here !!!
        for bb in aa:
            cc = bb.text
    
            #print cc
            results.append(cc)
    
        for row, legi_no in enumerate(results):
            sheet.write(row, 0, legi_no)
    
    book.save("C:\\legi results.xls")