Search code examples
pythonpandasbeautifulsoupnested-loops

output of beautiful soup nested loop vertically into dataframe


How do I place the output of this web scrape (data) into a pandas DataFrame? The code below only displays one 'row' of data, and it is in a vertical arrangement. How do I make DataFrame look exactly like print(data) - ie. horizontal? Also, I'm such a newb - this is so 'not pythonic'. Please suggest how to make this into functions(?) ? I tried all sort of append and concat functions, but the nested looping is throwing everything into error. Please help.
Thank you in advance.

from bs4 import BeautifulSoup as bs
import urllib.request
import re
import pandas as pd

Url1 = 'http://www5.statcan.gc.ca/cimt-cicm/topNCountryCommodities-marchandises?lang=eng&chapterId=27&sectionId=0&refMonth='
Url2 = '&refYr='
Url3 = '&freq=6&countryId=999&usaState=0&provId=1&arrayId=9900000&commodityId=271111&commodityName=Natural+gas%2C+liquefied&topNDefault=10&tradeType=3'

regexQ = re.compile('Date1 Qty')
regexC = re.compile('Footnote')
regexV = re.compile('Date1 Val')
import csv
import collections

for year in range(2016, 2018):
    for month in range(0, 13):
        sauce = urllib.request.urlopen(Url1+str(month)+Url2+str(year)+Url3).read().decode('utf-8')
        soup = bs(sauce, 'lxml')
        select = soup.find_all('option', selected=True)
        for i in select:
            Month=select[4]
            Year=select[3]
            month=Month.text
            year=Year.text
            #print(year)
            #print(month, year)
        value = [x.text.strip() for x in soup.find_all("td", {"headers": regexV})]
        country = [x.text.strip() for x in soup.find_all("a", {"href": regexC})]
        quantity = [x.text.strip() for x in soup.find_all("td", {"headers": regexQ})]
        total_list = [list(x) for x in zip(country,quantity,value)]
        for item in total_list:
            h=[year , month]
            data = h + item
            print(data)
            df = pd.DataFrame(data=data)
            df.head

Solution

  • You can use:

    from bs4 import BeautifulSoup as bs
    import urllib.request
    import re
    import pandas as pd
    
    Url1 = 'http://www5.statcan.gc.ca/cimt-cicm/topNCountryCommodities-marchandises?lang=eng&chapterId=27&sectionId=0&refMonth='
    Url2 = '&refYr='
    Url3 = '&freq=6&countryId=999&usaState=0&provId=1&arrayId=9900000&commodityId=271111&commodityName=Natural+gas%2C+liquefied&topNDefault=10&tradeType=3'
    
    regexQ = re.compile('Date1 Qty')
    regexC = re.compile('Footnote')
    regexV = re.compile('Date1 Val')
    import csv
    import collections
    
    #create list for all rows
    data = []
    for year in range(2016, 2018):
        for month in range(0, 13):
            sauce = urllib.request.urlopen(Url1+str(month)+Url2+str(year)+Url3).read().decode('utf-8')
            soup = bs(sauce, 'lxml')
            select = soup.find_all('option', selected=True)
            print (select)
            for i in select:
                month=select[4].text
                #convert year to integer 
                year=int(select[3].text)
                #print(year)
                #print(month, year)
    
            #replace , from numeric values, convert to integer
            value = [int(x.text.strip().replace(',','')) for x in soup.find_all("td", {"headers": regexV})]
            country = [x.text.strip() for x in soup.find_all("a", {"href": regexC})]
            quantity = [int(x.text.strip().replace(',',''))  for x in soup.find_all("td", {"headers": regexQ})]
            for item in zip(country,quantity,value):
                #append each row to list data 
                data.append([year , month] + list(item))
    
    #create Dataframe
    cols = ['year','month','country','quantity','value']
    df = pd.DataFrame(data=data, columns=cols)
    print (df.head())
       year     month              country   quantity     value
    0  2016   January                World  279605300  69598782
    1  2016   January  Trinidad and Tobago  140185912  36503857
    2  2016   January               Norway  137835921  32414865
    3  2016   January        United States    1583467    680060
    4  2016  February                World    1255156    532059