How do I place the output of this web scrape (data) into a pandas DataFrame?
The code below only displays one 'row' of data, and it is in a vertical arrangement. How do I make DataFrame look exactly like print(data) - ie. horizontal?
Also, I'm such a newb - this is so 'not pythonic'. Please suggest how to make this into functions(?) ?
I tried all sort of append and concat functions, but the nested looping is throwing everything into error. Please help.
Thank you in advance.
from bs4 import BeautifulSoup as bs
import urllib.request
import re
import pandas as pd
Url1 = 'http://www5.statcan.gc.ca/cimt-cicm/topNCountryCommodities-marchandises?lang=eng&chapterId=27§ionId=0&refMonth='
Url2 = '&refYr='
Url3 = '&freq=6&countryId=999&usaState=0&provId=1&arrayId=9900000&commodityId=271111&commodityName=Natural+gas%2C+liquefied&topNDefault=10&tradeType=3'
regexQ = re.compile('Date1 Qty')
regexC = re.compile('Footnote')
regexV = re.compile('Date1 Val')
import csv
import collections
for year in range(2016, 2018):
for month in range(0, 13):
sauce = urllib.request.urlopen(Url1+str(month)+Url2+str(year)+Url3).read().decode('utf-8')
soup = bs(sauce, 'lxml')
select = soup.find_all('option', selected=True)
for i in select:
Month=select[4]
Year=select[3]
month=Month.text
year=Year.text
#print(year)
#print(month, year)
value = [x.text.strip() for x in soup.find_all("td", {"headers": regexV})]
country = [x.text.strip() for x in soup.find_all("a", {"href": regexC})]
quantity = [x.text.strip() for x in soup.find_all("td", {"headers": regexQ})]
total_list = [list(x) for x in zip(country,quantity,value)]
for item in total_list:
h=[year , month]
data = h + item
print(data)
df = pd.DataFrame(data=data)
df.head
You can use:
from bs4 import BeautifulSoup as bs
import urllib.request
import re
import pandas as pd
Url1 = 'http://www5.statcan.gc.ca/cimt-cicm/topNCountryCommodities-marchandises?lang=eng&chapterId=27§ionId=0&refMonth='
Url2 = '&refYr='
Url3 = '&freq=6&countryId=999&usaState=0&provId=1&arrayId=9900000&commodityId=271111&commodityName=Natural+gas%2C+liquefied&topNDefault=10&tradeType=3'
regexQ = re.compile('Date1 Qty')
regexC = re.compile('Footnote')
regexV = re.compile('Date1 Val')
import csv
import collections
#create list for all rows
data = []
for year in range(2016, 2018):
for month in range(0, 13):
sauce = urllib.request.urlopen(Url1+str(month)+Url2+str(year)+Url3).read().decode('utf-8')
soup = bs(sauce, 'lxml')
select = soup.find_all('option', selected=True)
print (select)
for i in select:
month=select[4].text
#convert year to integer
year=int(select[3].text)
#print(year)
#print(month, year)
#replace , from numeric values, convert to integer
value = [int(x.text.strip().replace(',','')) for x in soup.find_all("td", {"headers": regexV})]
country = [x.text.strip() for x in soup.find_all("a", {"href": regexC})]
quantity = [int(x.text.strip().replace(',','')) for x in soup.find_all("td", {"headers": regexQ})]
for item in zip(country,quantity,value):
#append each row to list data
data.append([year , month] + list(item))
#create Dataframe
cols = ['year','month','country','quantity','value']
df = pd.DataFrame(data=data, columns=cols)
print (df.head())
year month country quantity value
0 2016 January World 279605300 69598782
1 2016 January Trinidad and Tobago 140185912 36503857
2 2016 January Norway 137835921 32414865
3 2016 January United States 1583467 680060
4 2016 February World 1255156 532059