I'm scraping a website to gather the ten most recent articles for each of several keywords. Once I have my data (keyword used, article name, URL/hyperlink, and publication date), I want to write it all to an .xls file. So far the script only writes the results for the last keyword instead of all four; it keeps overwriting the same section of the spreadsheet. How can I write out my entire list, not just the most recent section?
import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")

keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')

for keyword in keywords:
    keyword.replace("+", " ")

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

def article_fetch(keyword):
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    mylist = []
    cols = "KeyWord", "Article", "URL", "Publication Date"
    mylist.append(cols)
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link")'.format(article_url)
        item = [keyword, article, Formula(link), timestamp]
        mylist.append(item)
    book = xlwt.Workbook()
    sheet = book.add_sheet("Articles")
    for i, row in enumerate(mylist):
        for j, col in enumerate(row):
            sheet.write(i, j, col)
    book.save("C:\Python\American Banker\American Banker {}.xls".format(today))

for keyword in keywords:
    article_fetch(keyword)

print('Workbook Saved')
I would expect to see my entire list, with results for all four keywords; however, I am only seeing the results for the last keyword.
The problem is that article_fetch re-creates mylist, builds a fresh workbook, and saves it over the same file on every call, so each keyword's results replace the previous keyword's. I've moved the list initialization and the Excel file generation out of the function, to the end of the script:
import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")

keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')

# Note: this loop has no effect -- str.replace returns a new string,
# which is discarded here. None of the keywords contain '+' anyway.
for keyword in keywords:
    keyword.replace("+", " ")

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

def article_fetch(keyword):
    # Fetch the search results page and append one row per article
    # to the shared mylist; no workbook is created or saved here anymore.
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link")'.format(article_url)
        item = [keyword, article, Formula(link), timestamp]
        mylist.append(item)

# One list, initialized once, shared by every article_fetch call.
mylist = []
cols = "KeyWord", "Article", "URL", "Publication Date"
mylist.append(cols)

for keyword in keywords:
    article_fetch(keyword)

# Build and save the workbook once, after all keywords are collected.
book = xlwt.Workbook()
sheet = book.add_sheet('Articles')
for i, row in enumerate(mylist):
    for j, col in enumerate(row):
        sheet.write(i, j, col)
book.save("American Banker {}.xls".format(today))

print('Workbook Saved')
No data is lost anymore: the workbook is built from one accumulated list and saved a single time.
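As a side note, a slightly cleaner variant is to have article_fetch return its rows rather than append to a module-level list, which avoids the global entirely. This is only a sketch under the same assumptions about the page structure (the feed-item divs, h4 titles, and timestamp spans), not something tested against the live site:

import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")
keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')
headers = {'User-Agent': 'Mozilla/5.0'}

def article_fetch(keyword):
    """Return one row per article found for the given keyword."""
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    rows = []
    for articles in soup.find_all("div", "feed-item"):
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        rows.append([keyword,
                     articles.find("h4").text.strip(),
                     Formula('HYPERLINK("{}", "Link")'.format(article_url)),
                     articles.find("span", "timestamp").text.strip()])
    return rows

# Collect every keyword's rows first, then write the workbook once.
mylist = [("KeyWord", "Article", "URL", "Publication Date")]
for keyword in keywords:
    mylist.extend(article_fetch(keyword))

book = xlwt.Workbook()
sheet = book.add_sheet('Articles')
for i, row in enumerate(mylist):
    for j, col in enumerate(row):
        sheet.write(i, j, col)
book.save("American Banker {}.xls".format(today))
print('Workbook Saved')

Because article_fetch now returns its results, there is no ordering constraint between the function definition and the list initialization, and the function can be tested in isolation.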