Search code examples
pythonbeautifulsoupsteamdata-extraction

Python, Beautiful soup, how to extract data and print to csv file


I have been working on this for a while and cannot find an answer or figure it out myself. I am extracting data from Steam and need to turn each game's platform support into a flag stored as a string: for example, if a game supports Mac it should show up in my list as "1", and if it does not, as "0". My problem is that the check only seems to take effect once, and after that every game ends up as "1".

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import timedelta
import datetime
import time
import csv
# Steam store search URL; only page 1 of the `specials=1` listing is fetched.
my_url = 'https://store.steampowered.com/search/?specials=1&page=1'

# Open the connection, download the raw page bytes, then close the connection.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# Parse the HTML with the standard-library parser backend.
page_soup = soup(page_html, "html.parser")

# Collect every <div class="responsive_search_name_combined">; presumably there
# is one such div per search result -- verify against the live page.
containers = page_soup.findAll("div", {"class":"responsive_search_name_combined"})

filename = "products.csv"
# NOTE(review): the file is opened without a `with` block and is only closed by
# the explicit f.close() after the loop, so an exception raised while
# processing a result leaves the handle open.
f = open(filename, "w", encoding='UTF-8')
# Header row, written by hand. NOTE(review): the ", " separators and the
# trailing space before "\n" become part of the column names when the file is
# parsed (e.g. " Release_date", " Time ").
headers = "Titles, Release_date, Discount, Price before, Price after, Positive review, Reviewers, Win, Lin, Osx, Time \n"
f.write(headers)
#f.write(headers)
#len(containers)
#containers[1]
# Timestamp taken once, before the loop; the same formatted string `st` is
# appended to every row written below.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)

# For each search-result container: pull out the individual fields, print them
# for debugging, and append one comma-separated row to the open CSV file `f`.
#
# NOTE(review): the regex patterns below are written as ordinary (non-raw)
# strings; sequences such as '\d', '\W' and '\A' are invalid string escapes
# that newer Python versions warn about. Raw strings (r'...') avoid this.
for container in containers:
    # Game title: text of the first <span class="title">.
    titles_container = container.findAll("span",{"class":"title"})
    titl = titles_container[0].text
    print(titl)

    # Release date: text of the first <div class="search_released">.
    product_container = container.findAll("div",{"class":"search_released"})
    product_date = product_container[0].text
    print(product_date)

    # Discount: text of the first <div class="search_discount">.
    product_discount_container = container.findAll("div",{"class":"search_discount"})
    product_discount = product_discount_container[0].text
    print(product_discount)

    # "Price before": the first two "digits followed by one non-word character"
    # matches in the price text are concatenated. Raises IndexError when the
    # text yields fewer than two matches.
    product_price_container_before = container.findAll("div",{"class":"search_price"})
    product_price_before = product_price_container_before[0].text
    test = re.findall('(\d+\W)',product_price_before)
    testing = test[0] + test[1]
    print(testing)

    # "Price after": remove the first <span> from each "discounted" div so only
    # the remaining text is left. The loop variable is reused in f.write below.
    # NOTE(review): if a container has no "discounted" div the loop body never
    # runs, so `product_price_after` still refers to the previous result (or is
    # undefined for the very first one).
    product_price_container_after = container.findAll("div",{"class":"discounted"})
    for product_price_after in product_price_container_after:
        product_price_after.find("span").extract()
        print(product_price_after.text)

    # Review summary: findall returns one (group1, group2) tuple per match.
    # a[0][0] (first match, '\d+%' group) and a[1][1] (second match, 2+ digit
    # group) are what end up in the "Positive review" / "Reviewers" columns.
    # NOTE(review): like the price loop above, `a` and `c` keep their values
    # from the previous result when this container has no review summary.
    product_review_container = container.findAll("span",{"class":"search_review_summary"})
    for product_review in product_review_container:
        prr = product_review.get('data-tooltip-html')
        a = re.findall('(\d+%)|(\d+\d+)',prr)
        c = a[1][1]
        print(c)


    # Platforms: each <span class="platform_img ..."> carries the platform name
    # as its second class token; the three regexes test whether that token
    # starts with "w", "m" or "l".
    product_platform_container = container.findAll("span",{"class":"platform_img"})
    for product_platform in product_platform_container:
        platform = product_platform.get('class')[1]
        platt = re.findall('(\Aw)',platform)
        plattt = re.findall('(\Am)',platform)
        platttt = re.findall('(\Al)',platform)
        print(platt)
        print(plattt)
        print(platttt)

        # NOTE(review): this is the cause of the "always 1" symptom. `plattt`
        # is either [] or ['m'], so this loop body only runs when the token
        # starts with "m", and then `"m" in p` is always true -- the elif
        # branch can never execute. `macken` is never reset per container, so
        # once any result sets it to "1" every later row reuses that value.
        for p in plattt:
            if "m" in p:
                macken = "1"    
            elif "m" not in p:
                macken = "0"
            print(macken)


    # Build the CSV row by string concatenation.
    # NOTE(review):
    #  - `y` is not assigned anywhere in the visible script; unless it is
    #    defined elsewhere (e.g. an earlier notebook cell) this raises NameError.
    #  - The header order is "Win, Lin, Osx" but the row writes y, macken,
    #    "blah", so the Mac flag lands under the "Lin" column.
    #  - `titl` is written unescaped, so a comma in a title shifts the columns.
    f.write(titl + "," + product_date.replace(",","") + "," + product_discount.replace("\n", "") + "," + testing.replace(",", ".") + "," + product_price_after.text.replace("\n","").replace(" ", "").replace(",",".").replace("\t\t\t\t\t\t\t","") + "," + a[0][0] + "," + c.replace(",","") + "," + y + "," + macken + "," + "blah" + "," + st + "\n")

# Close the output file, then read the CSV back with pandas.
f.close()
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0, where `on_bad_lines="skip"` is the replacement. The returned
# DataFrame is not assigned, so it is only visible when this is the last
# expression of an interactive/notebook cell.
pd.read_csv("products.csv", error_bad_lines=False)

I am also writing the results to a CSV file, and in that file the platform column just says 1, 1, 1, 1, 1...

I am getting the data from this page: 'https://store.steampowered.com/search/?specials=1&page=1'

I know this question is a little confusing, so hopefully you can help; if you need any more code, let me know.


Solution

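  • Your condition was wrong, which is why you always get 1: `macken` is only assigned inside the loop over the regex matches, so it is never reset to "0" for a game without Mac support. See the code below!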

    import requests,csv
    from bs4 import BeautifulSoup
    
    
    # Download the first page of Steam's "specials" search results. The timeout
    # keeps the script from hanging forever on a stalled connection, and
    # raise_for_status() stops early on an HTTP error instead of parsing an
    # error page into an empty CSV.
    req = requests.get(
        'https://store.steampowered.com/search/?specials=1&page=1',
        timeout=30,
    )
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')

    # One dict per search result. Each platform flag is decided independently
    # for every game: '1' when a <span> carrying that platform's class exists
    # inside the result's name block, '0' otherwise. Nothing is carried over
    # from the previous game, which is what fixes the "always 1" problem.
    data = []
    for game in soup.find_all('div', attrs={'class': 'col search_name ellipsis'}):
        data.append({
            # Keep the title as str. Encoding it to bytes here would make the
            # csv module write the bytes repr (b'...') into the file on
            # Python 3; the encoding is applied once, when the file is opened.
            'title': game.find('span', attrs={'class': 'title'}).text,
            'win': '1' if game.find('span', attrs={'class': 'win'}) else '0',
            'mac': '1' if game.find('span', attrs={'class': 'mac'}) else '0',
            'linux': '1' if game.find('span', attrs={'class': 'linux'}) else '0',
        })

    # newline='' lets the csv module control line endings; the explicit UTF-8
    # encoding makes non-ASCII titles independent of the platform default.
    # DictWriter also quotes titles that contain commas.
    with open('data.csv', 'w', newline='', encoding='utf-8') as f:
        fields = ['title', 'win', 'mac', 'linux']
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        writer.writerows(data)