Tags: python, pandas, web-scraping, beautifulsoup, data-science

Data scraping from Vivino.com


I am trying to collect data from vivino.com, but the DataFrame comes out empty. I can see that my soup is collecting the website's HTML, so I can't see where my error is.

My code:

import requests
import pandas as pd
from bs4 import BeautifulSoup

def get_data():

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    r = requests.get("https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1UMtNrLA1NTBQS660DQhRS7Z1DQ1SKwDKpqfZliUWZaaWJOao5SfZFhRlJqeq5dsmFierlZdExwJVJFcWA-mCEgC1YxlZ", headers=headers)  # , proxies=proxies
    content = r.content
    soup = BeautifulSoup(content, "html.parser")

and as I need the winery, the wine name, and the ratings, this is how I've tried to extract them:

    alls = []
    for d in soup.findAll('div', attrs={'class':'explorerCard__titleColumn--28kWX'}):
        
        Winery = d.find_all("a", attrs={"class":"VintageTitle_winery--2YoIr"})
        Wine = d.find_all("a", attrs={"class":"VintageTitle_wine--U7t9G"})
        Rating = d.find_all("div", attrs={"class":"VivinoRatingWide_averageValue--1zL_5"})
        num_Reviews = d.find_all("div", attrs={"class":"VivinoRatingWide__basedOn--s6y0t"})
        Stars = d.find_all("div", attrs={"aria-label":"rating__rating--ZZb_x rating__vivino--1vGCy"})

        alll=[]

        if Winery is not None:
            #print(n[0]["alt"])
            alll.append(Winery.text)

        else:
            alll.append("unknown-winery")

        if Wine is not None:
            #print(wine.text)
            alll.append(Wine.text)
        else:
            alll.append("0")

        if Rating is not None:
            #print(rating.text)
            alll.append(Rating.text)

        else:
            alll.append("0")
...

and then getting the data into a DataFrame:

results = []
for i in range(1, no_pages+1):
    results.append(get_data())
flatten = lambda l: [item for sublist in l for item in sublist]
df = pd.DataFrame(flatten(results),columns=['Winery','Wine','Rating','num_review', 'Stars'])
df.to_csv('redwines.csv', index=False, encoding='utf-8')

Solution

  • The results on the explore page are loaded by JavaScript from a JSON API, so they never appear in the HTML that requests downloads. Query that API directly instead; note that the request needs the User-Agent header set:

    import requests
    import pandas as pd

    # Hit the JSON endpoint the explore page itself calls, instead of parsing HTML
    r = requests.get(
        "https://www.vivino.com/api/explore/explore",
        params={
            "country_code": "FR",
            "country_codes[]": "pt",
            "currency_code": "EUR",
            "grape_filter": "varietal",
            "min_rating": "1",
            "order_by": "price",
            "order": "asc",
            "page": 1,
            "price_range_max": "500",
            "price_range_min": "0",
            "wine_type_ids[]": "1",
        },
        headers={
            # without a browser-like User-Agent, Vivino rejects the request
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
        },
    )

    # Each match holds a vintage; pull the winery, wine name + year, and rating stats
    results = [
        (
            t["vintage"]["wine"]["winery"]["name"],
            f'{t["vintage"]["wine"]["name"]} {t["vintage"]["year"]}',
            t["vintage"]["statistics"]["ratings_average"],
            t["vintage"]["statistics"]["ratings_count"],
        )
        for t in r.json()["explore_vintage"]["matches"]
    ]
    dataframe = pd.DataFrame(results, columns=["Winery", "Wine", "Rating", "num_review"])

    print(dataframe)
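
    For reference, you can confirm why the original BeautifulSoup approach came back empty: the hashed class names the question selects on do not appear in the downloaded HTML, which indicates the cards are rendered client-side. A minimal sketch of that check, reusing the explore URL from the question:

        import requests
        from bs4 import BeautifulSoup

        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"}
        r = requests.get(
            "https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1UMtNrLA1NTBQS660DQhRS7Z1DQ1SKwDKpqfZliUWZaaWJOao5SfZFhRlJqeq5dsmFierlZdExwJVJFcWA-mCEgC1YxlZ",
            headers=headers,
        )
        soup = BeautifulSoup(r.content, "html.parser")

        # expected to print [] -- the hashed class names are not in the static HTML
        print(soup.find_all("div", attrs={"class": "explorerCard__titleColumn--28kWX"}))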
    

    You will need to increment the page parameter to iterate over the next pages of results; a sketch of that loop follows.
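
    To rebuild the question's redwines.csv across several pages, you can wrap the request in a small loop. A minimal sketch, assuming the same query parameters as above; no_pages is a hypothetical page count you choose, not something the API reports:

        import time

        import pandas as pd
        import requests

        PARAMS = {
            "country_code": "FR",
            "country_codes[]": "pt",
            "currency_code": "EUR",
            "grape_filter": "varietal",
            "min_rating": "1",
            "order_by": "price",
            "order": "asc",
            "price_range_max": "500",
            "price_range_min": "0",
            "wine_type_ids[]": "1",
        }
        HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"}

        def get_page(page):
            """Fetch one page of explore results as (winery, wine, rating, num_review) rows."""
            r = requests.get(
                "https://www.vivino.com/api/explore/explore",
                params={**PARAMS, "page": page},
                headers=HEADERS,
            )
            return [
                (
                    t["vintage"]["wine"]["winery"]["name"],
                    f'{t["vintage"]["wine"]["name"]} {t["vintage"]["year"]}',
                    t["vintage"]["statistics"]["ratings_average"],
                    t["vintage"]["statistics"]["ratings_count"],
                )
                for t in r.json()["explore_vintage"]["matches"]
            ]

        no_pages = 5  # hypothetical; set to however many pages you want
        rows = []
        for page in range(1, no_pages + 1):
            rows.extend(get_page(page))
            time.sleep(1)  # small delay between requests to be polite to the API

        df = pd.DataFrame(rows, columns=["Winery", "Wine", "Rating", "num_review"])
        df.to_csv("redwines.csv", index=False, encoding="utf-8")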