Search code examples
pythonmysqlbeautifulsoupscreen-scraping

Python Web Scraping: Duplication and output display problem


There is a problem with my code somewhere which I tried but can't identify. It relates to the output of the loops not showing and inserting into my DB correctly. I would like each scraped line of data print as output and then to be inserted into a database table. So far, all I get is just one result printed as a duplicate many times (doesn't even have the right price).

Actual current output:

Ford C-MAX 2019 1.1 Petrol 0
Ford C-MAX 2019 1.1 Petrol 0
Ford C-MAX 2019 1.1 Petrol 0
...

Desired output as per web page ads (just an example as it's dynamic):

Ford C-MAX 2019 1.1 Petrol 15950
Ford C-MAX 2014 1.6 Diesel 12000
Ford C-MAX 2011 1.6 Diesel 9000
...

Code:

from __future__ import print_function
import requests
import re
import locale
import time
from time import sleep
from random import randint
from currency_converter import CurrencyConverter
c = CurrencyConverter()
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date, datetime, timedelta
import mysql.connector
import numpy as np
import itertools

locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )

pages = np.arange(0, 210, 30)

entered = datetime.now()
make = "Ford"
model = "C-MAX"


def insertvariablesintotable(make, model, year, liter, fuel, price, entered):
    try:
        cnx = mysql.connector.connect(user='root', password='', database='FYP', host='127.0.0.2', port='8000')
        cursor = cnx.cursor()

        cursor.execute('CREATE TABLE IF NOT EXISTS ford_cmax ( make VARCHAR(15), model VARCHAR(20), '
                       'year INT(4), liter VARCHAR(3), fuel VARCHAR(6), price INT(6), entered TIMESTAMP) ')

        insert_query = """INSERT INTO ford_cmax (make, model, year, liter, fuel, price, entered) VALUES (%s,%s,%s,%s,%s,%s,%s)"""
        record = (make, model, year, liter, fuel, price, entered)

        cursor.execute(insert_query, record)

        cnx.commit()

    finally:
        if (cnx.is_connected()):
            cursor.close()
            cnx.close()

for response in pages:

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get("https://www.donedeal.ie/cars/Ford/C-MAX?start=" + str(response), headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    cnx = mysql.connector.connect(user='root', password='', database='FYP', host='127.0.0.2', port='8000')
    cursor = cnx.cursor()

    for details in soup.findAll('ul', attrs={'class': 'card__body-keyinfo'}):

        details = details.text
        #print(details)
        year = details[:4]
        liter = details[4:7]
        fuel = details[8:14] #exludes electric which has 2 extra
        mileage = re.findall("[0-9]*,[0-9][0-9][0-9]..." , details)
        mileage = ''.join(mileage)
        mileage = mileage.replace(",", "")
        if "mi" in mileage:
            mileage = mileage.rstrip('mi')
            mileage = round(float(mileage) * 1.609)
        mileage = str(mileage)
        if "km" in mileage:
            mileage = mileage.rstrip('km')
        mileage = mileage.replace("123" or "1234" or "12345" or "123456", "0")

    for price in soup.findAll('p', attrs={'class': 'card__price'}):

        price = price.text
        price = price.replace("No Price", "0")
        price = price.replace("123" or "1234" or "12345" or "123456", "0")
        price = price.replace(",","")
        price = price.replace("€", "")
        if "p/m" in price:
            #price = price[:-3]
            price = price.rstrip('p/m')
            price = "0"
        if "£" in price:
            price = price.replace("£", "")
            price = c.convert(price, 'GBP', 'EUR')
            price = round(price)

    print(make, model, year, liter, fuel, price)

    #insertvariablesintotable(make, model, year, liter, fuel, price, entered) #same result as above

Solution

  • I had a look at your code and the website you're trying to get the data from and it looks like you're retrieving the page, then looping on all the prices you're getting from that page using price as a variable but overwriting it every time you enter the for loop. Same goes for your details for loop.

    Here is what you could try instead:

    make = "Ford"
    model = "C-MAX"
    price_list = [] # we will store prices here
    details_list = [] # and details like year, liter, mileage there
    for response in range(1,60,30): # I changed to a range loop for testing
    
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
        }
        response = requests.get(
            "https://www.donedeal.ie/cars/Ford/C-MAX?start=" + str(response),
            headers=headers,
        )
        soup = BeautifulSoup(response.text, "html.parser")
        count = 0
        for details in soup.findAll("ul", attrs={"class": "card__body-keyinfo"}):
            if count == 30:
                break # Takes us out of the for loop
            details = details.text
            # print(details)
            year = details[:4]
            liter = details[4:7]
            fuel = details[8:14]  # exludes electric which has 2 extra
            mileage = re.findall("[0-9]*,[0-9][0-9][0-9]...", details)
            mileage = "".join(mileage)
            mileage = mileage.replace(",", "")
            if "mi" in mileage:
                mileage = mileage.rstrip("mi")
                mileage = round(float(mileage) * 1.609)
            mileage = str(mileage)
            if "km" in mileage:
                mileage = mileage.rstrip("km")
            mileage = mileage.replace("123" or "1234" or "12345" or "123456", "0")
            details_list.append((year, liter, fuel, mileage)) # end of one loop go-through, we append
            count += 1 We update count value 
        count = 0
        for price in soup.findAll("p", attrs={"class": "card__price"}):
            if count == 30:
                break # Takes us out of the for loop
            price = price.text
            price = price.replace("No Price", "0")
            price = price.replace("123" or "1234" or "12345" or "123456", "0")
            price = price.replace(",", "")
            price = price.replace("€", "")
            if "£" in price:
                price = price.replace("£", "")
                price = c.convert(price, "GBP", "EUR")
                price = round(price)
            if "p/m" in price:
                # price = price[:-3]
                price = price.rstrip("p/m")
                price = "0"
            else:
                price_list.append(price) # end of loop go-through, we append but only if it is not a "p/m" price
                count += 1 # We update count value only when a value is appended to the list
    
    for i in range(len(price_list)):
        print(
        make,
        model,
        details_list[i][0],
        details_list[i][1],
        details_list[i][2],
        price_list[i],
    )
        #add your insertvariablesintotable(make,model,details_list[i][0], details_list[i][1],details_list[i][2],price_list[i]) there
    

    Edit: I did not add the p/m prices to the list as they were making the lengths of details_list and price_list differ. If you want to add the p/m prices too, you will have to rework the code. Also, you don't want to have the 3 cars at the very bottom of the page as they might not be Ford C-MAX, but other models instead, maybe even other manufacturers.