There is a problem somewhere in my code that I have tried, but failed, to identify. It relates to the loop output not printing and not inserting into my DB correctly. I would like each scraped line of data to print as output and then be inserted into a database table. So far, all I get is one result printed as a duplicate many times (and it doesn't even have the right price).
Actual current output:
Ford C-MAX 2019 1.1 Petrol 0
Ford C-MAX 2019 1.1 Petrol 0
Ford C-MAX 2019 1.1 Petrol 0
...
Desired output as per web page ads (just an example as it's dynamic):
Ford C-MAX 2019 1.1 Petrol 15950
Ford C-MAX 2014 1.6 Diesel 12000
Ford C-MAX 2011 1.6 Diesel 9000
...
Code:
from __future__ import print_function
import requests
import re
import locale
import time
from time import sleep
from random import randint
from currency_converter import CurrencyConverter
c = CurrencyConverter()
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date, datetime, timedelta
import mysql.connector
import numpy as np
import itertools
# Use US English conventions for any locale-aware number/currency formatting.
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )
# Listing-page offsets: 0, 30, 60, ..., 180 (the site paginates 30 ads per page).
pages = np.arange(0, 210, 30)
# Timestamp captured once at script start; reused for every row inserted this run.
entered = datetime.now()
# Scrape target is hard-coded to Ford C-MAX listings.
make = "Ford"
model = "C-MAX"
def insertvariablesintotable(make, model, year, liter, fuel, price, entered):
    """Insert one scraped car record into the FYP.ford_cmax table.

    Creates the table on first use. The parameters map 1:1 onto the table
    columns; connection details are hard-coded for the local dev database.

    Raises:
        mysql.connector.Error: on connection or query failure (propagated
        to the caller after cleanup).
    """
    # Pre-bind so the finally block is safe even when connect() raises.
    # In the original code, a failed connect left `cnx` unbound and the
    # finally block itself crashed with NameError.
    cnx = None
    cursor = None
    try:
        cnx = mysql.connector.connect(user='root', password='', database='FYP',
                                      host='127.0.0.2', port='8000')
        cursor = cnx.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS ford_cmax ( make VARCHAR(15), model VARCHAR(20), '
                       'year INT(4), liter VARCHAR(3), fuel VARCHAR(6), price INT(6), entered TIMESTAMP) ')
        # Parameterized query: values are bound server-side, never interpolated.
        insert_query = """INSERT INTO ford_cmax (make, model, year, liter, fuel, price, entered) VALUES (%s,%s,%s,%s,%s,%s,%s)"""
        record = (make, model, year, liter, fuel, price, entered)
        cursor.execute(insert_query, record)
        cnx.commit()
    finally:
        # Close whichever resources were actually created.
        if cursor is not None:
            cursor.close()
        if cnx is not None and cnx.is_connected():
            cnx.close()
# NOTE(review): this is the broken version from the question, kept as-is for
# context. The structural bug: the details loop and the price loop run one
# after the other, so by the time anything is printed, year/liter/fuel hold
# only the values from the LAST ad scanned — every printed line repeats the
# same details. Indentation was lost in the paste; the nesting below is a
# reconstruction — confirm against the original script.
for response in pages:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # `response` is reused: it starts as the page offset, then becomes the HTTP response.
    response = requests.get("https://www.donedeal.ie/cars/Ford/C-MAX?start=" + str(response), headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Connection opened per page but never used or closed here — resource leak.
    cnx = mysql.connector.connect(user='root', password='', database='FYP', host='127.0.0.2', port='8000')
    cursor = cnx.cursor()
    for details in soup.findAll('ul', attrs={'class': 'card__body-keyinfo'}):
        details = details.text
        #print(details)
        year = details[:4]    # first 4 chars of the key-info text, e.g. "2019"
        liter = details[4:7]  # engine size, e.g. "1.6"
        fuel = details[8:14]  # excludes electric which has 2 extra
        mileage = re.findall("[0-9]*,[0-9][0-9][0-9]..." , details)
        mileage = ''.join(mileage)
        mileage = mileage.replace(",", "")
        if "mi" in mileage:
            mileage = mileage.rstrip('mi')
            mileage = round(float(mileage) * 1.609)  # miles -> km
            mileage = str(mileage)
        if "km" in mileage:
            mileage = mileage.rstrip('km')
        # NOTE(review): `"123" or "1234" or ...` evaluates to just "123" (Python
        # `or` returns its first truthy operand), so only the literal "123" is
        # replaced — and doing so corrupts any mileage containing that substring.
        mileage = mileage.replace("123" or "1234" or "12345" or "123456", "0")
    for price in soup.findAll('p', attrs={'class': 'card__price'}):
        price = price.text
        price = price.replace("No Price", "0")
        price = price.replace("123" or "1234" or "12345" or "123456", "0")  # same `or` issue as above
        price = price.replace(",","")
        price = price.replace("€", "")
        if "p/m" in price:
            #price = price[:-3]
            price = price.rstrip('p/m')
            price = "0"  # monthly-payment ads are zeroed out
        if "£" in price:
            price = price.replace("£", "")
            price = c.convert(price, 'GBP', 'EUR')
            price = round(price)
        # Prints once per price, but year/liter/fuel are stale (see NOTE above).
        print(make, model, year, liter, fuel, price)
        #insertvariablesintotable(make, model, year, liter, fuel, price, entered) #same result as above
I had a look at your code and the website you're trying to scrape. The problem is that your two `for` loops run one after the other over the same page: the `details` loop overwrites `year`, `liter` and `fuel` on every iteration, so by the time the `price` loop prints anything, those variables only hold the values from the last ad scanned.
Here is what you could try instead:
make = "Ford"
model = "C-MAX"
price_list = []    # one entry per ad with a real (non-"p/m") price
details_list = []  # (year, liter, fuel, mileage) tuples, collected page by page
for response in range(1, 60, 30):  # small range for testing; widen to cover all pages
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    }
    response = requests.get(
        "https://www.donedeal.ie/cars/Ford/C-MAX?start=" + str(response),
        headers=headers,
    )
    soup = BeautifulSoup(response.text, "html.parser")
    count = 0
    for details in soup.findAll("ul", attrs={"class": "card__body-keyinfo"}):
        if count == 30:
            break  # only the first 30 ads belong to this page's model/make
        details = details.text
        year = details[:4]    # e.g. "2019"
        liter = details[4:7]  # engine size, e.g. "1.6"
        fuel = details[8:14]  # excludes electric, which has 2 extra chars
        mileage = re.findall("[0-9]*,[0-9][0-9][0-9]...", details)
        mileage = "".join(mileage)
        mileage = mileage.replace(",", "")
        if "mi" in mileage:
            mileage = mileage.rstrip("mi")
            mileage = str(round(float(mileage) * 1.609))  # miles -> km
        if "km" in mileage:
            mileage = mileage.rstrip("km")
        # (Dropped the `replace("123" or "1234" or ...)` line from the original:
        # `or` returns its first truthy operand, so it only ever replaced the
        # literal "123" — corrupting any mileage that contained it.)
        details_list.append((year, liter, fuel, mileage))
        count += 1  # was `count += 1 We update count value` — a SyntaxError (missing '#')
    count = 0
    for price in soup.findAll("p", attrs={"class": "card__price"}):
        if count == 30:
            break
        price = price.text
        price = price.replace("No Price", "0")
        price = price.replace(",", "")
        price = price.replace("€", "")
        if "p/m" in price:
            # Skip monthly-payment ads BEFORE the GBP branch: after round()
            # below, `price` is an int and `"p/m" in price` would raise
            # TypeError. NOTE(review): skipping here still leaves details_list
            # and price_list misaligned when a p/m ad appears — pairing ads by
            # their card container would be the robust fix.
            continue
        if "£" in price:
            price = price.replace("£", "")
            price = c.convert(price, "GBP", "EUR")
            price = round(price)
        price_list.append(price)
        count += 1  # count only prices actually appended
for i in range(len(price_list)):
    print(
        make,
        model,
        details_list[i][0],
        details_list[i][1],
        details_list[i][2],
        price_list[i],
    )
    # insertvariablesintotable(make, model, details_list[i][0], details_list[i][1],
    #                          details_list[i][2], price_list[i], entered)
Edit: I did not add the p/m prices to the list as they were making the lengths of details_list and price_list differ. If you want to add the p/m prices too, you will have to rework the code. Also, you don't want to have the 3 cars at the very bottom of the page as they might not be Ford C-MAX, but other models instead, maybe even other manufacturers.