I'm attempting to scrape data from a website, but I'm running into trouble once multiple pages are involved: every run ends with the error 'All arrays must be of the same length'. Can somebody help me identify where I went wrong? Below is the code I'm using:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def replaced(text):
    return text.replace('\n\n\n\n\n', '')

total_page = 3
current_page = 1

judul_list = []
harga_list = []
distance = []
transmit = []
location = []
sp = []
rec_seller = []

while current_page <= total_page:
    url = f"https://www.mobil123.com/mobil-dijual/indonesia?page_number={current_page}&page_size=25"
    headers = {"User-Agent": ""}
    page_request = requests.get(url, headers=headers)
    soup = BeautifulSoup(page_request.content, "html.parser")
    containers = soup.find_all('div', {'class': 'grid'})
    container = containers[0]

    judul = container.findAll('h2', {'class': 'listing__title epsilon flush'})
    judul_list += [replaced(i.text) for i in judul]

    harga = container.findAll('div', {'class': 'listing__price delta weight--bold'})
    harga_list += [replaced(j.text) for j in harga]

    specs = container.findAll('div', {'class': 'listing__specs soft-quarter--ends soft-half--sides milli'})
    specs_list = [replaced(k.text) for k in specs]
    distance += [k.split('|')[1].strip() for k in specs_list]
    transmit += [k.split('|')[2].strip() for k in specs_list]
    location += [k.split('|')[3].strip() for k in specs_list]
    sp += [k.split('|')[4].strip() for k in specs_list]
    rec_seller += [k.split('|')[5].strip() for k in specs_list]

    current_page += 1

tahun = [a.split()[0].strip('|') for a in judul_list]
merek = [a.split()[1].strip('|') for a in judul_list]
series = [a.split()[2].strip('|') for a in judul_list]

# Create DataFrame
data = {
    'Tahun': tahun,
    'Merek': merek,
    'Series': series,
    'Harga': harga_list,
    'Distance': distance,
    'Transmit': transmit,
    'Location': location,
    'SP': sp,
    'Rec_Seller': rec_seller
}
df = pd.DataFrame(data)  # this is the line that raises 'All arrays must be of the same length'
In newer code, avoid the old syntax findAll(); instead use find_all(), or select() with CSS selectors. For more, take a minute to check the docs.

Instead of using a multitude of separate lists, whose equality in length cannot be guaranteed, just use a list of dictionaries. This also has the charming advantage that missing values are simply ignored during the transformation into a DataFrame.

To do this, also change your strategy of selecting the elements: focus on the containers and iterate over them to extract the respective contents. Furthermore, the generation of the specs can be kept more generic, and your def to replace the line breaks is not necessary; simply use get_text() with the parameter strip=True. Both points are demonstrated with toy data right below.
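A minimal, self-contained sketch of those two behaviours (toy markup and toy data, not the live page):

```python
from bs4 import BeautifulSoup
import pandas as pd

# Parallel lists of unequal length reproduce your exact error:
# pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2]})  # ValueError: All arrays must be of the same length

# A list of dictionaries is forgiving: a missing key simply becomes NaN.
print(pd.DataFrame([{'a': 1, 'b': 2}, {'a': 3}]))
#    a    b
# 0  1  2.0
# 1  3  NaN

# get_text(strip=True) already trims surrounding whitespace and newlines,
# so a custom replace('\n\n\n\n\n', '') helper is not needed.
h2 = BeautifulSoup('<h2>\n\n\n  2023 Suzuki Ignis\n\n</h2>', 'html.parser').h2
print(h2.get_text(strip=True))  # 2023 Suzuki Ignis
```

Applied to your scraper, the whole thing becomes: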
import requests
from bs4 import BeautifulSoup
import pandas as pd

total_page = 3
current_page = 1
data = []

while current_page <= total_page:
    url = f"https://www.mobil123.com/mobil-dijual/indonesia?page_number={current_page}&page_size=25"
    headers = {"User-Agent": ''}  # fill in a real User-Agent string if the site blocks the default one
    page_request = requests.get(url, headers=headers)
    soup = BeautifulSoup(page_request.content, "html.parser")

    # One dict per listing: iterate the containers instead of keeping parallel lists.
    for e in soup.select('article.listing'):
        price = e.find('div', {'class': 'listing__price delta weight--bold'})
        d = {
            'judul': e.h2.get_text(strip=True),
            'harga': price.get_text(strip=True) if price else None
        }
        # Each spec is an <i class="icon icon--meter"> (etc.) followed by its text,
        # so derive the column name from the icon class and take the node after it.
        # Note: select on the listing (e), not on soup, so the specs stay per listing.
        d.update({
            i.get('class')[-1].split('--')[-1]: i.next
            for i in e.select('.listing__specs i')
            if 'thumb' not in i.get('class')[-1]
        })
        data.append(d)
    current_page += 1

pd.DataFrame(data)
Output:

|  | judul | harga | meter | transmission | location | user-formal |
|---|---|---|---|---|---|---|
| 0 | 2023 Suzuki Ignis 1.2 GX Hatchback | Rp 175.000.000 | 0 - 5K KM | Automatic | DKI Jakarta | Dealer |
| 1 | 2023 Suzuki Jimny 1.5 Wagon | Rp 475.000.000 | 0 - 5K KM | Automatic | DKI Jakarta | Dealer |
| ... | ... | ... | ... | ... | ... | ... |
| 73 | 2019 Mercedes-Benz GLS400 3.0 4MATIC AMG SUV - VERY LOW ODO | Rp 1.250.000.000 | 70 - 75K KM | Automatic | Jawa Barat | Dealer |
| 74 | 2018 Toyota Voxy 2.0 Wagon | Rp 319.000.000 | 70 - 75K KM | Automatic | Jawa Barat | Dealer |
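If you still want the separate Tahun, Merek and Series columns from your original attempt, they can be derived from judul after the DataFrame is built. A minimal sketch, assuming (as your own split() did) that every title starts with year, make and series:

```python
df = pd.DataFrame(data)

# Split the title on whitespace into at most four parts;
# expand=True pads shorter titles with None instead of raising.
parts = df['judul'].str.split(n=3, expand=True)
df[['Tahun', 'Merek', 'Series']] = parts[[0, 1, 2]]
```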