Search code examples
python dataframe web-scraping beautifulsoup python-requests

How to Extract Data from Multiple Pages Using BeautifulSoup?


I'm attempting to scrape data from a website but I'm encountering issues with multiple pages. Somehow, my iterations always result in the error message 'All arrays must be of the same length'. Can somebody help me identify where I went wrong? Below is the code I'm using:

import requests
from bs4 import BeautifulSoup
import pandas as pd

def replaced(text):
    """Return *text* with every run of five consecutive newlines removed."""
    five_newlines = '\n\n\n\n\n'
    # Splitting on the separator and re-joining with nothing drops each
    # occurrence, scanning left to right just like str.replace does.
    pieces = text.split(five_newlines)
    return ''.join(pieces)

# Pages 1..total_page are visited by the while-loop below.
total_page = 3
current_page = 1

# One accumulator list per output column. Each list is extended
# independently on every page, so nothing keeps their lengths in sync.
judul_list = []
harga_list = []
distance = []
transmit = []
location = []
sp = []
rec_seller = []

while current_page <= total_page:
    # NOTE(review): the URL is an empty f-string in the post - presumably
    # redacted; it does not use current_page as written.
    url = f""
    # Binds the function object without calling it; `req` is never used below.
    req = requests.get
    # NOTE(review): the User-Agent value is missing, so this line is not
    # valid Python as posted - presumably redacted as well.
    headers = {"User-Agent": }
    page_request = requests.get(url, headers=headers)
    soup = BeautifulSoup(page_request.content, "html.parser")
    containers = soup.find_all('div', {'class' : 'grid'})
    # Only the first 'grid' div is searched; raises IndexError if none exists.
    container = containers[0]

    # Titles, prices and specs are collected by three separate searches.
    # If any listing lacks one of these elements, the corresponding list
    # ends up shorter than the others.
    judul = container.findAll('h2', {'class' : 'listing__title epsilon flush'})
    judul_list += [replaced(i.text) for i in judul]
    harga = container.findAll('div', {'class' : 'listing__price delta weight--bold'})
    harga_list += [replaced(j.text) for j in harga]
    specs = container.findAll('div', {'class' : 'listing__specs soft-quarter--ends soft-half--sides milli'})
    specs_list = [replaced(k.text) for k in specs]
    # Each specs text is split on '|' and fields 1-5 are taken by position;
    # a specs string with fewer than six fields raises IndexError.
    distance += [k.split('|')[1].strip() for k in specs_list]
    transmit += [k.split('|')[2].strip() for k in specs_list]
    location += [k.split('|')[3].strip() for k in specs_list]
    sp += [k.split('|')[4].strip() for k in specs_list]
    rec_seller += [k.split('|')[5].strip() for k in specs_list]

    current_page += 1

# The first three whitespace-separated tokens of each title are used as
# year, make and series; any '|' at the token edges is stripped.
tahun = [a.split()[0].strip('|') for a in judul_list]
merek = [a.split()[1].strip('|') for a in judul_list]
series = [a.split()[2].strip('|') for a in judul_list]

# Create DataFrame
# NOTE(review): presumably passed to pd.DataFrame afterwards; the reported
# 'All arrays must be of the same length' error would occur there when the
# lists above differ in length.
data = {
    'Tahun': tahun,
    'Merek': merek,
    'Series': series,
    'Harga': harga_list,
    'Distance': distance,
    'Transmit': transmit,
    'Location': location,
    'SP': sp,
    'Rec_Seller': rec_seller
}

Solution

  • In new code, avoid the old findAll() syntax; use find_all() or select() with CSS selectors instead. For more details, take a minute to check the docs.


    Instead of using a multitude of different lists, whose equality in length cannot be guaranteed, just try a list of dictionaries - this also has the charming advantage that missing values are simply ignored during the transformation into a dataframe.

    To do this, also change your strategy of selecting the elements, focus on the containers and iterate these to extract the respective contents.

    Furthermore, in case of doubt, the generation of the specs could also be made more generic, and your function to replace the line breaks is not necessary: simply use get_text() with the parameter strip=True.

    Example
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    
    # Scrape the first `total_page` result pages and collect one dict per
    # listing. Building a list of dicts (instead of parallel lists) lets
    # pandas fill missing keys with NaN rather than failing on unequal lengths.
    total_page = 3

    data = []

    for current_page in range(1, total_page + 1):
        url = f"https://www.mobil123.com/mobil-dijual/indonesia?page_number={current_page}&page_size=25"
        headers = {"User-Agent": ''}
        # A timeout keeps the script from hanging forever on a stalled connection.
        page_request = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page_request.content, "html.parser")
        for e in soup.select('article.listing'):
            # Look the price element up once; it may be absent for a listing.
            harga = e.find('div', {'class': 'listing__price delta weight--bold'})
            d = {
                'judul': e.h2.get_text(strip=True),
                'harga': harga.get_text(strip=True) if harga is not None else None,
            }
            # Search the spec icons inside THIS listing only. Selecting from
            # `soup` would scan the whole page and give every row the specs
            # of the last listing on it. The last CSS class of each icon
            # (text after '--') names the column; the node following the
            # icon holds the value. Thumbnail icons are skipped.
            d.update({
                icon.get('class')[-1].split('--')[-1]: icon.next_element
                for icon in e.select('.listing__specs i')
                if 'thumb' not in icon.get('class')[-1]
            })
            data.append(d)

    pd.DataFrame(data)
    
    judul harga meter transmission location user-formal
    0 2023 Suzuki Ignis 1.2 GX Hatchback Rp 175.000.000 0 - 5K KM Automatic DKI Jakarta Dealer
    1 2023 Suzuki Jimny 1.5 Wagon Rp 475.000.000 0 - 5K KM Automatic DKI Jakarta Dealer
    ...
    73 2019 Mercedes-Benz GLS400 3.0 4MATIC AMG SUV - VERY LOW ODO Rp 1.250.000.000 70 - 75K KM Automatic Jawa Barat Dealer
    74 2018 Toyota Voxy 2.0 Wagon Rp 319.000.000 70 - 75K KM Automatic Jawa Barat Dealer