Search code examples
pythonlistindexoutofrangeexception

List index out of range with web scraping


my code has the list index out of range error.

import requests
from bs4 import BeautifulSoup
import re
import pyperclip
# import pandas as pd
import csv



# Get a name of the agency

def getAgency(pageURL):

    res = requests.get(pageURL)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')
    elems = soup.select('h3.company-name > a')

    names = []
    for i in range(len(elems)):
        names.append(str(elems[i].text.strip()))
    return names


def getWebsite(pageURL):

    res = requests.get(pageURL)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')
    elems = soup.select('li.website-link.website-link-a > a')

    sites = []
    for elem in elems:
        if elem.find('/your-project') != -1:
            elems.remove(elem)

        else:
            pass

    for i in range(len(elems)):
        sites.append(str(elems[i]["href"]))
    return sites

allNames = []
for pagenumber in range(0,1):
    names = getAgency('https://clutch.co/agencies/digital?page=' + str(pagenumber))
    allNames += names

allSites = []
for pagenumber in range(0,1):
    sites = getWebsite('https://clutch.co/agencies/digital?page=' + str(pagenumber))
    allSites += sites

final = []
with open('text.csv', 'w', newline='') as f:
    a = csv.writer(f, delimiter=',')
    for index in range(len(allNames)):
        final.append(",".join([allNames[index].replace(",", " "), allSites[index]]))
        a.writerows(final)

finalresult = "\n".join(final) 
pyperclip.copy(finalresult)

I know what causes this error, this if statement.

for elem in elems:
        if elem.find('/your-project') != -1:
            elems.remove(elem)

        else:
            pass

When I delete an item from the list, the index number decreases by 1. And so, in this for loop, the index will be the length of the allNames, and I used the same index with the allSites. So, when the index will be the last item of allNames, the allSites will throw an error because it will be out of range. What can I do to solve this problem?

for index in range(len(allNames)):
            final.append(",".join([allNames[index].replace(",", " "), allSites[index]]))
            a.writerows(final)

Solution

  • You need to parse the code in one iteration and spit out pairs of names and sites, and then filter to throw out the pairs.

    As your code is right now the lists aren't even the same length before you try to filter them by '/your-project' because there are advertising links that match 'li.website-link.wibise-link-a > a'