my code has the list index out of range error.
import requests
from bs4 import BeautifulSoup
import re
import pyperclip
# import pandas as pd
import csv
# Get a name of the agency
def getAgency(pageURL):
res = requests.get(pageURL)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')
elems = soup.select('h3.company-name > a')
names = []
for i in range(len(elems)):
names.append(str(elems[i].text.strip()))
return names
def getWebsite(pageURL):
res = requests.get(pageURL)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')
elems = soup.select('li.website-link.website-link-a > a')
sites = []
for elem in elems:
if elem.find('/your-project') != -1:
elems.remove(elem)
else:
pass
for i in range(len(elems)):
sites.append(str(elems[i]["href"]))
return sites
allNames = []
for pagenumber in range(0,1):
names = getAgency('https://clutch.co/agencies/digital?page=' + str(pagenumber))
allNames += names
allSites = []
for pagenumber in range(0,1):
sites = getWebsite('https://clutch.co/agencies/digital?page=' + str(pagenumber))
allSites += sites
final = []
with open('text.csv', 'w', newline='') as f:
a = csv.writer(f, delimiter=',')
for index in range(len(allNames)):
final.append(",".join([allNames[index].replace(",", " "), allSites[index]]))
a.writerows(final)
finalresult = "\n".join(final)
pyperclip.copy(finalresult)
I know what causes this error, this if statement.
for elem in elems:
if elem.find('/your-project') != -1:
elems.remove(elem)
else:
pass
When I delete an item from the list, the index number decreases by 1. And so, in this for loop, the index will be the length of the allNames, and I used the same index with the allSites. So, when the index will be the last item of allNames, the allSites will throw an error because it will be out of range. What can I do to solve this problem?
for index in range(len(allNames)):
final.append(",".join([allNames[index].replace(",", " "), allSites[index]]))
a.writerows(final)
You need to parse the code in one iteration and spit out pairs of names and sites, and then filter to throw out the pairs.
As your code is right now the lists aren't even the same length before you try to filter them by '/your-project' because there are advertising links that match 'li.website-link.wibise-link-a > a'