Search code examples
pythonbeautifulsouphtml-parsing

Encountering a problem when trying to scrape (URLError: unknown url type: https)


Hi, I am just starting to learn some coding via tutorials and have started practicing, but I am getting an error when trying to run a test. Please see my code below. I am hoping to get some help.

The following is my code:

import csv
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup as soup

# Scrape the niche.com Arkansas school search page and write one CSV row
# (school name, Niche grade, school type) per result card.
req = Request('https://www.niche.com/k12/search/best-schools/s/arkansas/?gradeLevel=middle&gradeLevel=high&type=traditional&type=charter&type=magnet&type=private', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# No req.close() here: urllib.request.Request only describes the request and
# has no close() method, so calling it raises AttributeError.

page_soup = soup(webpage, "html.parser")
cards = page_soup.findAll("div", {"class": "card"})

# newline='' lets the csv module control line endings itself (otherwise blank
# rows appear on Windows); the with-block closes the file exactly once, after
# every row has been written.
with open('headmasters_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['School', 'Niche_Grade', 'School_Type'])

    for card in cards:
        # [0] takes the first match inside the card; a card missing any of
        # these elements raises IndexError.
        school = card.findAll("h2", {"class": "search-result__title"})[0].text
        niche_grade = card.findAll("figure", {"class": "search-result-grade"})[0].text
        school_type = card.findAll("li", {"class": "search-result-tagline__item"})[0].text

        print("School: " + school)
        print("Niche_Grade: " + niche_grade)
        print("School_Type: " + school_type)

        print()

        csv_writer.writerow([school, niche_grade, school_type])

The error that I'm getting is as follows:

Traceback (most recent call last):
  File "C:\Users\Amdin\Downloads\Webscrape\Edited_Version.py", line 4, in <module>
    webpage = urlopen(req).read()
  File "C:\Users\Amdin\Anaconda3\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\Amdin\Anaconda3\lib\urllib\request.py", line 525, in open
    response = self._open(req, data)
  File "C:\Users\Amdin\Anaconda3\lib\urllib\request.py", line 548, in _open
    'unknown_open', req)
  File "C:\Users\Amdin\Anaconda3\lib\urllib\request.py", line 503, in _call_chain
    result = func(*args)
  File "C:\Users\Amdin\Anaconda3\lib\urllib\request.py", line 1387, in unknown_open
    raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: https>

Solution

  • from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0'}
    r = requests.get("https://www.niche.com/k12/search/best-schools/s/arkansas/?gradeLevel=middle&gradeLevel=high&type=traditional&type=charter&type=magnet&type=private", headers=headers)
    
    soup = BeautifulSoup(r.text, 'html.parser')
    
    name = []
    grade = []
    sctype = []
    for item in soup.findAll("h2", {'class': 'search-result__title'}):
        name.append(item.text)
    for item in soup.select("div[class^=niche__grade]"):
        grade.append(item.text)
    for item in soup.findAll("ul", {'class': 'search-result-tagline'}):
        sctype.append(item.next_element.text)
    
    data = []
    for a, b, c in zip(name, grade, sctype):
        lol = a, b, c
        data.append(lol)
    
    df = pd.DataFrame(data, columns=["School", "Niche_Grade", "School_Type"])
    
    df.to_csv('data.csv', index=False)
    

    Output: check online. [Image: enter image description here]