Search code examples
pythonweb-scrapingbeautifulsouphtml-parsing

BeautifulSoup getting href of a list - need to simplify the script - replace multiprocessing


I have the following soup:

<a href="some_url">next</a>
<span class="class">...</span>

From this I want to extract the href, "some_url" This I want to extract the href, "some_url"

and the whole list of the pages that are listed on this page: https://www.catholic-hierarchy.org/diocese/laa.html

note: there are a whole lot of links to sub-pages: which i need to parse. at the moment: getting all the data out it : -dioceses -Urls -description -contact-data -etc. etx.

The example below will grab all URLs of dioceses, get some info about each of them and creates final dataframe. To speed-up the process multiprocessing.Pool is used:

but wait: how to get this scraper running without the support of the multiprocessing!? i want to run it in Colab - therefore in need to get rid of the multiprocessing-feature.

How to achieve this..!?

import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool


def get_dioceses_urls(section_url):
    dioceses_urls = set()

    while True:
        print(section_url)

        soup = BeautifulSoup(
            requests.get(section_url, headers=headers).content, "lxml"
        )
        for a in soup.select('ul a[href^="d"]'):
            dioceses_urls.add(
                "https://www.catholic-hierarchy.org/diocese/" + a["href"]
            )

        # is there Next Page button?
        next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
        if next_page:
            section_url = (
                "https://www.catholic-hierarchy.org/diocese/"
                + next_page["href"]
            )
        else:
            break

    return dioceses_urls


def get_diocese_info(url):
    print(url)

    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html5lib")

    data = {
        "Title 1": soup.h1.get_text(strip=True),
        "Title 2": soup.h2.get_text(strip=True),
        "Title 3": soup.h3.get_text(strip=True) if soup.h3 else "-",
        "URL": url,
    }

    li = soup.find(
        lambda tag: tag.name == "li"
        and "type of jurisdiction:" in tag.text.lower()
        and tag.find() is None
    )
    if li:
        for l in li.find_previous("ul").find_all("li"):
            t = l.get_text(strip=True, separator=" ")
            if ":" in t:
                k, v = t.split(":", maxsplit=1)
                data[k.strip()] = v.strip()

    # get other info about the diocese
    # ...

    return data


if __name__ == "__main__":
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0"
    }

    # get main sections:
    url = "https://www.catholic-hierarchy.org/diocese/laa.html"
    soup = BeautifulSoup(
        requests.get(url, headers=headers).content, "html.parser"
    )

    main_sections = [url]
    for a in soup.select("a[target='_parent']"):
        main_sections.append(
            "https://www.catholic-hierarchy.org/diocese/" + a["href"]
        )

    all_data, dioceses_urls = [], set()
    with Pool() as pool:
        # get all dioceses urls:
        for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
            dioceses_urls.update(urls)

        # get info about all dioceses:
        for info in pool.imap_unordered(get_diocese_info, dioceses_urls):
            all_data.append(info)

    # create dataframe from the info about dioceses
    df = pd.DataFrame(all_data).sort_values("Title 1")

    # save it to csv file
    df.to_csv("data.csv", index=False)
    print(df.head().to_markdown())

update: well see what i get back if i run the script on colab:

https://www.catholic-hierarchy.org/diocese/laa.htmlhttps://www.catholic-hierarchy.org/diocese/lab.html

---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "<ipython-input-1-f5ea34a0190f>", line 21, in get_dioceses_urls
    next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
  File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1403, in select_one
    value = self.select(selector, limit=1)
  File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1528, in select
    'Only the following pseudo-classes are implemented: nth-of-type.')
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
"""

The above exception was the direct cause of the following exception:

NotImplementedError                       Traceback (most recent call last)
<ipython-input-1-f5ea34a0190f> in <module>
     81     with Pool() as pool:
     82         # get all dioceses urls:
---> 83         for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
     84             dioceses_urls.update(urls)
     85 

/usr/lib/python3.7/multiprocessing/pool.py in next(self, timeout)
    746         if success:
    747             return value
--> 748         raise value
    749 
    750     __next__ = next                    # XXX

NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.

Solution

  • problem with running script on google colab is that it currently only supports python 3.7, which doesn't support the newest version of beautifulsoup, so your a:has operator is not supported, i have replaced it with a loop on all a tags, which is slightly slower but the code works on google colab, and there is no need to remove multprocessing, but if you do need to remove multiprocessing then you should convert your functions into corountines and run them as tasks using asyncio as suggested by @Barry the Platipus.

    def get_dioceses_urls(section_url):
        dioceses_urls = set()
    
        while True:
            print(section_url)
    
            soup = BeautifulSoup(
                requests.get(section_url, headers=headers).content, "lxml"
            )
            for a in soup.select('ul a[href^="d"]'):
                dioceses_urls.add(
                    "https://www.catholic-hierarchy.org/diocese/" + a["href"]
                )
    
            # is there Next Page button?
            next_page = None
            for a in soup.find_all('a'):
                if a.img:
                    if a.img["alt"] == "[Next Page]":
                      next_page = a
                      break
            if next_page:
                section_url = (
                    "https://www.catholic-hierarchy.org/diocese/"
                    + next_page["href"]
                )
            else:
                break
    
        return dioceses_urls