Tags: html, python-3.x, web-scraping, beautifulsoup, python-requests

Scraping text from a long element using Requests and bs4 (Python 3.8)


I'm using Python 3.8.5 on Ubuntu 20.04. How can I scrape the HTML directory listing at http://nemweb.com.au/Reports/Current/ (shown below) into a Pandas DataFrame?

Here is my current code:

import pandas as pd
import requests
from bs4 import BeautifulSoup  # the 'lxml' parser used below also requires lxml to be installed

response = requests.get('http://nemweb.com.au/Reports/Current/')
soup = BeautifulSoup(response.text, 'lxml')
names = soup.find('body')
print(
    f"Type = {type(names)}\n"
    f"Length = {len(names)}\n"
)
name_list = names.find('pre')
print(name_list.text)
for elem in name_list.text:
    # This iterates character by character, not entry by entry.
    print(elem)
# Do I need to use regex here?



Solution

  • If you want a DataFrame, you might want to try this (no regex needed; plain string splitting is enough):

    By the way, this works with any report URL under nemweb.com.au/Reports/Current/.

    Note: I'm using .head(10) to show only the first 10 rows of the DataFrame.
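
    The page itself is a plain-text IIS directory listing inside a <pre> tag: each entry is exactly 8 whitespace-separated tokens (four for the date, two for the time, one for the type, one for the name). That is why the code below chunks the text into groups of 8 and star-unpacks each group; here is a minimal sketch of that unpacking, using a sample row taken from the output further down:

    row = ["Monday,", "April", "5,", "2021", "8:00", "AM", "<dir>", "Alt_Limits"]
    *date, hour, am, type_, _ = row  # *date soaks up the leading 4 tokens
    print(" ".join(date), f"{hour} {am}", type_)
    # -> Monday, April 5, 2021 8:00 AM <dir>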

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    from tabulate import tabulate
    
    # Column names for the output DataFrame.
    headers = ["Date", "Time", "Type", "URL"]
    
    
    def make_soup(catalog_url: str):
        """Fetch the report page and parse it with the lxml backend."""
        return BeautifulSoup(requests.get(catalog_url).text, "lxml")
    
    
    def process_soup(soup: BeautifulSoup) -> tuple:
        # Drop the first 8 tokens (the listing's header text); everything
        # that follows is exactly 8 whitespace-separated tokens per entry.
        text = soup.getText().split()[8:]
        # Skip the first link, which points back to the parent directory.
        follow_urls = [a["href"] for a in soup.find_all("a", href=True)[1:]]
        # Re-group the flat token list into one 8-token row per entry.
        catalog = [text[i:i + 8] for i in range(0, len(text), 8)]
        return follow_urls, catalog
    
    
    def build_dataframe(processed_soup: tuple) -> pd.DataFrame:
        follow_urls, catalog = processed_soup
        frame = []
        for index, item in enumerate(catalog):
            # The first 4 tokens form the date; the raw name (last token)
            # is discarded in favour of the full href.
            *date, hour, am, type_, _ = item
            frame.append(
                [
                    " ".join(date),
                    f"{hour} {am}",
                    type_,
                    f"http://nemweb.com.au{follow_urls[index]}",
                ]
            )
        return pd.DataFrame(frame, columns=headers)
    
    
    def dump_to_csv(dataframe: pd.DataFrame, file_name: str = "default_name"):
        dataframe.to_csv(f"{file_name}.csv", index=False)
        print(f"File {file_name} saved!")
    
    
    if __name__ == "__main__":
        target_url = "http://nemweb.com.au/Reports/Current/"
        df = build_dataframe(process_soup(make_soup(target_url)))
        print(tabulate(df.head(10), headers=headers, showindex=False, tablefmt="pretty"))
        dump_to_csv(df, file_name=target_url.rsplit("/")[-2])
    

    Output:

    +-----------------------------+----------+-------+-------------------------------------------------------------------+
    |            Date             |   Time   | Type  |                                URL                                |
    +-----------------------------+----------+-------+-------------------------------------------------------------------+
    |   Saturday, April 3, 2021   | 9:50 AM  | <dir> |   http://nemweb.com.au/Reports/Current/Adjusted_Prices_Reports/   |
    |    Monday, April 5, 2021    | 8:00 AM  | <dir> |         http://nemweb.com.au/Reports/Current/Alt_Limits/          |
    |    Monday, April 5, 2021    | 1:12 AM  | <dir> | http://nemweb.com.au/Reports/Current/Ancillary_Services_Payments/ |
    |    Monday, April 5, 2021    | 11:30 AM | <dir> |    http://nemweb.com.au/Reports/Current/Auction_Units_Reports/    |
    |    Monday, April 5, 2021    | 4:43 AM  | <dir> |      http://nemweb.com.au/Reports/Current/Bidmove_Complete/       |
    |   Thursday, April 1, 2021   | 4:44 AM  | <dir> |       http://nemweb.com.au/Reports/Current/Bidmove_Summary/       |
    | Wednesday, December 2, 2020 | 10:44 AM | <dir> |           http://nemweb.com.au/Reports/Current/Billing/           |
    |    Monday, April 5, 2021    | 7:40 AM  | <dir> |         http://nemweb.com.au/Reports/Current/Causer_Pays/         |
    | Thursday, February 4, 2021  | 9:10 PM  | <dir> |    http://nemweb.com.au/Reports/Current/Causer_Pays_Elements/     |
    |  Monday, November 28, 2016  | 7:50 PM  | <dir> |     http://nemweb.com.au/Reports/Current/Causer_Pays_Rslcpf/      |
    +-----------------------------+----------+-------+-------------------------------------------------------------------+
    File Current saved!
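
    And since nothing here is specific to the top-level page, you can point target_url at any of the listed subdirectories too, e.g. Alt_Limits from the table above. (One assumption I haven't verified for file listings: the Type column would then hold the file size rather than <dir>.)

    target_url = "http://nemweb.com.au/Reports/Current/Alt_Limits/"
    df = build_dataframe(process_soup(make_soup(target_url)))
    dump_to_csv(df, file_name=target_url.rsplit("/")[-2])  # writes Alt_Limits.csv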