Tags: html, python-3.x, web-scraping, beautifulsoup, python-requests

Scraping text from a long element using Requests and bs4 (Python 3.8)


I'm using Python 3.8.5 on Ubuntu 20.04. How can I scrape the HTML directory listing at http://nemweb.com.au/Reports/Current/ (shown below) into a Pandas DataFrame?

Here is my current code:

import pandas as pd
import requests
from bs4 import BeautifulSoup  # the 'lxml' parser used below also requires lxml to be installed

response = requests.get('http://nemweb.com.au/Reports/Current/')
soup = BeautifulSoup(response.text, 'lxml')
names = soup.find('body')
print(
    f"Type = {type(names)}\n"
    f"Length = {len(names)}\n"
)
name_list = names.find('pre')
print(name_list.text)
for elem in name_list.text:
    # This iterates character by character, not entry by entry.
    print(elem)
# Do I need to use regex here?



Solution

  • If you want a DataFrame, you might want to try this (no regex needed; plain string splitting is enough):

    By the way, this works with any report URL under nemweb.com.au/Reports/Current/.

    Note: I'm using .head(10) to show only the first 10 rows of the DataFrame.
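
    The page itself is a plain-text IIS directory listing inside a <pre> tag: each entry is exactly 8 whitespace-separated tokens (four for the date, two for the time, one for the type, one for the name). That is why the code below chunks the text into groups of 8 and star-unpacks each group; here is a minimal sketch of that unpacking, using a sample row taken from the output further down:

    row = ["Monday,", "April", "5,", "2021", "8:00", "AM", "<dir>", "Alt_Limits"]
    *date, hour, am, type_, _ = row  # *date soaks up the leading 4 tokens
    print(" ".join(date), f"{hour} {am}", type_)
    # -> Monday, April 5, 2021 8:00 AM <dir>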

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    from tabulate import tabulate
    
    # Column names for the output DataFrame.
    headers = ["Date", "Time", "Type", "URL"]
    
    
    def make_soup(catalog_url: str):
        """Fetch the report page and parse it with the lxml backend."""
        return BeautifulSoup(requests.get(catalog_url).text, "lxml")
    
    
    def process_soup(soup: BeautifulSoup) -> tuple:
        # Drop the first 8 tokens (the listing's header text); everything
        # that follows is exactly 8 whitespace-separated tokens per entry.
        text = soup.getText().split()[8:]
        # Skip the first link, which points back to the parent directory.
        follow_urls = [a["href"] for a in soup.find_all("a", href=True)[1:]]
        # Re-group the flat token list into one 8-token row per entry.
        catalog = [text[i:i + 8] for i in range(0, len(text), 8)]
        return follow_urls, catalog
    
    
    def build_dataframe(processed_soup: tuple) -> pd.DataFrame:
        follow_urls, catalog = processed_soup
        frame = []
        for index, item in enumerate(catalog):
            # The first 4 tokens form the date; the raw name (last token)
            # is discarded in favour of the full href.
            *date, hour, am, type_, _ = item
            frame.append(
                [
                    " ".join(date),
                    f"{hour} {am}",
                    type_,
                    f"http://nemweb.com.au{follow_urls[index]}",
                ]
            )
        return pd.DataFrame(frame, columns=headers)
    
    
    def dump_to_csv(dataframe: pd.DataFrame, file_name: str = "default_name"):
        dataframe.to_csv(f"{file_name}.csv", index=False)
        print(f"File {file_name} saved!")
    
    
    if __name__ == "__main__":
        target_url = "http://nemweb.com.au/Reports/Current/"
        df = build_dataframe(process_soup(make_soup(target_url)))
        print(tabulate(df.head(10), headers=headers, showindex=False, tablefmt="pretty"))
        dump_to_csv(df, file_name=target_url.rsplit("/")[-2])
    

    Output:

    +-----------------------------+----------+-------+-------------------------------------------------------------------+
    |            Date             |   Time   | Type  |                                URL                                |
    +-----------------------------+----------+-------+-------------------------------------------------------------------+
    |   Saturday, April 3, 2021   | 9:50 AM  | <dir> |   http://nemweb.com.au/Reports/Current/Adjusted_Prices_Reports/   |
    |    Monday, April 5, 2021    | 8:00 AM  | <dir> |         http://nemweb.com.au/Reports/Current/Alt_Limits/          |
    |    Monday, April 5, 2021    | 1:12 AM  | <dir> | http://nemweb.com.au/Reports/Current/Ancillary_Services_Payments/ |
    |    Monday, April 5, 2021    | 11:30 AM | <dir> |    http://nemweb.com.au/Reports/Current/Auction_Units_Reports/    |
    |    Monday, April 5, 2021    | 4:43 AM  | <dir> |      http://nemweb.com.au/Reports/Current/Bidmove_Complete/       |
    |   Thursday, April 1, 2021   | 4:44 AM  | <dir> |       http://nemweb.com.au/Reports/Current/Bidmove_Summary/       |
    | Wednesday, December 2, 2020 | 10:44 AM | <dir> |           http://nemweb.com.au/Reports/Current/Billing/           |
    |    Monday, April 5, 2021    | 7:40 AM  | <dir> |         http://nemweb.com.au/Reports/Current/Causer_Pays/         |
    | Thursday, February 4, 2021  | 9:10 PM  | <dir> |    http://nemweb.com.au/Reports/Current/Causer_Pays_Elements/     |
    |  Monday, November 28, 2016  | 7:50 PM  | <dir> |     http://nemweb.com.au/Reports/Current/Causer_Pays_Rslcpf/      |
    +-----------------------------+----------+-------+-------------------------------------------------------------------+
    File Current saved!
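
    And since nothing here is specific to the top-level page, you can point target_url at any of the listed subdirectories too, e.g. Alt_Limits from the table above. (One assumption I haven't verified for file listings: the Type column would then hold the file size rather than <dir>.)

    target_url = "http://nemweb.com.au/Reports/Current/Alt_Limits/"
    df = build_dataframe(process_soup(make_soup(target_url)))
    dump_to_csv(df, file_name=target_url.rsplit("/")[-2])  # writes Alt_Limits.csv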