I'm using Python 3.8.5 on Ubuntu 20.04. How can I scrape the HTML shown below (from the linked page) into a pandas DataFrame?
Here is my current code:
import pathlib
import sys
import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Fetch the NEM current-reports index page and inspect its <pre> listing.
resp = requests.get('http://nemweb.com.au/Reports/Current/')
soup = BeautifulSoup(resp.text, 'lxml')
names = soup.find('body')
print(
    f"Type = {type(names)}\n"
    f"Length = {len(names)}\n"
)
name_list = names.find('pre')
print(name_list.text)
# NOTE(review): iterating a string yields one character at a time, so this
# prints the listing a single character per line.
for ch in name_list.text:
    print(ch)
# Do I need to use regex here?
If you want a DataFrame, you might want to try this:
By the way, this works with any report URL from nemweb.com.au, such as /Reports/Current/.
Note: I'm using .head(10) to show only the first 10 rows of the resulting DataFrame.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
# Column labels shared by build_dataframe() and the tabulate printout in __main__.
headers = ["Date", "Time", "Type", "URL"]
def make_soup(catalog_url: str):
    """Download *catalog_url* and return its body parsed with the lxml backend."""
    response = requests.get(catalog_url)
    return BeautifulSoup(response.text, "lxml")
def process_soup(soup: BeautifulSoup) -> tuple:
    """Split the directory-listing page into (link hrefs, 8-token rows).

    Drops the first 8 whitespace-separated tokens (page header text) and the
    first <a> tag (presumably the parent-directory link — verify against the
    live page), then chunks the remaining tokens into rows of eight.
    """
    tokens = soup.getText().split()[8:]
    anchors = soup.find_all("a", href=True)[1:]
    follow_urls = [anchor["href"] for anchor in anchors]
    catalog = []
    for start in range(0, len(tokens), 8):
        catalog.append(tokens[start:start + 8])
    return follow_urls, catalog
def build_dataframe(processed_soup: tuple) -> pd.DataFrame:
    """Assemble a Date/Time/Type/URL table from process_soup() output.

    Each catalog entry is eight tokens: a multi-word date, a clock time,
    an AM/PM marker, an entry type (e.g. ``<dir>``), and a trailing token
    that is discarded.
    """
    follow_urls, catalog = processed_soup
    rows = []
    for position, entry in enumerate(catalog):
        *date_words, clock, meridiem, kind, _unused = entry
        link = f"http://nemweb.com.au{follow_urls[position]}"
        rows.append([" ".join(date_words), f"{clock} {meridiem}", kind, link])
    return pd.DataFrame(rows, columns=headers)
def dump_to_csv(dataframe: pd.DataFrame, file_name: str = "default_name"):
    """Write *dataframe* to ``<file_name>.csv`` (no index column) and confirm."""
    target = f"{file_name}.csv"
    dataframe.to_csv(target, index=False)
    print(f"File {file_name} saved!")
if __name__ == "__main__":
target_url = "http://nemweb.com.au/Reports/Current/"
df = build_dataframe(process_soup(make_soup(target_url)))
print(tabulate(df.head(10), headers=headers, showindex=False, tablefmt="pretty"))
dump_to_csv(df, file_name=target_url.rsplit("/")[-2])
Output:
+-----------------------------+----------+-------+-------------------------------------------------------------------+
| Date | Time | Type | URL |
+-----------------------------+----------+-------+-------------------------------------------------------------------+
| Saturday, April 3, 2021 | 9:50 AM | <dir> | http://nemweb.com.au/Reports/Current/Adjusted_Prices_Reports/ |
| Monday, April 5, 2021 | 8:00 AM | <dir> | http://nemweb.com.au/Reports/Current/Alt_Limits/ |
| Monday, April 5, 2021 | 1:12 AM | <dir> | http://nemweb.com.au/Reports/Current/Ancillary_Services_Payments/ |
| Monday, April 5, 2021 | 11:30 AM | <dir> | http://nemweb.com.au/Reports/Current/Auction_Units_Reports/ |
| Monday, April 5, 2021 | 4:43 AM | <dir> | http://nemweb.com.au/Reports/Current/Bidmove_Complete/ |
| Thursday, April 1, 2021 | 4:44 AM | <dir> | http://nemweb.com.au/Reports/Current/Bidmove_Summary/ |
| Wednesday, December 2, 2020 | 10:44 AM | <dir> | http://nemweb.com.au/Reports/Current/Billing/ |
| Monday, April 5, 2021 | 7:40 AM | <dir> | http://nemweb.com.au/Reports/Current/Causer_Pays/ |
| Thursday, February 4, 2021 | 9:10 PM | <dir> | http://nemweb.com.au/Reports/Current/Causer_Pays_Elements/ |
| Monday, November 28, 2016 | 7:50 PM | <dir> | http://nemweb.com.au/Reports/Current/Causer_Pays_Rslcpf/ |
+-----------------------------+----------+-------+-------------------------------------------------------------------+
File Current saved!