I'm trying to write a Python script to extract some tag values from within a table located on this page: https://azure.microsoft.com/en-us/pricing/details/virtual-machines/windows/
I've included a screenshot of the HTML source code, however I can't figure out how to extract the price data for columns 6,7,8 and 9. Below is the code I've already written.
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://azure.microsoft.com/en-us/pricing/details/virtual-machines/windows/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table1 = soup.find_all('table', class_= 'sd-table')
#writing the first few columns to text file
with open('examplefile.txt', 'w') as r:
for row in table1.find_all('tr'):
for cell in row.find_all('td'):
r.write(cell.text.ljust(5))
r.write('\n')
I'm ultimately trying to extract out all of the values for each row and save it into a Pandas DataFrame, or a CSV. Thanks.
The table values seem to be embedded in a JSON string that can be gotten with json.loads. Then we can get the value by indicating a "regional"
key of country's region.
It's a bit more complicated but at least it gets the values that we put into a dataframe as follows:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
import numpy as np
# force maximum dataframe column width
pd.set_option('display.max_colwidth', 0)
url = 'https://azure.microsoft.com/en-us/pricing/details/virtual-machines/windows/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
tables = soup.find_all('div', {'class': 'row row-size3 column'})
region = 'us-west-2' # Adjust your region here
def parse_table_as_dataframe(table):
data = []
header = []
c5 = c6 = c7 = c8 = []
rows = []
columns = []
name = table.h3.text
try:
# This part gets the first word in each column header so the table
# fits reasonably in the display, adjust to your preference
header = [h.text.split()[0].strip() for h in table.thead.find_all('th')][1::]
except AttributeError:
return 'N/A'
for row in table.tbody.find_all('tr'):
for c in row.find_all('td')[1::]:
if c.text.strip() not in (u'', u'$-') :
if 'dash' in c.text.strip():
columns.append('-') # replace "‐ &dash:" with a `-`
else:
columns.append(c.text.strip())
else:
try:
data_text = c.span['data-amount']
# data = json.loads(data_text)['regional']['asia-pacific-southeast']
data = json.loads(data_text)['regional'][region]
columns.append(data)
except (KeyError, TypeError):
columns.append('N/A')
num_rows = len(table.tbody.find_all('tr'))
num_columns = len(header)
# For debugging
# print(len(columns), columns)
# print(num_rows, num_columns)
df = pd.DataFrame(np.array(columns).reshape(num_rows, num_columns), columns=header)
return df
for n, table in enumerate(tables):
print(n, table.h3.text)
print(parse_table_as_dataframe(table))
gets 24 dataframes, one for each table from the page:
0 B-series
Instance Core RAM Temporary Pay One Three 3
0 B1S 1 1.00 GiB 2 GiB 0.017 0.01074 0.00838 0.00438
1 B2S 2 4.00 GiB 8 GiB 0.065 0.03483 0.02543 0.01743
2 B1MS 1 2.00 GiB 4 GiB 0.032 0.01747 0.01271 0.00871
3 B2MS 2 8.00 GiB 16 GiB 0.122 0.06165 0.04289 0.03489
4 B4MS 4 16.00 GiB 32 GiB 0.229 0.12331 0.08579 0.06979
5 B8MS 8 32.00 GiB 64 GiB 0.438 0.24661 0.17157 0.13957
...
...
23 H-series
Instance Core RAM Temporary Pay One Three 3
0 H8 8 56.00 GiB 1,000 GiB 1.129 0.90579 0.72101 0.35301
1 H16 16 112.00 GiB 2,000 GiB 2.258 1.81168 1.44205 0.70605
2 H8m 8 112.00 GiB 1,000 GiB 1.399 1.08866 0.84106 0.47306
3 H16m 16 224.00 GiB 2,000 GiB 2.799 2.17744 1.68212 0.94612
4 H16mr 16 224.00 GiB 2,000 GiB 3.012 2.32162 1.77675 1.04075
5 H16r 16 112.00 GiB 2,000 GiB 2.417 1.91933 1.51267 0.77667