python dataframe html-table beautifulsoup html-parsing

Extracting table tag values using BeautifulSoup in Python?

I'm trying to write a Python script to extract some tag values from within a table located on this page: https://azure.microsoft.com/en-us/pricing/details/virtual-machines/windows/

I've included a screenshot of the HTML source code, however I can't figure out how to extract the price data for columns 6,7,8 and 9. Below is the code I've already written.

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://azure.microsoft.com/en-us/pricing/details/virtual-machines/windows/'

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

table1 = soup.find_all('table', class_= 'sd-table')

#writing the first few columns to text file

with open('examplefile.txt', 'w') as r:
    for row in table1.find_all('tr'):
        for cell in row.find_all('td'):
            r.write(cell.text.ljust(5))
        r.write('\n')

I'm ultimately trying to extract out all of the values for each row and save it into a Pandas DataFrame, or a CSV. Thanks.

Solution

The table values seem to be embedded in a JSON string that can be gotten with json.loads. Then we can get the value by indicating a "regional" key of country's region.

It's a bit more complicated but at least it gets the values that we put into a dataframe as follows:

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
import numpy as np

# force maximum dataframe column width
pd.set_option('display.max_colwidth', 0)

url = 'https://azure.microsoft.com/en-us/pricing/details/virtual-machines/windows/'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
tables = soup.find_all('div', {'class': 'row row-size3 column'})

region = 'us-west-2' # Adjust your region here

def parse_table_as_dataframe(table):
    data = []
    header = []
    c5 = c6 = c7 = c8 = []

    rows = []
    columns = []

    name = table.h3.text

    try:
        # This part gets the first word in each column header so the table
        # fits reasonably in the display, adjust to your preference 
        header = [h.text.split()[0].strip() for h in table.thead.find_all('th')][1::]
    except AttributeError:
        return 'N/A'

    for row in table.tbody.find_all('tr'):
        for c in row.find_all('td')[1::]:
            if c.text.strip() not in (u'', u'$-') :
                if 'dash' in c.text.strip():
                    columns.append('-') # replace "&dash; &dash:" with a `-`
                else:
                    columns.append(c.text.strip())  
            else:
                try:
                    data_text = c.span['data-amount']
                    # data = json.loads(data_text)['regional']['asia-pacific-southeast']
                    data = json.loads(data_text)['regional'][region]
                    columns.append(data)
                except (KeyError, TypeError):
                    columns.append('N/A')



    num_rows = len(table.tbody.find_all('tr'))
    num_columns = len(header)

    # For debugging
    # print(len(columns), columns)
    # print(num_rows, num_columns)

    df = pd.DataFrame(np.array(columns).reshape(num_rows, num_columns), columns=header)
    return df

for n, table in enumerate(tables):
    print(n, table.h3.text)
    print(parse_table_as_dataframe(table))

gets 24 dataframes, one for each table from the page:

0 B-series
  Instance Core        RAM Temporary    Pay      One    Three        3
0  B1S      1    1.00 GiB   2 GiB     0.017  0.01074  0.00838  0.00438
1  B2S      2    4.00 GiB   8 GiB     0.065  0.03483  0.02543  0.01743
2  B1MS     1    2.00 GiB   4 GiB     0.032  0.01747  0.01271  0.00871
3  B2MS     2    8.00 GiB   16 GiB    0.122  0.06165  0.04289  0.03489
4  B4MS     4    16.00 GiB  32 GiB    0.229  0.12331  0.08579  0.06979
5  B8MS     8    32.00 GiB  64 GiB    0.438  0.24661  0.17157  0.13957

...

...

23 H-series
  Instance Core         RAM  Temporary    Pay      One    Three        3
0  H8       8    56.00 GiB   1,000 GiB  1.129  0.90579  0.72101  0.35301
1  H16      16   112.00 GiB  2,000 GiB  2.258  1.81168  1.44205  0.70605
2  H8m      8    112.00 GiB  1,000 GiB  1.399  1.08866  0.84106  0.47306
3  H16m     16   224.00 GiB  2,000 GiB  2.799  2.17744  1.68212  0.94612
4  H16mr    16   224.00 GiB  2,000 GiB  3.012  2.32162  1.77675  1.04075
5  H16r     16   112.00 GiB  2,000 GiB  2.417  1.91933  1.51267  0.77667