Search code examples
pythoncsvweb-scrapingbeautifulsoupurllib

PYTHON - BEAUTIFULSOUP how to scrape empty TD(table data) as an empty value instead of skipping it


I would like to scrape a webpage into a 4-column CSV file. Some of the table cells contain no data, and I would like to write those out as empty cell values instead of having them skipped, which is what happens with .text. I also tried .string, but it gives me TypeError: can only concatenate str (not "NoneType") to str. I would also like the lookup to be dynamic: if a <td> contains an <a href>, append the <a> tag's text; if not, append what is in the <td>; and if the <td> has no data, write it out as an empty value (or the text "None"). You can see an HTML sample below.

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Page to scrape (placeholder URL).
my_url = 'https://www.example.com'

# opening up connection, grabbing the page
uClient = uReq(my_url)

# Parse the downloaded bytes with the lxml parser, then close the connection.
page_soup = soup(uClient.read(), "lxml")
uClient.close()

# containers = page_soup.find("table", {"class": "typetable"}).find_all("tr",{"class":"typetable"})
# container = containers[0]

# Collect every <tr class="typetable"> row, searching from the page root
# rather than inside one specific table (compare the commented-out variant
# above).
containers = page_soup.find_all("tr", {"class": "typetable"})

# print(containers.td)

# NOTE(review): `tds` is never read or written again in the code shown here.
tds = []

out_filename = "output.csv"

# CSV header line; note the trailing space after "Description".
headers = "Parameter,Type_Value,Cardinality,Description \n"

# The file is opened without a `with` block and closed by hand at the very
# end, so an exception raised inside the loop leaves the handle open.
f = open(out_filename, "w")
f.write(headers)

# NOTE(review): these four names start out as empty lists but are rebound to
# strings on every loop iteration below; nothing is ever appended to the lists.
parameter = []
type_value = []
cardinality = []
description = []

for container in containers:

    # The first four <td> cells of the row are read by position, with a
    # separate lookup per cell; a row with fewer than four cells raises
    # IndexError. (`findAll` is the older spelling of `find_all`.)
    # NOTE(review): for a cell holding only &nbsp;, `.text` presumably yields
    # the non-breaking space itself rather than '' -- confirm against bs4 docs.
    parameter = container.findAll('td')[0].text
 
    type_value = container.find_all('td')[1].text

    cardinality = container.find_all('td')[2].text

    description = container.find_all('td')[3].text


    # Debug output. The `+` concatenation requires every value to be a str,
    # which is where swapping `.text` for `.string` fails with the TypeError
    # quoted in the question when a cell yields None.
    print("parameter: " + parameter + "\n")
    print("type_value: " + type_value + "\n")
    print("cardinality: " + cardinality + "\n")
    print("description: " + description + "\n")

    #f.write(parameter + ', ' + type_value + ', ' + cardinality + ', "' + description + ' "\n')
    # The row is assembled by hand: values are written as-is (no stripping and
    # no escaping of embedded commas or quotes), and the description is always
    # wrapped in double quotes, so an empty description is written as "".
    f.write(f'{parameter},{str(type_value)},{cardinality},"{description}"\n')

f.close()

Here is an example html:

<tr class="typetable">
  <td>Data 1&nbsp;</td>
  <td>Data 2&nbsp;</td>
  <td>&nbsp;</td>
  <td>Data 4&nbsp;</td>
</tr>
<tr class="typetable">
  <td>Data 10&nbsp;</td>
  <td>
     <a href="#2ndPage">2ndPage</a>"&nbsp;"
  </td>
  <td>Data 3&nbsp;</td>
  <td>&nbsp;</td>
</tr>

I would like the output to show

Parameter,Type_Value,Cardinality,Description
Data 1,Data 2,,"Data 4"
Data 10,2ndPage,Data 3,

I have been testing and looking up samples on Stack Overflow for weeks now :( Please help. Thanks in advance!


Solution

  • You can use this script to extract data from the table:

    import csv
    from bs4 import BeautifulSoup
    
    
    # Sample markup from the question: two rows of four cells each, where some
    # cells hold only &nbsp; and one cell wraps its value in an <a> link.
    txt = '''<tr class="typetable">
      <td>Data 1&nbsp;</td>
      <td>Data 2&nbsp;</td>
      <td>&nbsp;</td>
      <td>Data 4&nbsp;</td>
    </tr>
    <tr class="typetable">
      <td>Data 10&nbsp;</td>
      <td>
         <a href="#2ndPage">2ndPage</a>"&nbsp;"
      </td>
      <td>Data 3&nbsp;</td>
      <td>&nbsp;</td>
    </tr>'''
    
    soup = BeautifulSoup(txt, 'html.parser')
    
    # Value written for a cell that has no text. Keep '' for an empty CSV
    # field, or change it to 'None' to write a visible placeholder instead.
    empty_value = ''
    
    # Build one list of cell strings per <tr class="typetable"> row.
    all_data = []
    for row in soup.select('tr.typetable'):
        # Use the link text when the cell contains an <a>, otherwise the
        # cell's own text. strip=True trims the surrounding whitespace, so a
        # cell that holds only &nbsp; becomes '' (and then `empty_value`)
        # instead of being dropped from the row.
        tds = [
            (td.a or td).get_text(strip=True) or empty_value
            for td in row.select('td')
        ]
        all_data.append(tds)
    
    
    # newline='' lets the csv module control line endings itself; the explicit
    # encoding keeps non-ASCII page text from depending on the platform default.
    with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        # csv.writer's defaults already are delimiter=',', quotechar='"' and
        # quoting=csv.QUOTE_MINIMAL, so they are not repeated here.
        writer = csv.writer(csvfile)
        writer.writerow(['Parameter','Type_Value','Cardinality','Description'])
        writer.writerows(all_data)
    

    Writes this data.csv:

    Parameter,Type_Value,Cardinality,Description
    Data 1,Data 2,,Data 4
    Data 10,2ndPage,Data 3,