Search code examples
python count beautifulsoup finance edgar

Pythonic counting and scraping to skip first table if two tables exist


I'm trying to get .xml data from SEC filings. It's in the second table. But if I get to a page that doesn't have the .xml, I want the HTML version, which is in the first and only table. Could someone please help me understand how to skip the first table if there are two, and how to get the first a['href'] from the first table if only one is present?

from urllib2 import urlopen
import requests
from bs4 import BeautifulSoup
# Asker's attempt, reproduced as posted: for each filing index page, pick a link
# out of the elements with class 'tableFile'.
tableCount = 0
# NOTE(review): the URLs below are not quoted, so this line is not valid Python as posted.
linklist = [https://www.sec.gov/Archives/edgar/data/1070789/000149315217011092/0001493152-17-011092-index.htm, https://www.sec.gov/Archives/edgar/data/1592603/000139160917000254/0001391609-17-000254-index.htm]
for l in linklist:
# NOTE(review): the loop body below appears to have lost its indentation in the paste.
html = urlopen(l)
soup = BeautifulSoup(html.read().decode('latin-1', 'ignore'),"lxml")    
table = soup.findAll(class_='tableFile') # works for getting all .htm links
# Counts the matched elements; tableCount is not read again in this snippet.
for item in table:
    tableCount +=1
url = table[0].a["href"]
# NOTE(review): table.count is a method, not a number -- presumably len(table) was
# intended, and indexing table[1] needs at least two matches, not one.
if table.count >= 1:
    url = table[1].a["href"]
else:
    # NOTE(review): table is the whole findAll result, so this presumably
    # should be table[0].a -- confirm.
    url = table.a["href"]

Solution

  • You always need the info from the last table in both cases, so you can use index -1 of the list to get the last table:

    import requests
    from bs4 import BeautifulSoup

    # EDGAR filing index pages to scan for document links.
    urls = ['https://www.sec.gov/Archives/edgar/data/1070789/000149315217011092/0001493152-17-011092-index.htm',
            'https://www.sec.gov/Archives/edgar/data/1592603/000139160917000254/0001391609-17-000254-index.htm']

    # SEC's EDGAR access guidance asks automated clients to declare a User-Agent;
    # replace this placeholder with your own name and contact address.
    headers = {'User-Agent': 'Sample Company Name admin@example.com'}

    for url in urls:
        # The timeout keeps a stalled connection from hanging the script, and
        # raise_for_status() stops an HTTP error page from being parsed as if
        # it were a filing index.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        tables = soup.find_all('table', class_='tableFile')

        # Without this guard, tables[-1] raises IndexError on a page that has
        # no matching table at all.
        if not tables:
            print('no tableFile table found: ' + url)
            continue

        # assume xml table always comes after html one, so the last table is
        # the xml one when two exist and the html one when it is alone
        table = tables[-1]
        # href=True skips anchors that have no href attribute (avoids KeyError)
        for a in table.find_all('a', href=True):
            print(a['href'])  # you may filter out txt or xsd here