python, web-scraping, python-requests, urllib

How can I get requests to get more than the first table of a site?


import bs4 as bs
import urllib.request

# Fetch the page and parse it with the lxml parser
link = urllib.request.urlopen('https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx')
soup = bs.BeautifulSoup(link, 'lxml')

# find() returns only the first <table> on the page
table = soup.find('table')

table_rows = table.find_all('tr')

for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)

I am currently trying to make a simple website that displays only the number of COVID-19 cases in a certain US county. However, when I run my code, it only returns data from the first table on the site I am trying to scrape.

Any help would be greatly appreciated!


Solution

  • You are using soup.find('table'), which returns only the first match. Use soup.find_all() to get every table:

    import bs4 as bs
    import urllib.request
    
    link = urllib.request.urlopen('https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx')
    soup = bs.BeautifulSoup(link, 'lxml')
    
    # find_all() returns every <table> on the page, not just the first
    tables = soup.find_all('table')
    
    for table in tables:
    
        table_rows = table.find_all('tr')
    
        for tr in table_rows:
            td = tr.find_all('td')
            row = [i.text for i in td]
            print(row)
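
    As an aside, since the question title and tags mention the requests library: you can fetch the page with requests instead of urllib, and the parsing code stays the same. A minimal sketch:

    import bs4 as bs
    import requests

    # requests.get() returns a Response; .text holds the decoded HTML
    response = requests.get('https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx')
    soup = bs.BeautifulSoup(response.text, 'lxml')

    for table in soup.find_all('table'):
        for tr in table.find_all('tr'):
            print([td.text for td in tr.find_all('td')])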
    

    There is a lot of noise in the output, but you can filter it down by div id/class or by the table headers; see the sketch at the end of this answer.

    Output:

    ['\nTotal Cases*', '\u200bDeaths', '\n\u200bNegative']
    ['60,622', '4,342', '259,210']
    ['\n\u200bAge Range', '\u200bPercent of Cases*']
    ['\u200b0-4', '\u200b< 1%']
    ['\u200b5-12', '\u200b< 1%']
    ['\u200b13-18', '\u200b1%']
    ['\u200b19-24', '6%']
    ['\u200b25-49', '\u200b37%']
    ['\u200b50-64', '\u200b26%']
    ['\n\u200b65+', '\u200b28%']
    ['\n\u200bAge Range', '\u200bPercent of Cases']
    ['\u200b0-29', '\u200b2%']
    ['30-49', '\n5%']
    ['50-64', '10%']
    ['\u200b65-79', '20%']
    ['\u200b80+', '19%']
    ['County', 'Total Cases', 'Negatives\xa0', 'Deaths\xa0']
    ['Adams', '183', '2035', '6']
    ['Allegheny', '1582', '21626', '141']
    ['Armstrong', '57', '955', '5']
    ['Beaver', '516', '2663', '83']
    ['Bedford\xa0', '30', '429', '1']
    ['Berks', '3593', '8171', '207']
    ['Blair', '32', '1733', '0']
    ['Bradford', '41', '1030', '2']
    ['Bucks', '4325', '13245', '410']
    ['Butler', '202', '2958', '6']
    ['Cambria', '49', '2315', '1']
    ['Cameron', '2', '84', '0']
    ['Carbon', '206', '1575', '17']
    ['Centre', '129', '1413', '6']
    ['Chester', '2060', '8001', '218']
    ['Clarion', '24', '570', '1']
    ['Clearfield', '33', '721', '0']
    ['Clinton', '43', '396', '0']
    ['Columbia', '334', '980', '33']
    ['Crawford', '21', '811', '0']
    ['Cumberland', '492', '2745', '41']
    ['Dauphin', '912', '6987', '41']
    ['Delaware', '5409', '13810', '466']
    ['Elk', '6', '229', '1']
    ['Erie', '129', '2955', '3']
    ['Fayette', '87', '2370', '4']
    ['Forest', '7', '45', '0']
    ['Franklin', '580', '3932', '17']
    ['Fulton', '11', '144', '0']
    ['Greene', '27', '567', '1']
    ['Huntingdon', '204', '588', '0']
    ['Indiana', '84', '956', '6']
    ['Jefferson', '7', '397', '0']
    ['Juniata', '94', '243', '1']
    ['Lackawanna', '1273', '3994', '141']
    ['Lancaster', '2428', '11065', '186']
    ['Lawrence', '72', '960', '7']
    ['Lebanon', '863', '3501', '19']
    ['Lehigh', '3396', '10165', '136']
    ['Luzerne', '2491', '7587', '127']
    ['Lycoming', '141', '1607', '7']
    ['McKean', '10', '321', '1']
    ['Mercer', '83', '1027', '2']
    ['Mifflin', '57', '942', '0']
    ['Monroe', '1242', '3917', '70']
    ['Montgomery', '5697', '23738', '608']
    ['Montour', '50', '2986', '1']
    ['Northampton', '2600', '9196', '196']
    ['Northumberland', '136', '983', '0']
    ['Perry', '36', '447', '1']
    ['Philadelphia', '15835', '40378', '1004']
    ['Pike', '458', '1620', '22']
    ['Potter', '4', '109', '0']
    ['Schuylkill', '506', '3022', '15']
    ['Snyder', '33', '291', '2']
    ['Somerset', '32', '1129', '1']
    ['Sullivan', '2', '64', '0']
    ['Susquehanna', '82', '480', '14']
    ['Tioga', '16', '400', '1']
    ['Union', '44', '731', '1']
    ['Venango', '7', '344', '0']
    ['Warren', '2', '235', '0']
    ['Washington', '129', '2989', '4']
    ['Wayne', '107', '692', '7']
    ['Westmoreland', '423', '6671', '32']
    ['Wyoming', '28', '310', '3']
    ['York', '828', '9630', '16']
    ['Sex', 'Positive Cases\xa0', 'Percent of Cases*', 'Deaths\xa0']
    ['Female', '33,184', '55%', '2183']
    ['Male', '26,784', '44%', '2141']
    ['Neither', '3', '0%', '0']
    ['Not reported', '651', '1%', '18']
    ['Race', 'Positive Cases', 'Percent of Cases** ', 'Deaths\xa0']
    ['African American/Black', '7073', '12%', '498']
    ['Asian', '754', '1%', '53']
    ['White', '15,163', '25%', '1640']
    ['Other', '312', '1%', '14']
    ['Not reported', '37,320', '61%', '2137']
    ['Region', 'Positive', 'Negative', 'Inconclusive\xa0']
    ['Northcentral\xa0', '918', '10990', '16']
    ['Northeast', '11707', '39536', '135']
    ['Northwest', '337', '8699', '18']
    ['Southcentral', '4187', '33356', '72']
    ['Southeast', '39125', '121430', '854']
    ['Southwest', '3026', '45199', '36']
    ['Facility County', 'Number of Facilities with Cases', 'Number of Cases Among Residents', 'Number of Cases Among Employees', 'Number of Deaths']
    ['ADAMS', '3', '23', '4', '4']
    ['ALLEGHENY', '36', '393', '109', '110']
    ['ARMSTRONG', '1', '5', '6', '3']
    ['BEAVER', '3', '340', '26', '76']
    ['BERKS', '25', '710', '100', '139']
    ['BUCKS', '52', '1376', '320', '328']
    ['BUTLER', '6', '13', '10', '2']
    ['CAMBRIA', '1', '1', '.', '0']
    ['CARBON', '2', '58', '5', '13']
    ['CENTRE', '3', '16', '11', '4']
    ['CHESTER', '38', '677', '127', '185']
    ['CLARION', '1', '1', '1', '0']
    ['CLEARFIELD', '2', '2', '.', '0']
    ['COLUMBIA', '3', '95', '33', '29']
    ['CUMBERLAND', '8', '239', '57', '42']
    ['DAUPHIN', '3', '229', '49', '26']
    ['DELAWARE', '39', '1510', '273', '378']
    ['ERIE', '4', '3', '2', '0']
    ['FAYETTE', '1', '3', '.', '1']
    ['FRANKLIN', '5', '79', '13', '8']
    ['HUNTINGDON', '1', '0', '1', '0']
    ['INDIANA', '4', '14', '2', '4']
    ['LACKAWANNA', '15', '560', '96', '121']
    ['LANCASTER', '32', '608', '174', '163']
    ['LAWRENCE', '2', '0', '2', '0']
    ['LEBANON', '4', '81', '14', '13']
    ['LEHIGH', '26', '590', '122', '106']
    ['LUZERNE', '19', '404', '71', '93']
    ['LYCOMING', '3', '68', '12', '8']
    ['MERCER', '1', '1', '.', '0']
    ['MIFFLIN', '2', '1', '1', '0']
    ['MONROE', '8', '146', '44', '35']
    ['MONTGOMERY', '88', '2153', '110', '506']
    ['NORTHAMPTON', '14', '634', '154', '128']
    ['NORTHUMBERLAND', '1', '1', '.', '0']
    ['PHILADELPHIA', '59', '1614', '8', '410']
    ['PIKE', '2', '31', '5', '9']
    ['SCHUYLKILL', '10', '66', '19', '2']
    ['SUSQUEHANNA', '3', '42', '16', '14']
    ['UNION', '1', '0', '1', '0']
    ['WASHINGTON', '3', '6', '2', '1']
    ['WAYNE', '1', '0', '1', '0']
    ['WESTMORELAND', '9', '134', '34', '27']
    ['YORK', '6', '10', '4', '3']
    ['PENNSYLVANIA', '550', '12937', '2039', '2991']
    

    You can add a little more logic to split each table into its own dictionary or list.
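
    For example, here is a minimal sketch of that idea: it keys each table by its header row, which also filters out the noise mentioned above. The header strings ('County', 'Total Cases', ...) are taken from the output shown earlier and may change if the page is updated:

    import bs4 as bs
    import urllib.request

    link = urllib.request.urlopen('https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx')
    soup = bs.BeautifulSoup(link, 'lxml')

    # Map each table's header row to its data rows
    tables_by_header = {}
    for table in soup.find_all('table'):
        rows = []
        for tr in table.find_all('tr'):
            # Strip whitespace (including \xa0) and remove zero-width spaces
            row = [td.get_text(strip=True).replace('\u200b', '') for td in tr.find_all('td')]
            if row:
                rows.append(row)
        if rows:
            # The first row holds the column headers; use it as the lookup key
            tables_by_header[tuple(rows[0])] = rows[1:]

    # Pick out the per-county table by its header row (as seen in the output above)
    county_rows = tables_by_header.get(('County', 'Total Cases', 'Negatives', 'Deaths'), [])
    for county, cases, negatives, deaths in county_rows:
        print(f'{county}: {cases} cases, {deaths} deaths')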