Tags: python, url, web-scraping, beautifulsoup, wiki

Extracting URLs from Wikipedia tables


I am using the code below, taken from here, to scrape a Wikipedia table. In addition to the table text, I would like to visit the hyperlink for each city/country and copy the text from each of those pages. Is there a way to do that with BeautifulSoup?

# -*- coding: utf-8 -*-
"""
Scrape a table from wikipedia using python. Allows for cells spanning multiple rows and/or columns. Outputs csv files for
each table
"""

from bs4 import BeautifulSoup
import urllib.request
import os
import codecs

# wiki = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
wiki = 'https://en.wikipedia.org/wiki/List_of_national_capitals_in_alphabetical_order'
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
req = urllib.request.Request(wiki, headers=header)
page = urllib.request.urlopen(req)
soup = BeautifulSoup(page, "html.parser")

tables = soup.findAll("table", {"class": "wikitable"})

# show tables
# for table in tables:
#     print("###############")
#     print(table)#.text)#[:100])

for tn in range(len(tables)):
    table = tables[tn]

    # preinit list of lists
    rows = table.findAll("tr")
    row_lengths = [len(r.findAll(['th', 'td'])) for r in rows]
    ncols = max(row_lengths)
    nrows = len(rows)
    data = [['' for _ in range(ncols)] for _ in range(nrows)]

    # process html
    for i in range(len(rows)):
        row = rows[i]
        cells = row.findAll(["td", "th"])
        for j in range(len(cells)):
            cell = cells[j]

            # lots of cells span columns and/or rows, so copy the cell text
            # into every grid position the span covers
            cspan = int(cell.get('colspan', 1))
            rspan = int(cell.get('rowspan', 1))
            for k in range(rspan):
                for l in range(cspan):
                    if i + k < nrows and j + l < ncols:  # guard against spans running past the grid
                        data[i + k][j + l] += cell.text

    # write data out
    page = os.path.split(wiki)[1]
    fname = 'output_{}_t{}.csv'.format(page, tn)
    f = codecs.open(fname, 'w', encoding='utf-8')
    for i in range(nrows):
        rowStr = ','.join(data[i])
        rowStr = rowStr.replace('\n', '')
        f.write(rowStr + '\n')

    f.close()
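
As an aside, joining cells with ',' produces a broken CSV row whenever a cell's text itself contains a comma. A minimal sketch of the write-out step using the standard csv module instead, assuming the same data, nrows and fname variables as in the script above:

import csv

with open(fname, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)  # quotes fields containing commas automatically
    for i in range(nrows):
        writer.writerow(cell.replace('\n', '') for cell in data[i])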

Solution

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

wiki_url = 'https://en.wikipedia.org/wiki/List_of_national_capitals_in_alphabetical_order'
print('Fetching main wiki article: %s' % wiki_url)
page = requests.get(wiki_url).text
print('Done. Extracting table links..')
html = BeautifulSoup(page, 'html.parser')
table = html.find('table', 'wikitable')
links = table.findAll('a')
links_content = {}
print('Done extracting links. About to fetch: %s links..' % len(links))
for link in links:
    href = link.get('href')
    if not href:
        continue  # skip anchors with no href attribute
    url = urljoin(wiki_url, href)  # hrefs in the table are relative, e.g. /wiki/Kabul
    print('Fetching: %s' % url)
    links_content[url] = requests.get(url).text
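
Since the goal is to copy the text of each page rather than its raw HTML, each fetched document can be parsed again with BeautifulSoup. A minimal sketch, assuming the links_content dict built above; mw-content-text is the id Wikipedia uses for the main article body:

for url, page_html in links_content.items():
    page_soup = BeautifulSoup(page_html, 'html.parser')
    body = page_soup.find('div', {'id': 'mw-content-text'})  # main article body
    if body is not None:
        print(url)
        print(body.get_text()[:200])  # first 200 characters of the page text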