Search code examples
python · python-3.x · web-scraping · wikipedia · wikipedia-api

Extracting table data from wikipedia API


I need to extract the table shown on the right side of the Wikipedia page of any politician.

I tried to use the wikipedia API for this purpose. But I was not able to extract the table data. The code I have tried so far is as follows :

import wikipedia
# Interactive-session transcript: the bare expressions below only show a
# value when typed at a REPL/notebook prompt.
# Use the first search hit as the page title for the remaining calls.
person = wikipedia.search("Rahul Gandhi")[0]
person # prints "Rahul Gandhi"
wikipedia.summary(person) # able to get summary
page = wikipedia.page(person)
page.url # prints "url"
# This is the problem being asked about: the returned content is missing
# the tables.
print(page.content) # prints complete content, but not the tables

I have also tried to scrape the tables, but it's difficult to obtain the data in a structured form.

import urllib3
import requests
from bs4 import BeautifulSoup
# Certificate verification is switched off for the request below, so silence
# the warning urllib3 would otherwise emit for it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = "https://en.wikipedia.org/wiki/Rahul_Gandhi"
session = requests.Session()
response = session.get(url, verify=False)
html = response.content
soup = BeautifulSoup(html, "lxml")

# The table whose class attribute is "infobox vcard", and all of its rows.
table = soup.find("table", {"class": "infobox vcard"})
info = table.findAll('tr')

# For every row, print the text of its first <th>, first <a> and first <td>
# (in that order), leaving out whichever of the three the row does not have.
for row in info:
    found = [row.find(tag_name) for tag_name in ('th', 'a', 'td')]
    content = [element.text for element in found if element]
    print(content)
# Output : 
['Rahul Gandhi']
['', 'Gandhi in May 2019']
['']
['President of the Indian National Congress', 'President of the Indian National Congress']
['In office16 December 2017\xa0– 10 August 2019']
['Preceded by', 'Sonia Gandhi', 'Sonia Gandhi']
['Succeeded by', 'Sonia Gandhi', 'Sonia Gandhi (Interim)']
['Member of Parliament, Lok Sabha', 'Member of Parliament, Lok Sabha']
['Incumbent', 'Incumbent']
['Assumed office 23 May 2019']
['Preceded by', 'M. I. Shanavas', 'M. I. Shanavas']
['Constituency', 'Wayanad', 'Wayanad, Kerala']
['In office17 May 2004\xa0– 23 May 2019']
['Preceded by', 'Sonia Gandhi', 'Sonia Gandhi']
['Succeeded by', 'Smriti Irani', 'Smriti Irani']
['Constituency', 'Amethi', 'Amethi, Uttar Pradesh']
['Vice-President of the Indian National Congress', 'Indian National Congress']
['In office19 January 2013\xa0– 16 December 2017']
['President', 'Sonia Gandhi', 'Sonia Gandhi']
['Preceded by', 'Position established']
['Succeeded by', 'Position abolished']
['General Secretary of Indian National Congress', 'Indian National Congress']
['In office25 September 2007\xa0– 19 January 2013']
['President', 'Sonia Gandhi', 'Sonia Gandhi']
['Chair of Indian Youth Congress', 'Indian Youth Congress']
['Incumbent', 'Incumbent']
['Assumed office 25 September 2007']
['Preceded by', 'Position established']
['Chair of National Students’ Union of India', 'National Students’ Union of India']
['Incumbent', 'Incumbent']
['Assumed office 25 September 2007']
['Preceded by', 'Position established']
['\n']
['Personal details']
['Born', ' (1970-06-19) 19 June 1970 (age\xa050)New Delhi, India']
['Political party', 'Indian National Congress', 'Indian National Congress']
['Parents', 'Rajiv Gandhi', 'Rajiv GandhiSonia Gandhi']
['Relatives', 'Nehru–Gandhi family', 'Nehru–Gandhi family']
['Education', "St. Stephen's College, Delhi", "St. Stephen's College, DelhiHarvard UniversityRollins College (BA)Trinity College, Cambridge (MPhil)"]
['Signature', '', '']
['Website', 'Official website', 'Official website']

Solution

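  • Using the tag name of the cells and the number of cells in each row, I created the following code, which builds a nested dictionary of sections, subsections and items: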

    import urllib3
    import requests
    from bs4 import BeautifulSoup
    import json
    
    # Download one Wikipedia page and convert its infobox (the summary table
    # with class "infobox vcard") into a nested dictionary, then print it as JSON.
    #
    # Rows are classified by their direct <th>/<td> cells:
    #   * one <th> cell   -> section:    content[text]
    #   * one <td> cell   -> subsection: content[section]['items'][text]
    #   * two cells       -> item:       <current (sub)section>['items'][label]
    #
    # Every node has the same fields: type, key, text, links, images, items.
    #
    # NOTE: header texts are used as dictionary keys, so a later section or
    # item with the same text replaces the earlier one at that level.

    url = 'https://en.wikipedia.org/wiki/Rahul_Gandhi'
    #url = 'https://en.wikipedia.org/wiki/Sonia_Gandhi'

    session = requests.Session()

    # Certificate verification stays at its default (enabled), so there is no
    # InsecureRequestWarning to suppress. The timeout keeps the script from
    # hanging on a dead connection, and raise_for_status() turns an HTTP error
    # page into an explicit error instead of a confusing "table not found".
    response = session.get(url, timeout=30)
    response.raise_for_status()
    html = response.content

    soup = BeautifulSoup(html, 'lxml')

    table = soup.find('table', {'class': 'infobox vcard'})
    if table is None:
        raise SystemExit('No table with class "infobox vcard" found at {}'.format(url))

    # ---

    def _clean(text):
        """Strip surrounding whitespace and replace non-breaking spaces."""
        # Inner '\n' characters are deliberately kept.
        return text.strip().replace('\u00a0', ' ')

    def _links(cell):
        """Return text/href/title for every <a> inside *cell*."""
        return [{
            'text': a.text,
            'href': a.get('href', ''),
            'title': a.get('title', ''),
        } for a in cell.find_all('a')]

    def _images(cell):
        """Return src/width/height/alt for every <img> inside *cell*."""
        return [{
            'src': img.get('src'),
            'width': img.get('width', ''),
            'height': img.get('height', ''),
            'alt': img.get('alt'),
        } for img in cell.find_all('img')]

    def _node(node_type, key, cell):
        """Build one dictionary node; text, links and images come from *cell*."""
        return {
            'type': node_type,
            'key': key,
            'text': _clean(cell.get_text()),  # not get_text(strip=True): keep '\n'
            'links': _links(cell),
            'images': _images(cell),
            'items': {},
        }

    content = {}
    header1 = None     # key of the section currently being filled
    header2 = None     # key of the subsection currently being filled
    current = content  # dict that receives the next item rows

    for row in table.find_all('tr'):

        # replace '<br>' with '\n' so multi-line cells keep their line breaks
        for br in row.find_all('br'):
            br.replace_with('\n')

        # Only direct <th>/<td> children count as cells; whitespace text nodes
        # between the tags must not change how the row is classified.
        cells = row.find_all(['th', 'td'], recursive=False)

        # headers/subheaders (sections/subsections)
        if len(cells) == 1:
            cell = cells[0]

            # skip empty rows
            if not cell.decode_contents().strip():
                continue

            section = _node('header', _clean(cell.get_text()), cell)

            if cell.name == 'th':
                # create header / section
                header1 = section['key']
                content[header1] = section
            else:
                # create subheader / subsection; if no section has been seen
                # yet (a <td>-only row at the very top), keep it at top level
                # instead of failing on content[None].
                header2 = section['key']
                parent = content[header1]['items'] if header1 is not None else content
                parent[header2] = section

            current = section['items']  # keep access to add items later

        # items in sections/subsections
        elif len(cells) == 2:
            label, value = cells
            key = _clean(label.get_text())
            current[key] = _node('item', key, value)

    print(json.dumps(content, indent=2))
    

    Which gives this:

    {
      "Rahul Gandhi": {
        "type": "header",
        "key": "Rahul Gandhi",
        "text": "Rahul Gandhi",
        "links": [],
        "images": [],
        "items": {
          "Gandhi in May 2019": {
            "type": "header",
            "key": "Gandhi in May 2019",
            "text": "Gandhi in May 2019",
            "links": [
              {
                "text": "",
                "href": "/wiki/File:Rahul_Gandhi,_Member_of_Parliament,_Wayanad,_Kerala.jpg",
                "title": ""
              }
            ],
            "images": [
              {
                "src": "//upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Rahul_Gandhi%2C_Member_of_Parliament%2C_Wayanad%2C_Kerala.jpg/220px-Rahul_Gandhi%2C_Member_of_Parliament%2C_Wayanad%2C_Kerala.jpg",
                "width": "220",
                "height": "293",
                "alt": "Rahul Gandhi, Member of Parliament, Wayanad, Kerala.jpg"
              }
            ],
            "items": {}
          }
        }
      },
      "President of the Indian National Congress": {
        "type": "header",
        "key": "President of the Indian National Congress",
        "text": "President of the Indian National Congress",
        "links": [
          {
            "text": "President of the Indian National Congress",
            "href": "/wiki/List_of_Presidents_of_the_Indian_National_Congress",
            "title": "List of Presidents of the Indian National Congress"
          }
        ],
        "images": [],
        "items": {
          "In office\n16 December 2017 \u2013 10 August 2019": {
            "type": "header",
            "key": "In office\n16 December 2017 \u2013 10 August 2019",
            "text": "In office\n16 December 2017 \u2013 10 August 2019",
            "links": [],
            "images": [],
            "items": {
              "Preceded by": {
                "type": "item",
                "key": "Preceded by",
                "text": "Sonia Gandhi",
                "links": [
                  {
                    "text": "Sonia Gandhi",
                    "href": "/wiki/Sonia_Gandhi",
                    "title": "Sonia Gandhi"
                  }
                ],
                "images": [],
                "items": {}
              },
              "Succeeded by": {
                "type": "item",
                "key": "Succeeded by",
                "text": "Sonia Gandhi (Interim)",
                "links": [
                  {
                    "text": "Sonia Gandhi",
                    "href": "/wiki/Sonia_Gandhi",
                    "title": "Sonia Gandhi"
                  }
                ],
                "images": [],
                "items": {}
              }
            }
          }
        }
      },
      "Member of Parliament, Lok Sabha": {
        "type": "header",
        "key": "Member of Parliament, Lok Sabha",
        "text": "Member of Parliament, Lok Sabha",
        "links": [
          {
            "text": "Member of Parliament, Lok Sabha",
            "href": "/wiki/Member_of_Parliament,_Lok_Sabha",
            "title": "Member of Parliament, Lok Sabha"
          }
        ],
        "images": [],
        "items": {
          "Incumbent": {
            "type": "header",
            "key": "Incumbent",
            "text": "Incumbent",
            "links": [
              {
                "text": "Incumbent",
                "href": "/wiki/Incumbent",
                "title": "Incumbent"
              }
            ],
            "images": [],
            "items": {}
          },
          "Assumed office \n23 May 2019": {
            "type": "header",
            "key": "Assumed office \n23 May 2019",
            "text": "Assumed office \n23 May 2019",
            "links": [],
            "images": [],
            "items": {
              "Preceded by": {
                "type": "item",
                "key": "Preceded by",
                "text": "M. I. Shanavas",
                "links": [
                  {
                    "text": "M. I. Shanavas",
                    "href": "/wiki/M._I._Shanavas",
                    "title": "M. I. Shanavas"
                  }
                ],
                "images": [],
                "items": {}
              },
              "Constituency": {
                "type": "item",
                "key": "Constituency",
                "text": "Wayanad, Kerala",
                "links": [
                  {
                    "text": "Wayanad",
                    "href": "/wiki/Wayanad_(Lok_Sabha_constituency)",
                    "title": "Wayanad (Lok Sabha constituency)"
                  },
                  {
                    "text": "Kerala",
                    "href": "/wiki/Kerala",
                    "title": "Kerala"
                  }
                ],
                "images": [],
                "items": {}
              }
            }
          },
          "In office\n17 May 2004 \u2013 23 May 2019": {
            "type": "header",
            "key": "In office\n17 May 2004 \u2013 23 May 2019",
            "text": "In office\n17 May 2004 \u2013 23 May 2019",
            "links": [],
            "images": [],
            "items": {
              "Preceded by": {
                "type": "item",
                "key": "Preceded by",
                "text": "Sonia Gandhi",
                "links": [
                  {
                    "text": "Sonia Gandhi",
                    "href": "/wiki/Sonia_Gandhi",
                    "title": "Sonia Gandhi"
                  }
                ],
                "images": [],
                "items": {}
              },
              "Succeeded by": {
                "type": "item",
                "key": "Succeeded by",
                "text": "Smriti Irani",
                "links": [
                  {
                    "text": "Smriti Irani",
                    "href": "/wiki/Smriti_Irani",
                    "title": "Smriti Irani"
                  }
                ],
                "images": [],
                "items": {}
              },
              "Constituency": {
                "type": "item",
                "key": "Constituency",
                "text": "Amethi, Uttar Pradesh",
                "links": [
                  {
                    "text": "Amethi",
                    "href": "/wiki/Amethi_(Lok_Sabha_constituency)",
                    "title": "Amethi (Lok Sabha constituency)"
                  },
                  {
                    "text": "Uttar Pradesh",
                    "href": "/wiki/Uttar_Pradesh",
                    "title": "Uttar Pradesh"
                  }
                ],
                "images": [],
                "items": {}
              }
            }
          }
        }
      },
      "Vice-President of the Indian National Congress": {
        "type": "header",
        "key": "Vice-President of the Indian National Congress",
        "text": "Vice-President of the Indian National Congress",
        "links": [
          {
            "text": "Indian National Congress",
            "href": "/wiki/Indian_National_Congress",
            "title": "Indian National Congress"
          }
        ],
        "images": [],
        "items": {
          "In office\n19 January 2013 \u2013 16 December 2017": {
            "type": "header",
            "key": "In office\n19 January 2013 \u2013 16 December 2017",
            "text": "In office\n19 January 2013 \u2013 16 December 2017",
            "links": [],
            "images": [],
            "items": {
              "President": {
                "type": "item",
                "key": "President",
                "text": "Sonia Gandhi",
                "links": [
                  {
                    "text": "Sonia Gandhi",
                    "href": "/wiki/Sonia_Gandhi",
                    "title": "Sonia Gandhi"
                  }
                ],
                "images": [],
                "items": {}
              },
              "Preceded by": {
                "type": "item",
                "key": "Preceded by",
                "text": "Position established",
                "links": [],
                "images": [],
                "items": {}
              },
              "Succeeded by": {
                "type": "item",
                "key": "Succeeded by",
                "text": "Position abolished",
                "links": [],
                "images": [],
                "items": {}
              }
            }
          }
        }
      },
      "General Secretary of Indian National Congress": {
        "type": "header",
        "key": "General Secretary of Indian National Congress",
        "text": "General Secretary of Indian National Congress",
        "links": [
          {
            "text": "Indian National Congress",
            "href": "/wiki/Indian_National_Congress",
            "title": "Indian National Congress"
          }
        ],
        "images": [],
        "items": {
          "In office\n25 September 2007 \u2013 19 January 2013": {
            "type": "header",
            "key": "In office\n25 September 2007 \u2013 19 January 2013",
            "text": "In office\n25 September 2007 \u2013 19 January 2013",
            "links": [],
            "images": [],
            "items": {
              "President": {
                "type": "item",
                "key": "President",
                "text": "Sonia Gandhi",
                "links": [
                  {
                    "text": "Sonia Gandhi",
                    "href": "/wiki/Sonia_Gandhi",
                    "title": "Sonia Gandhi"
                  }
                ],
                "images": [],
                "items": {}
              }
            }
          }
        }
      },
      "Chair of Indian Youth Congress": {
        "type": "header",
        "key": "Chair of Indian Youth Congress",
        "text": "Chair of Indian Youth Congress",
        "links": [
          {
            "text": "Indian Youth Congress",
            "href": "/wiki/Indian_Youth_Congress",
            "title": "Indian Youth Congress"
          }
        ],
        "images": [],
        "items": {
          "Incumbent": {
            "type": "header",
            "key": "Incumbent",
            "text": "Incumbent",
            "links": [
              {
                "text": "Incumbent",
                "href": "/wiki/Incumbent",
                "title": "Incumbent"
              }
            ],
            "images": [],
            "items": {}
          },
          "Assumed office \n25 September 2007": {
            "type": "header",
            "key": "Assumed office \n25 September 2007",
            "text": "Assumed office \n25 September 2007",
            "links": [],
            "images": [],
            "items": {
              "Preceded by": {
                "type": "item",
                "key": "Preceded by",
                "text": "Position established",
                "links": [],
                "images": [],
                "items": {}
              }
            }
          }
        }
      },
      "Chair of National Students\u2019 Union of India": {
        "type": "header",
        "key": "Chair of National Students\u2019 Union of India",
        "text": "Chair of National Students\u2019 Union of India",
        "links": [
          {
            "text": "National Students\u2019 Union of India",
            "href": "/wiki/National_Students%E2%80%99_Union_of_India",
            "title": "National Students\u2019 Union of India"
          }
        ],
        "images": [],
        "items": {
          "Incumbent": {
            "type": "header",
            "key": "Incumbent",
            "text": "Incumbent",
            "links": [
              {
                "text": "Incumbent",
                "href": "/wiki/Incumbent",
                "title": "Incumbent"
              }
            ],
            "images": [],
            "items": {}
          },
          "Assumed office \n25 September 2007": {
            "type": "header",
            "key": "Assumed office \n25 September 2007",
            "text": "Assumed office \n25 September 2007",
            "links": [],
            "images": [],
            "items": {
              "Preceded by": {
                "type": "item",
                "key": "Preceded by",
                "text": "Position established",
                "links": [],
                "images": [],
                "items": {}
              }
            }
          }
        }
      },
      "Personal details": {
        "type": "header",
        "key": "Personal details",
        "text": "Personal details",
        "links": [],
        "images": [],
        "items": {
          "Born": {
            "type": "item",
            "key": "Born",
            "text": "(1970-06-19) 19 June 1970 (age 50)\nNew Delhi, India",
            "links": [],
            "images": [],
            "items": {}
          },
          "Political party": {
            "type": "item",
            "key": "Political party",
            "text": "Indian National Congress",
            "links": [
              {
                "text": "Indian National Congress",
                "href": "/wiki/Indian_National_Congress",
                "title": "Indian National Congress"
              }
            ],
            "images": [],
            "items": {}
          },
          "Parents": {
            "type": "item",
            "key": "Parents",
            "text": "Rajiv Gandhi\nSonia Gandhi",
            "links": [
              {
                "text": "Rajiv Gandhi",
                "href": "/wiki/Rajiv_Gandhi",
                "title": "Rajiv Gandhi"
              },
              {
                "text": "Sonia Gandhi",
                "href": "/wiki/Sonia_Gandhi",
                "title": "Sonia Gandhi"
              }
            ],
            "images": [],
            "items": {}
          },
          "Relatives": {
            "type": "item",
            "key": "Relatives",
            "text": "Nehru\u2013Gandhi family",
            "links": [
              {
                "text": "Nehru\u2013Gandhi family",
                "href": "/wiki/Nehru%E2%80%93Gandhi_family",
                "title": "Nehru\u2013Gandhi family"
              }
            ],
            "images": [],
            "items": {}
          },
          "Education": {
            "type": "item",
            "key": "Education",
            "text": "St. Stephen's College, Delhi\nHarvard University\nRollins College (BA)\nTrinity College, Cambridge (MPhil)",
            "links": [
              {
                "text": "St. Stephen's College, Delhi",
                "href": "/wiki/St._Stephen%27s_College,_Delhi",
                "title": "St. Stephen's College, Delhi"
              },
              {
                "text": "Harvard University",
                "href": "/wiki/Harvard_University",
                "title": "Harvard University"
              },
              {
                "text": "Rollins College",
                "href": "/wiki/Rollins_College",
                "title": "Rollins College"
              },
              {
                "text": "BA",
                "href": "/wiki/Bachelor_of_Arts",
                "title": "Bachelor of Arts"
              },
              {
                "text": "Trinity College, Cambridge",
                "href": "/wiki/Trinity_College,_Cambridge",
                "title": "Trinity College, Cambridge"
              },
              {
                "text": "MPhil",
                "href": "/wiki/Master_of_Philosophy",
                "title": "Master of Philosophy"
              }
            ],
            "images": [],
            "items": {}
          },
          "Signature": {
            "type": "item",
            "key": "Signature",
            "text": "",
            "links": [
              {
                "text": "",
                "href": "/wiki/File:Signature_of_Rahul_Gandhi.svg",
                "title": "Rahul Gandhi's signature"
              }
            ],
            "images": [
              {
                "src": "//upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Signature_of_Rahul_Gandhi.svg/128px-Signature_of_Rahul_Gandhi.svg.png",
                "width": "128",
                "height": "44",
                "alt": ""
              }
            ],
            "items": {}
          },
          "Website": {
            "type": "item",
            "key": "Website",
            "text": "Official website",
            "links": [
              {
                "text": "Official website",
                "href": "http://rahulgandhi.in",
                "title": ""
              }
            ],
            "images": [],
            "items": {}
          }
        }
      }
    }
    

    BTW:

    I used headers to group the items because many labels ("Preceded by", etc.) occur several times.

    I tried to collect all the information about text, links and images, and to create the same fields in every element even if some of them have no value.

    The only thing I'm not sure about is whether it is good to use headers as keys - it could be easier to use a list of sections instead of keys like "Member of Parliament, Lok Sabha", which can be different for different people.