Tags: python, screen-scraping, wikipedia

Scraping links from Wikipedia


So I am trying to scrape links from a random Wikipedia page. Here is my code thus far:

from bs4 import BeautifulSoup
import requests
import re  # used by validlink() below


# function get random page
def get_random():
#    r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
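    # Special:Random redirects to a random article; requests follows the
    # redirect by default, so r.url returns the final article's URL.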

    r = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')

    return r.url
#========================


#finding the valid link
def validlink(href):
    if href:
        if re.compile('^/wiki/').search(href):
            if not re.compile(r'/\w+:').search(href):
                return True
    return False
#validlink()===========
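# e.g. validlink('/wiki/Python_(programming_language)')  -> True
#      validlink('/wiki/Help:Contents')                  -> False (namespace link)
#      validlink('https://example.com')                  -> False (external link)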



#the first site
a1 = get_random()

#print("the first site is: " + a1) 
# the first site end()====

#looking for the article name:

blin = requests.get(a1)

soup = BeautifulSoup(blin.text, 'html.parser')

title = soup.find('h1', {'class' : 'firstHeading'})

print("starting website: " + a1 + " Titled:  " + title.text)

print("")
#=============================

    #first article done

#find body:
body = blin.text  # reuse the response fetched above instead of requesting a1 again

soup = BeautifulSoup(body, 'html.parser')

for link in soup.find_all("a"):
    url = link.get("href", "")
    print(url)  # prints every href, still unfiltered -- this is the part that's wrong
#======================

I know I'm doing this last part wrong; I'm new to Python, so I just have no idea how to go about it. What I need is to pull all of the links from the random site that the random page takes me to, pull the link and title off of that site, and then pull the Wikipedia links off of that page, which is what I am trying to do in that last bit of code.


At this point I want to print all of the links that it finds, after they have been tested against my validlink function at the top.


Again, forgive me for being new and not understanding all of this, but please help; I cannot figure this out.

So the question I have is: I need to create a snippet of code that pulls all of the links off of the Wikipedia page (note that I still don't know how to do this; the for loop was my best guess based on my own research), then tests the links I pulled against my validlink function, and prints out all of the valid links.


Solution

  • If you want it as a list, create a new list and append() each url only if it is valid.

    Because the same url can appear many times on a page, I also check whether the url is already in the list.

    valid_urls = []
    
    for link in soup.find_all('a'): # find_all('a', {'href': True}):
        url = link.get('href', '')
        if url not in valid_urls and validlink(url):
            valid_urls.append(url)
    
    print(valid_urls)
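
    As a side note, `url not in valid_urls` rescans the whole list for every link, so on pages with many links a set makes the membership test O(1). A minimal variant (my own sketch, keeping the list only to preserve order):

    seen = set()
    valid_urls = []

    for link in soup.find_all('a'):
        url = link.get('href', '')
        if url not in seen and validlink(url):
            seen.add(url)
            valid_urls.append(url)

    Putting it all together, the full working code: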
    

    from bs4 import BeautifulSoup
    import requests
    import re
    
    # --- functions ---
    
    def is_valid(url):
        """finding the valid link"""
    
        if url:
            if url.startswith('/wiki/'): # you don't need `re` to check it
                if not re.search(r'/\w+:', url):
                    return True
    
        return False
    
    # --- main ---
    
    #random_url = 'https://en.wikipedia.org/wiki/Special:Random'
    random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'
    
    r = requests.get(random_url)
    print('url:', r.url)
    
    soup = BeautifulSoup(r.text, 'html.parser')
    
    title = soup.find('h1', {'class': 'firstHeading'})
    
    print('starting website:', r.url)
    print('titled:', title.text)
    print()
    
    valid_urls = []
    
    for link in soup.find_all('a'): # find_all('a', {'href': True}):
        url = link.get('href', '')
        if url not in valid_urls and is_valid(url):
            valid_urls.append(url)
    
    #print(valid_urls)
    
    #for url in valid_urls:        
    #    print(url)
    
    print('\n'.join(valid_urls))
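
    If you then want to follow one of those links (the question mentions pulling the title off the next page), note that the hrefs are relative paths, so join them with the site root first. A minimal sketch, assuming the code above has already run; random.choice just picks one of the collected links:

    import random
    from urllib.parse import urljoin

    # hrefs like '/wiki/Foo' need the domain prepended before requesting them
    next_url = urljoin('https://en.wikipedia.org', random.choice(valid_urls))

    r2 = requests.get(next_url)
    soup2 = BeautifulSoup(r2.text, 'html.parser')

    print('next website:', r2.url)
    print('titled:', soup2.find('h1', {'class': 'firstHeading'}).text)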