Tags: python, pandas, beautifulsoup, urllib2

Pandas: Write all re.search results to csv from BeautifulSoup


I have the beginnings of a Python pandas script that searches for values on Google and grabs any PDF links it can find on the first page.

I have two questions, listed below.

import pandas as pd
from bs4 import BeautifulSoup
import urllib2
import re

df = pd.DataFrame(["Shakespeare", "Beowulf"], columns=["Search"])    

print "Searching for PDFs ..."

hdr = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept-Encoding": "none",
    "Accept-Language": "en-US,en;q=0.8",
    "Connection": "keep-alive"}

def crawl(search):
    google = "http://www.google.com/search?q="
    url = google + search + "+" + "PDF"
    req = urllib2.Request(url, headers=hdr)

    pdf_links = None
    placeholder = None #just a column placeholder

    try:
        page = urllib2.urlopen(req).read()
        soup = BeautifulSoup(page, "html.parser")
        cite = soup.find_all("cite", attrs={"class":"_Rm"})
        for link in cite:
            all_links = re.search(r".+", link.text).group().encode("utf-8")
            if all_links.endswith(".pdf"):
                pdf_links = re.search(r"(.+)pdf$", all_links).group()
            print pdf_links

    except urllib2.HTTPError, e:
        print e.fp.read()

    return pd.Series([pdf_links, placeholder])

df[["PDF links", "Placeholder"]] = df["Search"].apply(crawl)

df.to_csv(FileName, index=False)

The results from print pdf_links will be:

davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
sparks.eserver.org/books/shakespeare-tempest.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
www.penguin.com/static/pdf/.../beowulf.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
sparks.eserver.org/books/beowulf.pdf

And the csv output will look like:

Search         PDF Links
Shakespeare    calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
Beowulf        sparks.eserver.org/books/beowulf.pdf

Questions:

  • Is there a way to write all of the results to the csv as rows, instead of just the bottom one? And if possible, can each row include the corresponding Search value ("Shakespeare" or "Beowulf")?
  • How can I write out the full PDF links, instead of long links being automatically abbreviated with "..."?

Solution

  • In your version, crawl returns a single Series per search term, and pdf_links is overwritten on every pass through the loop, which is why only the last match survives. Collect a (term, link) tuple for every match instead and build a DataFrame from those rows. This will get you all the proper PDF links using soup.find_all("a", href=True) and save them in a DataFrame and to a csv:

    import pandas as pd
    from bs4 import BeautifulSoup
    import urllib2

    # Present a browser-like User-Agent so Google serves the normal results page.
    hdr = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "Accept-Encoding": "none",
        "Accept-Language": "en-US,en;q=0.8",
        "Connection": "keep-alive"}
    
    
    def crawl(columns=None, *search):
        df = pd.DataFrame(columns=columns)
        for term in search:
            google = "http://www.google.com/search?q="
            url = google + term + "+" + "PDF"
            req = urllib2.Request(url, headers=hdr)
            try:
                page = urllib2.urlopen(req).read()
                soup = BeautifulSoup(page, "html.parser")
                pdfs = []
                # The href attribute holds the full target URL, unlike the
                # truncated display text shown inside the <cite> tags.
                links = soup.find_all("a", href=True)
                for link in links:
                    lk = link["href"]
                    if lk.endswith(".pdf"):
                        pdfs.append((term, lk))
                # One (term, link) row per match for this search term.
                df2 = pd.DataFrame(pdfs, columns=columns)
                df = df.append(df2, ignore_index=True)
            except urllib2.HTTPError, e:
                print e.fp.read()
        return df
    
    
    df = crawl(["Search", "PDF link"], "Shakespeare", "Beowulf")
    df.to_csv("out.csv", index=False)
    

    out.csv:

    Search,PDF link
    Shakespeare,http://davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
    Shakespeare,http://www.w3.org/People/maxf/XSLideMaker/hamlet.pdf
    Shakespeare,http://sparks.eserver.org/books/shakespeare-tempest.pdf
    Shakespeare,https://phillipkay.files.wordpress.com/2011/07/william-shakespeare-plays.pdf
    Shakespeare,http://www.artsvivants.ca/pdf/eth/activities/shakespeare_overview.pdf
    Shakespeare,http://triggs.djvu.org/djvu-editions.com/SHAKESPEARE/SONNETS/Download.pdf
    Beowulf,http://www.yorku.ca/inpar/Beowulf_Child.pdf
    Beowulf,https://is.muni.cz/el/1441/podzim2013/AJ2RC_STAL/2._Beowulf.pdf
    Beowulf,http://teacherweb.com/IL/Steinmetz/MottramM/Beowulf---Seamus-Heaney.pdf
    Beowulf,http://www.penguin.com/static/pdf/teachersguides/beowulf.pdf
    Beowulf,http://www.neshaminy.org/cms/lib6/PA01000466/Centricity/Domain/380/text.pdf
    Beowulf,http://www.sparknotes.com/free-pdfs/uscellular/download/beowulf.pdf
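
  • As for the second question: the "..." is not something your script adds. Google truncates the display URL shown inside the <cite> elements, so link.text can only ever recover the shortened form; the full URL lives in the href attribute of the enclosing <a> tag, which is what the code above reads. A minimal sketch of the difference, using a hand-written fragment in place of a live results page (the markup and the _Rm class are taken from the question and may not match what Google currently serves):

    from bs4 import BeautifulSoup

    # Hypothetical fragment mirroring the markup the question scrapes:
    # the <cite> shows a shortened display URL, while the enclosing <a>
    # carries the full target in its href attribute.
    snippet = ('<a href="http://www.w3.org/People/maxf/XSLideMaker/hamlet.pdf">'
               '<cite class="_Rm">www.w3.org/People/maxf/.../hamlet.pdf</cite></a>')

    soup = BeautifulSoup(snippet, "html.parser")
    print soup.cite.text    # www.w3.org/People/maxf/.../hamlet.pdf
    print soup.a["href"]    # http://www.w3.org/People/maxf/XSLideMaker/hamlet.pdf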
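
  • One caveat: depending on how Google serves the page, result hrefs sometimes arrive wrapped as /url?q=<real URL>&sa=... rather than as direct links. If your results look like that, unwrap them before the .endswith(".pdf") check. A small helper could look like this (the name unwrap is just for illustration):

    from urlparse import urlparse, parse_qs

    def unwrap(href):
        # Google sometimes wraps result links as /url?q=<target>&sa=...;
        # pull the real target out of the q query parameter when present.
        if href.startswith("/url?"):
            return parse_qs(urlparse(href).query).get("q", [href])[0]
        return href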