python, python-3.x, web-scraping, web-crawler, google-crawlers

Google News crawler: flipping through result pages


Continuing previous work on a crawler that returns the title and URL of every news result for a query, I am refining it to collect the results from all pages of a Google News search. The current code only returns the first page of results. I would be grateful to know how to get the results from all pages. Many thanks!

My code is below:

import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint 
import numpy as np
import pandas as pd


query2Google = input("What do you want from Google News?\n")

def QGN(query2Google):
    s = '"'+query2Google+'"' #Keywords for query
    s = s.replace(" ","+")
    date = str(datetime.datetime.now().date()) #timestamp
    filename = query2Google + "_" + date + "_" + 'SearchNews.csv'    # csv filename (written by to_csv below)
    url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y" # URL for query of news results within one year and sort by date 

    time.sleep(randint(0, 2))    # brief pause before the request

    htmlpage = requests.get(url)
    print("Status code: "+ str(htmlpage.status_code))
    soup = BeautifulSoup(htmlpage.text,'lxml')

    df = []
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        #print ("-----Title----\n" + str(a_click.renderContents()))#Title

        #print ("----URL----\n" + str(a_click.get("href"))) #URL

        #print ("----Brief----\n" + str(result_table.find("div", {"class": "st"}).renderContents()))#Brief

        #print ("Done")
        df=np.append(df,[str(a_click.renderContents()).strip("b'"),str(a_click.get("href")).strip('/url?q='),str(result_table.find("div", {"class": "st"}).renderContents()).strip("b'")])


        df = np.reshape(df,(-1,3))
        df1 = pd.DataFrame(df,columns=['Title','URL','Brief'])
    print("Search Crawl Done!")

    df1.to_csv(filename, index=False, encoding='utf-8')
    return

QGN(query2Google)

Solution

  • There used to be an AJAX API, but it's no longer available.
    Still, you can modify your script with a for loop if you want a fixed number of pages, or a while loop if you want all pages.
    Example:

    url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="  
    pages = 10    # the number of pages you want to crawl # 
    
    for next in range(0, pages*10, 10) : 
        page = url + str(next)
        time.sleep(randint(1, 5))    # you may need longer than that #
        htmlpage = requests.get(page)    # you should add User-Agent and Referer #
        print("Status code: " + str(htmlpage.status_code))
        if htmlpage.status_code != 200 : 
            break    # something went wrong #  
        soup = BeautifulSoup(htmlpage.text, 'lxml')
    
        ... process response here ...
    
        next_page = soup.find('td', { 'class':'b', 'style':'text-align:left' }) 
        if next_page is None or next_page.a is None : 
            break    # there are no more pages #
    

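    A minimal sketch of the "... process response here ..." step, reusing the selectors and imports from the question (the div.g / div.st class names are whatever Google currently serves and may change at any time): collect plain dicts across pages, then build the DataFrame once after the loop.

    rows = []    # declared once, before the for loop

    # inside the loop, in place of "... process response here ...":
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        brief = result_table.find("div", {"class": "st"})
        if a_click is None or brief is None:
            continue    # skip blocks that don't match the expected layout
        rows.append({
            "Title": a_click.get_text(),
            "URL": a_click.get("href", "").replace("/url?q=", ""),
            "Brief": brief.get_text(),
        })

    # after the loop:
    df1 = pd.DataFrame(rows, columns=['Title', 'URL', 'Brief'])
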
    Keep in mind that Google doesn't like bots; you might get banned.
    You could add 'User-Agent' and 'Referer' headers to simulate a web browser, and use time.sleep(random.uniform(2, 6)) to simulate a human reader... or use selenium. A sketch of such a request follows.
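
    A minimal sketch of a request with those headers (the header strings are illustrative, not required values; page is the URL built in the loop above):

    import random
    import time

    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.36',
        'Referer': 'https://www.google.com.sg/',
    }

    time.sleep(random.uniform(2, 6))    # pause like a human reader
    htmlpage = requests.get(page, headers=headers)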