Tags: python, python-3.x, web-scraping, attributeerror, quora

"AttributeError" in web scraping using python


When I executed the same code on my laptop using Jupyter, I got the following error:

AttributeError                            Traceback (most recent call last)
      # form cycles)
      excludedPages = filter(isInternalNode, getChildren("http://www.quora.com/directory"))
----> excludedPages.append("http://www.quora.com")
      excludedPages.append("http://www.quora.com#")
      excludedPages.append("http://www.quora.com/")

AttributeError: 'filter' object has no attribute 'append'

The code is here: https://github.com/jessicatysu/quora/blob/master/numfollowers.py


Solution

  • This code was written for Python 2 - you can see print used without (), which only works in Python 2.

    Python 2 and Python 3 differ in other ways too. In Python 2, filter() creates a list, but in Python 3 filter() is "lazy": it doesn't create the list at once, so in some situations you have to use list() to convert the filter object into a list.

    And you have to do it before you use append():

    excludedPages = list(filter(...))
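
    For example, here is a minimal demonstration of the difference in Python 3 (the sample strings are made up just for illustration):

    names = ["alice-smith", "directory/page/2", "bob-jones"]
    matches = filter(lambda s: "-" in s, names)   # lazy filter object, not a list
    # matches.append("carol-lee")                 # would raise the same AttributeError
    matches = list(matches)                       # materialize the iterator first
    matches.append("carol-lee")                   # now append() works
    print(matches)                                # ['alice-smith', 'bob-jones', 'carol-lee']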
    

    EDIT:

    Here is the code, which runs without error.

    In the code you can see comments marked # changed with more information.

    But the problem is that this script is 7 years old (from 2013) and Quora has since changed its HTML:

    • the starting page /directory doesn't display a list of users any more
      (probably for security or the GDPR, General Data Protection Regulation)
    • it uses JavaScript, but mechanize can't run JavaScript

    so the code is useless :)

    You would have to use Selenium to control a real web browser, which can run JavaScript, and you would have to analyze the web pages to write new code (a minimal Selenium sketch follows the full script below).

    # Grabs N people randomly from the directory using reservoir sampling, then
    # counts the number of followers they have.  I never got to run this script
    # to completion because Quora blocked the script before I added the rate
    # limits.
    
    import mechanize
    import random
    import http.cookiejar as cookielib  # changed: in Python 3 module `cookielib` was renamed to `http.cookiejar`
    import re
    from time import sleep
    
    NUM_SAMPLES = 1000
    FOLLOWERS_FILE = "followers.txt"
    USERS_FILE = "users.txt"
    ERR_LOG = "errors.txt"
    
    err = open(ERR_LOG, 'w')
    
    # Randomly chosen Quora users (written in the form of links to Quora
    # profiles)
    users = []
    curUserIdx = 1
    
    # Regular expressions that will be used multiple times
    leaf = re.compile("-") # Separator between first and last names!
    internalNode = re.compile("directory/page")
    fnum = re.compile("Followers.*>([0-9]+)<.*Following")
    
    # We use this function to open pages instead of br.open to avoid putting a
    # high load on Quora's servers.  This means the script takes a lot longer
    # though - estimated time 1 day for 2 million users.  (21400 page accesses
    # * 4 seconds per access = 23.8 hours.)
    def openPage(site):
        print('[DEBUG] openPage:', site)  # changed: add only for debug 
        result = br.open(site)  # changed: add `result =`
        sleep(3)
        return result  # changed: add `return result`
    
    # Gets child links
    def getChildren(node):
        try:
            openPage(node)
            print(br.links())
            return ["http://www.quora.com" + link.url for link in br.links()]
        except:
            print("Could not get children of " + node)
            err.write("Could not get children of " + node)
            return []
    
    # Checks to see if the link is a user profile.
    def isLeaf(node):
        return leaf.search(node)
    
    # Checks to see if the link is an intermediate node in the directory.
    def isInternalNode(node):
        return internalNode.search(node)
    
    # Checks to see if the page is part of the people directory
    def inPeopleDirectory(node):
        try:
            page = openPage(node)
            html = page.read()
        except Exception as ex:  # changed: display some info about problem
            print('ex:', ex)     # changed: display some info about problem
            print("Could not open site " + node)
            err.write("Could not open site " + node)
            return False
    
        # --- change : add decode with try/except ---
        try:
            html = html.decode('utf-8')
        except:
            print("Could not decode HTML using UTF-8 " + node)
            err.write("Could not decode HTML using UTF-8 " + node)
            return False
        # --- change : end ---
    
        return "People on Quora" in html
    
    # Applies reservoir sampling to a candidate leaf
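    # (Reservoir sampling keeps a uniform random sample of fixed size
    # NUM_SAMPLES from a stream: after n candidates have been processed,
    # each of them is in `users` with probability NUM_SAMPLES / n.)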
    def sample(node):
        # curUserIdx is 1-indexed
        global users, curUserIdx
        # Initialize the list
        if (curUserIdx <= NUM_SAMPLES):
            users.append(node)
    
        # Replace elements
        else:
            # random.randint chooses a random integer, inclusive
            choice = random.randint(1, curUserIdx)
            if (choice <= NUM_SAMPLES):
                users[choice - 1] = node
        curUserIdx += 1
    
    # Gets the number of followers for a user
    def getFollowers(profile):
        try:
            page = openPage(profile)
            m = fnum.search(page.read().decode('utf-8'))  # changed: decode bytes to str, same fix as in inPeopleDirectory()
            if m:
                return m.group(1)
        except:
            print("Could not get number of followers for " + profile)
            err.write("Could not get number of followers for " + profile)
    
    # Traverses the tree using depth first search.
    def crawl(node):
        for child in getChildren(node):
            if child in excludedPages:
                pass
            elif isLeaf(child):
                print("Sampling " + child)
                sample(child)
            elif isInternalNode(child):
                print("Crawling internal node " + child)
                crawl(child)
            else:
                print("Passing on link " + child)
    
    # Initialize browser
    
    br = mechanize.Browser()
    
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    
    # Follows refresh 0 but not hangs on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    
    # User-Agent
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; \
    rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    
    # Get list of top level pages (and exclude them from searches, because they
    # form cycles)
    excludedPages = list(filter(isInternalNode, getChildren("https://www.quora.com/directory")))  # changed: add `list()`
    excludedPages.append("https://www.quora.com")
    excludedPages.append("https://www.quora.com#")
    excludedPages.append("https://www.quora.com/")
    excludedPages.append("https://www.quora.com/about/tos")
    print('[DEBUG] excludedPages:', excludedPages)  # changed: add only for debug
    
    topPages = list(filter(inPeopleDirectory, excludedPages))  # changed: add `list()` so the debug print below doesn't exhaust the iterator
    print('[DEBUG] topPages:', topPages)  # changed: add only for debug
    
    # Access Quora directory (it's public access!)
    for page in topPages:
        crawl(page)
    
    # Get followers for each user
    ff = open(FOLLOWERS_FILE, 'w')
    uf = open(USERS_FILE, 'w')
    
    # Write these in two separate steps in case something goes wrong with
    # getFollowers.  I don't want to lose my random sample, because that is the
    # hardest part to get.
    for u in users:
        uf.write(u + "\n")
    uf.close()
    
    for u in users:
        numFollowers = getFollowers(u)
        if numFollowers:
            ff.write(u + "\t" + numFollowers + "\n")  # changed: reuse numFollowers instead of calling getFollowers() twice
    
    ff.close()
    err.close()
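
    As mentioned above, a working replacement would need Selenium to drive a real browser that can run JavaScript. Here is a minimal sketch, assuming Selenium 4 and a chromedriver available on PATH; it only collects all anchor URLs from the directory page, since you would still have to inspect Quora's current HTML (and probably log in) to find the actual profile links:

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()                       # starts a real Chrome browser
    driver.get("https://www.quora.com/directory")     # JavaScript runs inside the browser
    links = driver.find_elements(By.TAG_NAME, "a")    # all anchor elements after rendering
    urls = [a.get_attribute("href") for a in links]   # href may be None for some anchors
    print(urls[:10])
    driver.quit()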