
IndexError: list index out of range (on Reddit data crawler)


The code below is expected to run without issues.

My Reddit data crawler:

    import requests
    import re
    import praw
    from datetime import date
    import csv
    import pandas as pd
    import time
    import sys

    class Crawler(object):
        '''
            basic_url is the reddit site.
        headers is for the requests.get method.
            REX is to find submission ids.
        '''
        def __init__(self, subreddit="apple"):
            '''
                Initialize a Crawler object.
                    subreddit is the topic you want to parse. The default is "apple".
                basic_url is the reddit site.
                headers is for the requests.get method.
                REX is to find submission ids.
                submission_ids saves all the ids of the submissions you will parse.
                reddit is an object created using the praw API. Please check it before you use it.
            '''
            self.basic_url = "https://www.reddit.com"
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
            self.REX = re.compile(r"<div class=\" thing id-t3_[\w]+")
            self.subreddit = subreddit
            self.submission_ids = []
            self.reddit = praw.Reddit(client_id="your_id", client_secret="your_secret", user_agent="subreddit_comments_crawler")

        def get_submission_ids(self, pages=2):
            '''
                Collect all ids of submissions.
                One page has 25 submissions.
                page url: https://www.reddit.com/r/subreddit/?count=25&after=t3_id
                    id (the "after" parameter) is the last submission from the previous page.
            '''
    #         This is page url.
            url = self.basic_url + "/r/" + self.subreddit

            if pages <= 0:
                return []

            text = requests.get(url, headers=self.headers).text
            ids = self.REX.findall(text)
            ids = list(map(lambda x: x[-6:], ids))
            if pages == 1:
                self.submission_ids = ids
                return ids

            count = 0
            after = ids[-1]
            for i in range(1, pages):
                count += 25
                temp_url = self.basic_url + "/r/" + self.subreddit + "?count=" + str(count) + "&after=t3_" + ids[-1]
                text = requests.get(temp_url, headers=self.headers).text
                temp_list = self.REX.findall(text)
                temp_list = list(map(lambda x: x[-6:], temp_list))
                ids += temp_list
                if count % 100 == 0:
                    time.sleep(60)
            self.submission_ids = ids
            return ids

        def get_comments(self, submission):
            '''
                Submission is an object created using praw API.
            '''
    #         Remove all "more comments".
            submission.comments.replace_more(limit=None)
            comments = []
            for each in submission.comments.list():
                try:
                    comments.append((each.id, each.link_id[3:], each.author.name, date.fromtimestamp(each.created_utc).isoformat(), each.score, each.body) )
                except AttributeError as e: # Some comments are deleted, we cannot access them.
    #                 print(each.link_id, e)
                    continue
            return comments

        def save_comments_submissions(self, pages):
            '''
                1. Save all the ids of submissions.
                2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
                3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
                4. Separately, save them to two csv files.
                Note: You can link them with submission_id.
                Warning: According to the rules of the Reddit API, requests should not be made too frequently. To be safe, use the default time span in this crawler.
            '''

            print("Start to collect all submission ids...")
            self.get_submission_ids(pages)
            print("Start to collect comments...This may cost a long time depending on # of pages.")
            submission_url = self.basic_url + "/r/" + self.subreddit + "/comments/"
            comments = []
            submissions = []
            count = 0
            for idx in self.submission_ids:
                temp_url = submission_url + idx
                submission = self.reddit.submission(url=temp_url)
                submissions.append((submission.name[3:], submission.num_comments, submission.score, submission.subreddit_name_prefixed, date.fromtimestamp(submission.created_utc).isoformat(), submission.title, submission.selftext))
                temp_comments = self.get_comments(submission)
                comments += temp_comments
                count += 1
                print(str(count) + " submissions processed...")
                if count % 50 == 0:
                    time.sleep(60)
            comments_fieldnames = ["comment_id", "submission_id", "author_name", "post_time", "comment_score", "text"]
            df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
            df_comments.to_csv("comments.csv")
            submissions_fieldnames = ["submission_id", "num_of_comments", "submission_score", "submission_subreddit", "post_date", "submission_title", "text"]
            df_submission = pd.DataFrame(submissions, columns=submissions_fieldnames)
            df_submission.to_csv("submissions.csv")
            return df_comments


    if __name__ == "__main__":
        args = sys.argv[1:]
        if len(args) != 2:
            print("Wrong number of args...")
            exit()

        subreddit, pages = args
        c = Crawler(subreddit)
        c.save_comments_submissions(int(pages))

but I got:

    (base) UserAir:scrape_reddit user$ python reddit_crawler.py apple 2
    Start to collect all submission ids...
    Traceback (most recent call last):
      File "reddit_crawler.py", line 127, in <module>
        c.save_comments_submissions(int(pages))
      File "reddit_crawler.py", line 94, in save_comments_submissions
        self.get_submission_ids(pages)
      File "reddit_crawler.py", line 54, in get_submission_ids
        after = ids[-1]
    IndexError: list index out of range


Solution

  • Erik's answer diagnoses the specific cause of this error, but more broadly I think it comes from not using PRAW to its fullest potential. Your script imports requests and makes a lot of manual requests that PRAW already has methods for. The whole point of PRAW is to save you from writing requests like these, for tasks such as paginating a listing, so I recommend you take advantage of that.

    As an example, your get_submission_ids function (which scrapes the web version of Reddit and handles pagination) could be replaced by just

    def get_submission_ids(self, pages=2):
        return [
            submission.id
            for submission in self.reddit.subreddit(self.subreddit).hot(
                limit=25 * pages
            )
        ]
    

    because the .hot() function does everything you tried to do by hand.
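
    For reference, here is a minimal, hedged sketch (reusing the same placeholder credentials as your script) of how PRAW's listing generators paginate for you; .new() and .top() follow the same pattern as .hot():

    import praw

    # Assumes valid API credentials; these placeholders mirror the ones above.
    reddit = praw.Reddit(
        client_id="your_id",
        client_secret="your_secret",
        user_agent="subreddit_comments_crawler",
    )

    # .hot() returns a lazy ListingGenerator: PRAW requests further pages from
    # the API as you iterate, so there is no manual count/after bookkeeping.
    for submission in reddit.subreddit("apple").hot(limit=60):
        print(submission.id, submission.title)

    # Other listings work the same way:
    newest = list(reddit.subreddit("apple").new(limit=25))
    weekly_top = list(reddit.subreddit("apple").top(time_filter="week", limit=25))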

    I'm going to go one step further here and have the function just return a list of Submission objects, because the rest of your code ends up doing things that would be better done by interacting with the PRAW Submission object. Here's that code (I renamed the function to reflect its updated purpose):

    def get_submissions(self, pages=2):
        return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))
    

    (I've also changed this function to only return its result. Your version both returns the value and stores it in self.submission_ids, except when pages is 0, which felt inconsistent.)

    Your get_comments function looks good.
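
    One optional tweak (not required, since your version works): in PRAW, a deleted comment's author is None, so you could skip those explicitly instead of catching the AttributeError raised by each.author.name. Here's a hedged drop-in variant of the same method, assuming the same imports and class context:

    def get_comments(self, submission):
        """
            Submission is an object created using praw API.
        """
        # Remove all "more comments" placeholders.
        submission.comments.replace_more(limit=None)
        comments = []
        for each in submission.comments.list():
            if each.author is None:  # deleted/removed comments have no author
                continue
            comments.append(
                (
                    each.id,
                    each.link_id[3:],
                    each.author.name,
                    date.fromtimestamp(each.created_utc).isoformat(),
                    each.score,
                    each.body,
                )
            )
        return comments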

    The save_comments_submissions function, like get_submission_ids, does a lot of manual work that PRAW can handle. You construct a temp_url with the full URL of a post and then use it to make a PRAW Submission object, but we can replace that by directly using the objects returned by get_submissions. You also have some calls to time.sleep(), which I removed because PRAW automatically sleeps the appropriate amount for you. Lastly, I removed the return value of this function: its point is to save data to disk, not to return it anywhere else, and the rest of your script doesn't use the return value. Here's the updated version of that function:

    def save_comments_submissions(self, pages):
        """
            1. Save all the ids of submissions.
            2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
            3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
            4. Separately, save them to two csv files.
            Note: You can link them with submission_id.
            Warning: According to the rules of the Reddit API, requests should not be made too frequently. To be safe, use the default time span in this crawler.
        """
    
        print("Start to collect all submission ids...")
        submissions = self.get_submissions(pages)
        print(
            "Start to collect comments...This may cost a long time depending on # of pages."
        )
        comments = []
        pandas_submissions = []
        for count, submission in enumerate(submissions, start=1):
            pandas_submissions.append(
                (
                    submission.name[3:],
                    submission.num_comments,
                    submission.score,
                    submission.subreddit_name_prefixed,
                    date.fromtimestamp(submission.created_utc).isoformat(),
                    submission.title,
                    submission.selftext,
                )
            )
            temp_comments = self.get_comments(submission)
            comments += temp_comments
            print(str(count) + " submissions processed...")
    
        comments_fieldnames = [
            "comment_id",
            "submission_id",
            "author_name",
            "post_time",
            "comment_score",
            "text",
        ]
        df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
        df_comments.to_csv("comments.csv")
        submissions_fieldnames = [
            "submission_id",
            "num_of_comments",
            "submission_score",
            "submission_subreddit",
            "post_date",
            "submission_title",
            "text",
        ]
        df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
        df_submission.to_csv("submissions.csv")
    

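    (A related aside: if you ever do start from a bare submission ID rather than a Submission object, PRAW can build the object directly from the ID, so there's still no need to assemble a URL by hand. A minimal sketch, assuming a configured reddit instance such as self.reddit above, with a placeholder ID:)

    # "abc123" is a placeholder base-36 submission ID, not a real post.
    submission = reddit.submission(id="abc123")
    print(submission.title, submission.num_comments)
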
    Here's an updated version of the whole script that uses PRAW fully:

    from datetime import date
    import sys
    
    
    import pandas as pd
    import praw
    
    
    class Crawler:
        """
            basic_url is the reddit site.
            headers is for the requests.get method.
            REX is to find submission ids.
        """
    
        def __init__(self, subreddit="apple"):
            """
                Initialize a Crawler object.
                    subreddit is the topic you want to parse. The default is "apple".
                basic_url is the reddit site.
                headers is for the requests.get method.
                REX is to find submission ids.
                submission_ids saves all the ids of the submissions you will parse.
                reddit is an object created using the praw API. Please check it before you use it.
            """
            self.subreddit = subreddit
            self.submission_ids = []
            self.reddit = praw.Reddit(
                client_id="your_id",
                client_secret="your_secret",
                user_agent="subreddit_comments_crawler",
            )
    
        def get_submissions(self, pages=2):
            """
                Collect all submissions.
                One page has 25 submissions.
                page url: https://www.reddit.com/r/subreddit/?count=25&after=t3_id
                    id (the "after" parameter) is the last submission from the previous page.
            """
            return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))
    
        def get_comments(self, submission):
            """
                Submission is an object created using praw API.
            """
            #         Remove all "more comments".
            submission.comments.replace_more(limit=None)
            comments = []
            for each in submission.comments.list():
                try:
                    comments.append(
                        (
                            each.id,
                            each.link_id[3:],
                            each.author.name,
                            date.fromtimestamp(each.created_utc).isoformat(),
                            each.score,
                            each.body,
                        )
                    )
                except AttributeError as e:  # Some comments are deleted, we cannot access them.
                    #                 print(each.link_id, e)
                    continue
            return comments
    
        def save_comments_submissions(self, pages):
            """
                1. Save all the ids of submissions.
                2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
                3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
                4. Separately, save them to two csv files.
                Note: You can link them with submission_id.
                Warning: According to the rules of the Reddit API, requests should not be made too frequently. To be safe, use the default time span in this crawler.
            """
    
            print("Start to collect all submission ids...")
            submissions = self.get_submissions(pages)
            print(
                "Start to collect comments...This may cost a long time depending on # of pages."
            )
            comments = []
            pandas_submissions = []
            for count, submission in enumerate(submissions, start=1):
                pandas_submissions.append(
                    (
                        submission.name[3:],
                        submission.num_comments,
                        submission.score,
                        submission.subreddit_name_prefixed,
                        date.fromtimestamp(submission.created_utc).isoformat(),
                        submission.title,
                        submission.selftext,
                    )
                )
                temp_comments = self.get_comments(submission)
                comments += temp_comments
                print(str(count) + " submissions processed...")
    
            comments_fieldnames = [
                "comment_id",
                "submission_id",
                "author_name",
                "post_time",
                "comment_score",
                "text",
            ]
            df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
            df_comments.to_csv("comments.csv")
            submissions_fieldnames = [
                "submission_id",
                "num_of_comments",
                "submission_score",
                "submission_subreddit",
                "post_date",
                "submission_title",
                "text",
            ]
            df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
            df_submission.to_csv("submissions.csv")
    
    
    if __name__ == "__main__":
        args = sys.argv[1:]
        if len(args) != 2:
            print("Wrong number of args...")
            exit()
    
        subreddit, pages = args
        c = Crawler(subreddit)
        c.save_comments_submissions(int(pages))
    

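    Finally, since the docstrings note that you can link the two CSV files with submission_id, here's a hedged sketch of joining them back together with pandas (assuming the file names written above; to_csv also writes the DataFrame index as an extra first column, which index_col=0 skips when reading back):

    import pandas as pd

    # Load the two files written by save_comments_submissions.
    df_comments = pd.read_csv("comments.csv", index_col=0)
    df_submissions = pd.read_csv("submissions.csv", index_col=0)

    # Attach each comment to its parent submission via the shared submission_id.
    joined = df_comments.merge(
        df_submissions,
        on="submission_id",
        how="left",
        suffixes=("_comment", "_submission"),
    )
    print(joined.head())
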
    I realize that my answer here gets into Code Review territory, but I hope it's helpful for understanding some of the things PRAW can do. Your "list index out of range" error would have been avoided by using the pre-existing library code, so I do consider this a solution to your problem.