Search code examples
pythonpandasweb-scrapingredditpraw

How to get user information from scraped reddit posts?


I was able to scrape the top Reddit posts from a specific subreddit after a certain date. I collected the titles, post text, and other attributes of these posts into a dataframe.

However, I also want to collect attributes about the author of each post. I started by attempting to collect the comment karma for the author of each post, but I've run into errors.

This is the line I added to try to get the comment karma of each post's author:

# Author Karma
# NOTE(review): `.comment_karma` is applied to the return value of
# `append(...)`, not to the Redditor, and `post.author` is passed to
# `redditor()` as its `name` argument.
posts_dict["Author Karma"].append(reddit_read_only.redditor(post.author)).comment_karma

This is the resulting error...

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[11], line 33
     30 posts_dict["Author"].append(post.author)
     32 # Author Karama
---> 33 posts_dict["Author Karma"].append(reddit_read_only.redditor(post.author)).comment_karma
     35 # Unique ID of each post
     36 posts_dict["ID"].append(post.id)

File ~\anaconda3\lib\site-packages\praw\util\deprecate_args.py:43, in _deprecate_args.<locals>.wrapper.<locals>.wrapped(*args, **kwargs)
     36     arg_string = _generate_arg_string(_old_args[: len(args)])
     37     warn(
     38         f"Positional arguments for {func.__qualname__!r} will no longer be"
     39         f" supported in PRAW 8.\nCall this function with {arg_string}.",
     40         DeprecationWarning,
     41         stacklevel=2,
     42     )
---> 43 return func(**dict(zip(_old_args, args)), **kwargs)

File ~\anaconda3\lib\site-packages\praw\reddit.py:908, in Reddit.redditor(self, name, fullname)
    896 @_deprecate_args("name", "fullname")
    897 def redditor(
    898     self, name: Optional[str] = None, *, fullname: Optional[str] = None
    899 ) -> "praw.models.Redditor":
    900     """Return a lazy instance of :class:`.Redditor`.
    901 
    902     :param name: The name of the redditor.
   (...)
    906 
    907     """
--> 908     return models.Redditor(self, name=name, fullname=fullname)

File ~\anaconda3\lib\site-packages\praw\models\reddit\redditor.py:156, in Redditor.__init__(self, reddit, name, fullname, _data)
    146 """Initialize a :class:`.Redditor` instance.
    147 
    148 :param reddit: An instance of :class:`.Reddit`.
   (...)
    153 
    154 """
    155 if (name, fullname, _data).count(None) != 2:
--> 156     raise TypeError(
    157         "Exactly one of 'name', 'fullname', or '_data' must be provided."
    158     )
    159 if _data:
    160     assert (
    161         isinstance(_data, dict) and "name" in _data
    162     ), "Please file a bug with PRAW."

TypeError: Exactly one of 'name', 'fullname', or '_data' must be provided.

Here's a look at the full code that was working before adding in the previously mentioned line.

import praw
import pandas as pd
import datetime

subreddit = reddit_read_only.subreddit("SuicideWatch")

# Scraping the top posts of all time
posts = subreddit.top(time_filter="all", limit=None)

# One list per output column. Every kept post appends exactly one value to
# each list, so all columns have the same length when the DataFrame is built.
posts_dict = {"Title": [], "Post Text": [], "Author": [],
              "Author Karma": [], "ID": [], "Score": [],
              "Total Comments": [], "Created On": [], "Post URL": [],
              "Original Content": [], "Edited": [], "Saved": []
              }

# Only posts created after this cutoff are kept.
# NOTE(review): the parsed datetime is naive, so .timestamp() interprets it in
# the machine's local timezone; confirm whether a UTC cutoff was intended.
start_date = '01-01-20 00:00:00'
start_date = datetime.datetime.strptime(start_date, '%d-%m-%y %H:%M:%S').timestamp()

for post in posts:
    # Skip posts created on or before the cutoff
    if post.created_utc <= start_date:
        continue

    # post.author is None when the author's account has been deleted
    author = post.author

    # Title of each post
    posts_dict["Title"].append(post.title)

    # Text inside a post
    posts_dict["Post Text"].append(post.selftext)

    # Author of the post
    posts_dict["Author"].append(author)

    # Author's comment karma. post.author is already a Redditor, so the
    # attribute is read from it directly instead of calling
    # reddit_read_only.redditor() again. getattr() with a default yields None
    # both for a missing author and for a Redditor whose fetched data has no
    # 'comment_karma' (that access raises AttributeError), so this column
    # stays aligned with the others.
    # NOTE(review): other errors raised while fetching the author are not
    # handled here -- confirm whether they need to be caught.
    posts_dict["Author Karma"].append(getattr(author, "comment_karma", None))

    # Unique ID of each post
    posts_dict["ID"].append(post.id)

    # The score of a post
    posts_dict["Score"].append(post.score)

    # Total number of comments inside the post
    posts_dict["Total Comments"].append(post.num_comments)

    # Date the post was created (epoch seconds; converted below)
    posts_dict["Created On"].append(post.created_utc)

    # URL of each post
    posts_dict["Post URL"].append(post.url)

    # Whether the post is marked as original content
    posts_dict["Original Content"].append(post.is_original_content)

    # Edited check for each post
    posts_dict["Edited"].append(post.edited)

    # Saved check for each post
    posts_dict["Saved"].append(post.saved)

# Saving the data in a pandas dataframe
all_posts = pd.DataFrame(posts_dict)
all_posts['Created On'] = pd.to_datetime(all_posts['Created On'], unit='s')

I've since updated my code to only process posts that have an author (skipping posts where the author may be None), and changed the line that adds comment karma to this:

# Comment karma of the post's author.
# Reading `comment_karma` goes through the Redditor's lazy attribute lookup,
# which fetches the account and raises AttributeError if the fetched data
# has no such attribute.
posts_dict["Author Comment Karma"].append(post.author.comment_karma)

However, now I'm getting an Attribute Error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[15], line 34
     31 posts_dict["Author"].append(post.author.name)
     33 # Author of the post's Comment Karma
---> 34 posts_dict["Author Comment Karma"].append(post.author.comment_karma)
     36 # Unique ID of each post
     37 posts_dict["ID"].append(post.id)

File /opt/anaconda3/lib/python3.9/site-packages/praw/models/reddit/base.py:35, in RedditBase.__getattr__(self, attribute)
     33 if not attribute.startswith("_") and not self._fetched:
     34     self._fetch()
---> 35     return getattr(self, attribute)
     36 raise AttributeError(
     37     f"{self.__class__.__name__!r} object has no attribute {attribute!r}"
     38 )

File /opt/anaconda3/lib/python3.9/site-packages/praw/models/reddit/base.py:36, in RedditBase.__getattr__(self, attribute)
     34     self._fetch()
     35     return getattr(self, attribute)
---> 36 raise AttributeError(
     37     f"{self.__class__.__name__!r} object has no attribute {attribute!r}"
     38 )

AttributeError: 'Redditor' object has no attribute 'comment_karma'

Solution

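  • You can use post.author.comment_karma to get a user's comment karma. Note that post.author is None for deleted accounts, and some accounts (for example, suspended ones) may not expose comment_karma at all, in which case PRAW raises the AttributeError shown above. Guard the lookup so those posts get a placeholder instead of crashing the loop, e.g. getattr(post.author, "comment_karma", None).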