# importing libraries and packages
import snscrape.modules.twitter as sntwitter
import pandas as pd
# Creating list to append tweet data
ManUtd_list = []
# Using TwitterSearchScraper to scrape data and append tweets to list
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('Man Utd since:2020-12-31 until:2021-01-02').get_items()):
    if i >= 10000:  # number of tweets you want to scrape
        break
    ManUtd_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username])  # declare the attributes to be returned
# Creating a dataframe from the tweets list above
ManUtd_df = pd.DataFrame(ManUtd_list, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])
I am looking to scrape 10,000 tweets a day for this date range. How can I write the code so that the scraper loops through each date in the range and retrieves a maximum of 10,000 tweets per day?
Implementing date ranges is easy: just filter the result (a generator) by the date attribute. Here is a working example:
import snscrape.modules.twitter as sntwitter
import itertools
import multiprocessing.dummy as mp # multithreading
import datetime
start_date = datetime.datetime(2023,2,15,tzinfo=datetime.timezone.utc)
def get_tweets(username, n_tweets=100):
    tweets = itertools.islice(sntwitter.TwitterSearchScraper(f'from:{username}').get_items(), n_tweets)  # invoke the scraper
    tweets = filter(lambda t: t.date >= start_date, tweets)  # keep only tweets on/after start_date
    tweets = map(lambda t: (username, t.date, t.url, t.rawContent), tweets)  # keep only the attributes needed
    tweets = list(tweets)  # the result has to be pickle'able
    return tweets
# a list of accounts to scrape
user_names = ['kevin2kelly','briansolis','PeterDiamandis','Richard_Florida']
# parallelise queries for speed !
with mp.Pool(4) as p:
    results = p.map(get_tweets, user_names)
# combine results
results = list(itertools.chain(*results))
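To answer the looping part of the question directly: you can build one query per day using the since:/until: operators from the original snippet, and cap each day at 10,000 with itertools.islice instead of the manual counter. A minimal sketch along those lines (note that the tweet text attribute is content in older snscrape releases and rawContent in newer ones):
import datetime
import itertools
import snscrape.modules.twitter as sntwitter

def tweets_for_day(day, query='Man Utd', n_tweets=10000):
    # build a one-day window with the since:/until: operators
    since = day.strftime('%Y-%m-%d')
    until = (day + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    scraper = sntwitter.TwitterSearchScraper(f'{query} since:{since} until:{until}')
    # islice caps the generator at n_tweets for this day
    return [[t.date, t.id, t.content, t.user.username]  # use t.rawContent on newer snscrape versions
            for t in itertools.islice(scraper.get_items(), n_tweets)]

start = datetime.date(2020, 12, 31)
end = datetime.date(2021, 1, 2)  # exclusive, matching the until: operator
all_tweets = []
day = start
while day < end:
    all_tweets.extend(tweets_for_day(day))
    day += datetime.timedelta(days=1)
The same per-day loop can be handed to mp.Pool exactly like the per-user version above; multiprocessing.dummy uses threads, which works well here because the work is I/O-bound.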