I have a working REST Search API script that pulls tweets according to https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./
Problem: This code works, but pulls tweets with searchQuery1
and searchQuery2
. (e.g. tweets with Prostate Cancer
+ Colon Cancer
). I don't want this. Instead, I would like to get all of tweets from searchQuery1
(only tweets with Prostate Cancer
), and then all of the tweets from searchQuery2
, (only tweets with Colon Cancer
). The queries should run separately.
Goal: Sequentially loop over X number of search queries (e.g. searchQuery1
, searchQuery2
, etc)
Thank you!
searchQuery1 = 'Prostate Cancer'
searchQuery2 = 'Colon Cancer'
maxTweets = 10000
tweetsPerQry = 100
fprefix = 'REST'
sinceId = None
max_id = -1L
tweetCount = 0
with open('/Users/eer/Desktop/' + fprefix + '.' + time.strftime('%Y-%m-%d_%H-%M-%S') + '.json', 'a+') as f: #open file
while tweetCount < maxTweets:
try:
if (max_id <= 0):
if (not sinceId):
for x,y in zip(searchQuery1,searchQuery2):
new_tweets = api.search(q=[searchQuery1, searchQuery2], count=tweetsPerQry)
else:
print "sinceID 1"
new_tweets = api.search(q=[searchQuery1, searchQuery2], count=tweetsPerQry,
since_id=sinceId)
else:
if (not sinceId):
print "not sinceID 2"
new_tweets = api.search(q=[searchQuery1, searchQuery2], count=tweetsPerQry,
max_id=str(max_id - 1))
else:
print "sinceID 1"
new_tweets = api.search(q=[searchQuery1, searchQuery2], count=tweetsPerQry,
max_id=str(max_id - 1),
since_id=sinceId)
if not new_tweets:
print("No more tweets found")
break
for tweet in new_tweets:
f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
'\n')
tweetCount += len(new_tweets)
max_id = new_tweets[-1].id
except tweepy.TweepError as e:
print("some error : " + str(e))
break
print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fprefix))
searchQuery = ['Prostate Cancer', 'Colon Cancer']
i = 0
maxTweets = 1000
tweetsPerQry = 100
fprefix = 'REST'
language = ['en']
sinceId = None
max_id = -1L
tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open('/Users/eer/Desktop/' + fprefix + '.' + time.strftime('%Y-%m-%d_%H-%M-%S') + '.json', 'a+') as f:
while tweetCount < maxTweets:
try:
if (max_id <= 0):
if (not sinceId):
for search in searchQuery:
new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry, languages=language)
else:
for search in searchQuery:
new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry,
since_id=sinceId, languages=language)
else:
print "not sinceID 2"
for search in searchQuery:
new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry,
max_id=str(max_id - 1),languages=language)
else:
for search in searchQuery:
new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry,
max_id=str(max_id - 1),
since_id=sinceId, languages=language)
if not new_tweets:
print("No more tweets found; checking next query")
i = i + 1
try:
for search in searchQuery:
new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry, languages=language)
except IndexError:
break
for tweet in new_tweets:
f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
'\n')
tweetCount += len(new_tweets)
print("Downloaded {0} tweets".format(tweetCount))
max_id = new_tweets[-1].id
except tweepy.TweepError as e:
print("some error : " + str(e))
break
print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fprefix))