How to remove duplicate tweets in Python? - python

I am trying to retrieve about 1000 tweets for a search term like 'NFL' using tweepy and store them in a pandas DataFrame. My issue is that I can't find a way to remove duplicated tweets: I have tried df.drop_duplicates, but it leaves me with only about 100 tweets to work with. Help would be appreciated!
num_needed = 1000

# Parallel lists that become the DataFrame columns (tweets, usernames, screen names).
tweet_list = []
user_list = []
screen_name_list = []
last_id = -1  # ID of last tweet seen

while len(tweet_list) < num_needed:
    try:
        # This is the criteria for collecting the tweets that I want. I want to
        # make sure the results are as accurate as possible when making a final
        # analysis.
        new_tweets = api.search(
            q='NFL',
            count=num_needed,
            max_id=str(last_id - 1),
            lang='en',
            tweet_mode='extended',
        )
    except tweepy.TweepError as e:
        print("Error", e)
        break

    if not new_tweets:
        print("Could not find any more tweets!")
        break

    for tweet in new_tweets:
        # Fetching the screen name and username
        screen_name = tweet.author.screen_name
        user_name = tweet.author.name
        tweet_text = tweet.full_text
        tweet_list.append(tweet_text)
        user_list.append(user_name)
        screen_name_list.append(screen_name)

# Create a new dataframe (df) with new columns
df = pd.DataFrame()
df['Screen name'] = screen_name_list
df['Username'] = user_list
df['Tweets'] = tweet_list

Well, yes, when you use .drop_duplicates(), you only get about 100 tweets because that's how many unique tweets there are. It doesn't matter what technique you use here: with the way your code runs, roughly 900 of the collected tweets are duplicates.
So you might be asking, why? By default, api.search returns only 100 tweets per call, which I am assuming you are aware of, since you are looping and trying to get more by using the max_id parameter. However, last_id is always -1 here: you never read the id of any tweet, so max_id never changes and every call returns the same page of tweets. So one thing you can do is collect the ids while you iterate through the tweets. Then, after each batch, store the minimum id value as last_id, and the next call in your loop will fetch older tweets:
Code:
num_needed = 1000

# Parallel lists that become the DataFrame columns (tweets, usernames, screen names).
tweet_list = []
user_list = []
screen_name_list = []
tw_id = []  #<-- ADDED THIS
last_id = -1  # ID of last tweet seen

while len(tweet_list) < num_needed:
    try:
        # This is the criteria for collecting the tweets that I want. I want to
        # make sure the results are as accurate as possible when making a final
        # analysis.
        new_tweets = api.search(
            q='NFL -filter:retweets',
            count=num_needed,
            max_id=str(last_id - 1),
            lang='en',
            tweet_mode='extended',
        )
    except tweepy.TweepError as e:
        print("Error", e)
        break

    if not new_tweets:
        print("Could not find any more tweets!")
        break

    for tweet in new_tweets:
        # Fetching the screen name and username
        screen_name = tweet.author.screen_name
        user_name = tweet.author.name
        tweet_text = tweet.full_text
        tweet_list.append(tweet_text)
        user_list.append(user_name)
        screen_name_list.append(screen_name)
        tw_id.append(tweet.id)  #<-- ADDED THIS

    # The smallest id collected so far drives max_id on the next request.
    last_id = min(tw_id)  #<-- ADDED THIS

df = pd.DataFrame({
    'Screen name': screen_name_list,
    'Username': user_list,
    'Tweets': tweet_list,
})
df = df.drop_duplicates()
This returns approximately 1000 tweets for me.
Output:
print (len(df))
1084

Related

AWS DynamoDB BOTO3 Confusing Scan

Basically, if I loop over dates, performing a scan with a one-day date range on each iteration, like:
# Scan 'table1' for one day of items, then follow LastEvaluatedKey to collect further pages.
# NOTE(review): indentation was lost in this listing; the bare `return` suggests this is
# the body of a function whose header is not shown.
table_hook = dynamodb_resource.Table('table1')
date_filter = Key('date_column').between('2021-01-01T00:00:00+00:00', '2021-01-01T23:59:59+00:00')
# First page: scanned with the date filter applied.
response = table_hook.scan(FilterExpression=date_filter)
incoming_data = response['Items']
if (response['Count']) == 0:
return
_counter = 1
# Keep requesting pages for as long as the previous response carries a LastEvaluatedKey.
while 'LastEvaluatedKey' in response:
# Follow-up pages are requested with ExclusiveStartKey only.
response = table_hook.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
# Stop when the first item of the page falls outside 2021-01-01 .. 2021-06-07
# (timestamps are compared as naive datetimes, tzinfo stripped).
if (
parser.parse(response['Items'][0]['date_column']).replace(tzinfo=None) < parser.parse('2021-01-01T00:00:00+00:00').replace(tzinfo=None)
or
parser.parse(response['Items'][0]['date_column']).replace(tzinfo=None).replace(tzinfo=None) > parser.parse('2021-06-07T23:59:59+00:00').replace(tzinfo=None)
):
break
incoming_data.extend(response['Items'])
_counter+=1
print("|-> Getting page %s" % _counter)
At the end of the Day1-to-Day2 loop, it retrieves X rows.
But if I perform the same scan in the same way (paginating), with the same range (Day1 to Day2), without looping, it retrieves Y rows.
And, to top it off, when I call table.describe_table(TableName='table1'), the row_count field reports Z rows. I literally don't understand what is going on!
Based on the help of the people above, I found my error: basically, I was not passing the filter again when paginating. The fixed code is:
table_hook = dynamodb_resource.Table('table1')
date_filter = Key('date_column').between(
    '2021-01-01T00:00:00+00:00',
    '2021-01-01T23:59:59+00:00',
)

# First page of the filtered scan.
response = table_hook.scan(FilterExpression=date_filter)
incoming_data = response['Items']
_counter = 1

# Every follow-up page repeats the same filter and resumes from the key
# returned with the previous page.
while 'LastEvaluatedKey' in response:
    response = table_hook.scan(
        FilterExpression=date_filter,
        ExclusiveStartKey=response['LastEvaluatedKey'],
    )
    incoming_data.extend(response['Items'])
    _counter += 1
    print("|-> Getting page %s" % _counter)

Length of timeline() of Twitter API

I am trying to get all tweets from a specific user:
def get_all_tweets(user_id, DEBUG):
# Collect timeline pages for user_id via Twarc2 (or load them from tweets_debug.txt
# when DEBUG is truthy), write the flattened 'data' entries to output.txt and return them.
# NOTE(review): indentation was lost in this listing; statement nesting is as posted.
# Your bearer token here
t = Twarc2(bearer_token="blah")
# Initialize a list to hold every page returned by timeline()
alltweets = []
new_tweets = {}
if DEBUG:
# Debug: read from file
# NOTE(review): this handle is not closed before `f` is rebound further down.
f = open('tweets_debug.txt',)
new_tweets = json.load(f)
alltweets.extend(new_tweets)
else:
# make initial request for most recent tweets (3200 is the maximum allowed count)
new_tweets = t.timeline(user=user_id)
# save most recent tweets
alltweets.extend(new_tweets)
if DEBUG:
# Debug: write to file
f = open("tweets_debug.txt", "w")
f.write(json.dumps(alltweets, indent=2, sort_keys=False))
f.close()
# Save the id of the oldest tweet less one
oldest = str(int(alltweets[-1]['meta']['oldest_id']) - 1)
# Keep grabbing tweets until there are no tweets left to grab
# NOTE(review): timeline() appears to return a generator, and extend() above has
# already iterated new_tweets by the time its length is checked here - confirm.
while len(dict(new_tweets)) > 0:
print(f"getting tweets before {oldest}")
# All subsequent requests use the until_id param to prevent duplicates
new_tweets = t.timeline(user=user_id,until_id=oldest)
# Save most recent tweets
alltweets.extend(new_tweets)
# Update the id of the oldest tweet less one
oldest = str(int(alltweets[-1]['meta']['oldest_id']) - 1)
print(f"...{len(alltweets)} tweets downloaded so far")
# Flatten the per-page 'data' lists into a single list of tweets
res = []
for tweetlist in alltweets:
res.extend(tweetlist['data'])
# Persist the flattened result as indented JSON
f = open("output.txt", "w")
f.write(json.dumps(res, indent=2, sort_keys=False))
f.close()
return res
However, len(dict(new_tweets)) does not work. It always returns 0. sum(1 for dummy in new_tweets) also returns 0.
I tried json.load(new_tweets) and it does not work as well.
However, alltweets.extend(new_tweets) worked properly.
It seems like timeline() returns a generator-type value (<generator object Twarc2._timeline at 0x000001D78B3D8B30>). Is there any way I can count its length to determine whether there are any more tweets un-grabbed?
Or, is there any way to merge...
someList = []
someList.extend(new_tweets)
while len(someList) > 0:
# blah blah
...into one line with while?
Edit: I tried print(list(new_tweets)) right before the while loop, and it returns []. It seems like the object is actually empty.
Is it because alltweets.extend(new_tweets) somehow consumes the new_tweets generator...?
I figured it out myself. This problem can be solved by converting generator to list:
new_tweets = list(t.timeline(user=user_id,until_id=oldest))

How to get usernames with a tweet search query

I am trying to append a column to my data frame of usernames from my search query from twitter tweets. Any ideas on the code?
import tweepy
import pandas as pd
# Authenticate and build the API client.
auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
auth.set_access_token(acc_token, acc_secret)
api = tweepy.API(auth)

num_needed = 1000
tweet_list = []
last_id = -1  # id of last tweet seen

# Page through the search results, moving max_id below the last tweet of each page.
while len(tweet_list) < num_needed:
    try:
        new_tweets = api.search(
            q='Python',
            count=300,
            max_id=str(last_id - 1),
            lang='en',
            tweet_mode='extended',
        )
    except tweepy.TweepError as e:
        print("Error", e)
        break

    if not new_tweets:
        print("Could not find any more tweets!")
        break

    tweet_list.extend(new_tweets)
    last_id = new_tweets[-1].id

# Full text of every collected tweet, used as the single 'Tweets' column.
tweet_texts = [tweet.full_text for tweet in tweet_list]
tweet_texts
df = pd.DataFrame(tweet_texts, columns=['Tweets'])
import tweepy
import pandas as pd
# Authenticate and build the API client.
auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
auth.set_access_token(acc_token, acc_secret)
api = tweepy.API(auth)

num_needed = 10
tweet_list = []
user_list = []
screen_name_list = []
last_id = -1  # id of the oldest tweet seen so far (-1 until the first page arrives)

# Page backwards through the search results until enough tweets are collected.
while len(tweet_list) < num_needed:
    try:
        new_tweets = api.search(
            q='Python',
            count=300,
            max_id=str(last_id - 1),
            lang='en',
            tweet_mode='extended',
        )
    except tweepy.TweepError as e:
        print("Error", e)
        break

    if not new_tweets:
        print("Could not find any more tweets!")
        break

    for tweet in new_tweets:
        # fetching the screen name
        screen_name = tweet.author.screen_name
        user_name = tweet.author.name
        tweet_text = tweet.full_text
        tweet_list.append(tweet_text)
        user_list.append(user_name)
        screen_name_list.append(screen_name)

    # Move the cursor below the oldest tweet of this page. Without this, every
    # iteration requests the same page again and the lists fill with duplicates.
    last_id = min(tweet.id for tweet in new_tweets)

# One row per tweet: text, display name and screen name.
df = pd.DataFrame()
df["tweets"] = tweet_list
df["user_name"] = user_list
df["screen_name"] = screen_name_list
df
Are the usernames contained in tweet.user or something like that? If so, then create a second column like this:
# Add a 'user' column holding tweet.user for every collected tweet.
df['user'] = pd.Series([tweet.user for tweet in tweet_list])

Tweepy get tweets by date

I want to get tweets for a certain topic between dates. Is this possible in tweepy? (or any other API for twitter?)
I can get it working for user_timeline by using the IDs, but when I change it to use api.search the program basically just keeps on running without any output
def getTweets(username):
    """Return the tweets on ``username``'s timeline created between the two dates.

    Relies on the names ``api``, ``startDate`` and ``endDate`` from the
    enclosing scope. Pages are requested with ``max_id`` set to the id of the
    last tweet of the previous page, and paging continues while the last tweet
    of the current page is newer than ``startDate``.
    """
    def in_window(tweet):
        # Strictly between startDate and endDate.
        return tweet.created_at < endDate and tweet.created_at > startDate

    page = api.user_timeline(username, tweet_mode='extended', include_rts=True)
    tweets = [tweet for tweet in page if in_window(tweet)]

    while page[-1].created_at > startDate:
        page = api.user_timeline(username, max_id=page[-1].id, tweet_mode='extended')
        tweets.extend(tweet for tweet in page if in_window(tweet))

    return tweets
tl;dr: Is there a way in python to get tweets between two dates based on keyword search?
What about using a cursor:
text_query = 'Coronavirus'
since_date = '2020-02-10'
until_date = '2020-08-10'
max_tweets = 150

# Creation of query method using parameters
search_cursor = tweepy.Cursor(
    api.search,
    q=text_query,
    since=since_date,
    until=until_date,
)
tweets = search_cursor.items(max_tweets)
Also described in this blog and this answer.

How to get latest tweet id, using python-twitter search API

I'm trying to find a way to NOT get the same tweets using search API.
That's what I'm doing:
1. Make a request to Twitter
2. Store the tweets
3. Make another request to Twitter
4. Store the tweets
5. Compare the results from steps 2 and 4
Ideally, in step 5 I would get 0, meaning that no overlapping tweets were received, so I'm not asking the Twitter server for the same information more than once.
But I think I got stuck in step 3, where I have to make another call. I'm trying to use the 'since_id' argument to get tweets after a certain point, but I'm not sure if the value that I'm using is correct.
Code:
import twitter
# Experiment: run the same hashtag search twice through python-twitter and report
# how many ids, texts and timestamps appear in both runs.
# NOTE(review): indentation was lost in this listing, so statement nesting is as posted;
# the print statements use Python 2 syntax.
class Test():
def __init__(self):
self.t_auth()
self.hashtag = ['justinbieber']
# State for the first run
self.tweets_1 = []
self.ids_1 = []
self.created_at_1 = []
self.tweet_text_1 = []
self.last_id_1 = ''
self.page_1 = 1
# State for the second run
self.tweets_2 = []
self.ids_2 = []
self.created_at_2 = []
self.tweet_text_2 = []
self.last_id_2 = ''
self.page_2 = 1
# First run: 15 GetSearch calls, each asking for 100 results of the next page
for i in range(1,16):
self.tweets_1.extend(self.api.GetSearch(self.hashtag, per_page=100, since_id=self.last_id_1, page=self.page_1))
self.page_1 += 1;
print len(self.tweets_1)
# insert(0, ...) prepends, so each list ends up in reverse iteration order;
# last_id_1 is overwritten with the id of every tweet iterated
for t in self.tweets_1:
self.ids_1.insert(0,t.id)
self.created_at_1.insert(0,t.created_at)
self.tweet_text_1.insert(0,t.text)
self.last_id_1 = t.id
# Second run starts with since_id set to the id recorded by the first run
self.last_id_2 = self.last_id_1
for i in range(1,16):
self.tweets_2.extend(self.api.GetSearch(self.hashtag, per_page=100, since_id=self.last_id_2, page=self.page_2))
self.page_2 += 1;
print len(self.tweets_2)
for t in self.tweets_2:
self.ids_2.insert(0,t.id)
self.created_at_2.insert(0,t.created_at)
self.tweet_text_2.insert(0,t.text)
self.last_id_2 = t.id
print 'Total number of tweets in test 1: ', len(self.tweets_1)
print 'Last id of test 1: ', self.last_id_1
print 'Total number of tweets in test 2: ', len(self.tweets_2)
print 'Last id of test 2: ', self.last_id_2
print '##################################'
print '#############OVERLAPING###########'
# Overlap = values that occur in both runs
ids_overlap = set(self.ids_1).intersection(self.ids_2)
tweets_text_overlap = set(self.tweet_text_1).intersection(self.tweet_text_2)
created_at_overlap = set(self.created_at_1).intersection(self.created_at_2)
print 'Ids: ', len(ids_overlap)
print 'Text: ', len(tweets_text_overlap)
print 'Created_at: ', len(created_at_overlap)
print ids_overlap
print tweets_text_overlap
print created_at_overlap
# Build the twitter.Api client from the placeholder credentials, call
# VerifyCredentials() on it, store it on self.api and return it.
def t_auth(self):
consumer_key="xxx"
consumer_secret="xxx"
access_key = "xxx"
access_secret = "xxx"
self.api = twitter.Api(consumer_key, consumer_secret ,access_key, access_secret)
self.api.VerifyCredentials()
return self.api
# Running the file as a script performs the whole experiment from the constructor.
if __name__ == "__main__":
Test()
In addition to 'since_id', you can use 'max_id'. From the Twitter API documentation:
Iterating in a result set: parameters such count, until, since_id, max_id allow to control how we iterate through search results, since it could be a large set of tweets.
By setting these values dynamically, you can restrict your search results to not overlap. For example, max_id is set at 1100 and since_id is set at 1000, and then you will have tweets with IDs between those two values.

Categories

Resources