I have been working on this for hours and need some help. It mostly works: I am able to connect to Twitter, pull the JSON data, and store it in MongoDB. However, not all the data that I see from my print(tweet) line is showing up in MongoDB. Specifically, I don't see the screen_name (or name, for that matter) field. I really just need these fields: "id", "text", "created_at", "screen_name", "retweet_count", "favourites_count", "lang", and I get them all except the name. I am not sure why it is not being inserted into the DB with all the other fields. Any help would be greatly appreciated!
from twython import Twython
from pymongo import MongoClient
ConsumerKey = "XXXXX"
ConsumerSecret = "XXXXX"
AccessToken = "XXXXX-XXXXX"
AccessTokenSecret = "XXXXX"
twitter = Twython(ConsumerKey,
                  ConsumerSecret,
                  AccessToken,
                  AccessTokenSecret)
result = twitter.search(q="drexel", count='100')
result1 = result['statuses']
for tweet in result1:
    print(tweet)  # prints tweets so I know I got data
client = MongoClient('mongodb://localhost:27017/')
db = client.twitterdb
tweet_collection = db.twitter_search
#Fields I need ["id", "text", "created_at", "screen_name", "retweet_count", "favourites_count", "lang"]
for tweet in result1:
    try:
        tweet_collection.insert_one(tweet)  # insert() is deprecated in PyMongo 3+; insert_one() is the modern call
    except:
        pass
print("The number of tweets in English: ")
print(tweet_collection.count_documents({"lang": "en"}))  # count() does not take field filters as keywords
You can do it the following way:
def get_document(post):
    return {
        'id': post['id_str'],
        'text': post['text'],
        'created_at': post['created_at'],
        'retweet_count': post['retweet_count'],
        'favourites_count': post['user']['favourites_count'],
        'lang': post['lang'],
        'screen_name': post['user']['screen_name']
    }
for tweet in result1:
    try:
        tweet_collection.insert_one(
            get_document(tweet)
        )
    except:
        pass
It should work.
The "screen_name" field is a subset of the "user" part of the tweet metadata. Make sure you're drilling down far enough.
I'm trying to tweet my movie reviews from Letterboxd using Python's Tweepy, but I can't format the tweet. I tried printing the tweet without the square brackets, and printing the title and the link to the full review on different lines, but couldn't achieve either. Is it possible to do either thing?
Here's the code I'm using:
import tweepy
import xxxx_keys
import feedparser
feed = feedparser.parse("https://example.rss")
def api():
    auth = tweepy.OAuth1UserHandler(xxxx_keys.api_key, xxxx_keys.api_key_secret)
    auth.set_access_token(xxxx_keys.access_token, xxxx_keys.access_token_secret)
    return tweepy.API(auth)
review = {
    "title": feed.entries[0].title,
    "summary": feed.entries[0].summary,
    "link": feed.entries[0].link,
    "description": feed.entries[0].description,
}
def tweet(api: tweepy.API, message: str, image_path=None):
    if image_path:
        api.update_status_with_media(message, image_path)
    else:
        api.update_status(message)
    print("Tweet sent successfully")
if __name__ == "__main__":
    api = api()
    tweet(api, [review["title"], "Full review on: ", review["link"]])
Actually managed to do it quite easily. I just created:
twitter_review = review["title"] + "\n" + "Full review on: " + review["link"]
And then used:
tweet(api, twitter_review)
Then it worked!
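For what it's worth, an f-string does the same join in a single expression:
twitter_review = f'{review["title"]}\nFull review on: {review["link"]}'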
I am using the Twitter Academic Research V2 API and want to get tweets from a list of users and store them in a dataframe.
My code works for one single user, but not for a list of users. See the code here:
import tweepy
from twitter_authentication import bearer_token
import time
import pandas as pd
client = tweepy.Client(bearer_token, wait_on_rate_limit=True)
# list of twitter users
csu = ["Markus_Soeder", "DoroBaer", "andreasscheuer"]
csu_tweets = []
start = time.time()
for politician in csu:
    for response in tweepy.Paginator(client.search_all_tweets,
                                     query=f'from:{politician} -is:retweet lang:de',
                                     user_fields=['username', 'public_metrics', 'description', 'location'],
                                     tweet_fields=['created_at', 'geo', 'public_metrics', 'text'],
                                     expansions='author_id',
                                     start_time='2022-12-01T00:00:00Z',
                                     end_time='2022-12-03T00:00:00Z'):
        time.sleep(1)
        csu_tweets.append(response)
end = time.time()
print(f"Scraping of {csu} needed {(end - start)/60} minutes.")
result = []
user_dict = {}
# Loop through each response object
for response in csu_tweets:
    # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
    for user in response.includes['users']:
        user_dict[user.id] = {'username': user.username,
                              'followers': user.public_metrics['followers_count'],
                              'tweets': user.public_metrics['tweet_count'],
                              'description': user.description,
                              'location': user.location
                              }
    for tweet in response.data:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        # Put all of the information we want to keep in a single dictionary for each tweet
        result.append({'author_id': tweet.author_id,
                       'username': author_info['username'],
                       'author_followers': author_info['followers'],
                       'author_tweets': author_info['tweets'],
                       'author_description': author_info['description'],
                       'author_location': author_info['location'],
                       'text': tweet.text,
                       'created_at': tweet.created_at,
                       'quote_count': tweet.public_metrics['quote_count'],
                       'retweets': tweet.public_metrics['retweet_count'],
                       'replies': tweet.public_metrics['reply_count'],
                       'likes': tweet.public_metrics['like_count'],
                       })
# Change this list of dictionaries into a dataframe
df = pd.DataFrame(result)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_25716/2249018491.py in <module>
4 for response in csu_tweets:
5 # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
----> 6 for user in response.includes['users']:
7 user_dict[user.id] = {'username': user.username,
8 'followers': user.public_metrics['followers_count'],
KeyError: 'users'
So I get this KeyError: 'users'. I don't get the error if I just scrape tweets from a single user, replacing csu = ["Markus_Soeder", "DoroBaer", "andreasscheuer"] with csu = "Markus_Soeder".
Does anyone know what the issue could be?
Thanks in advance!
I found the answer to this issue. It gave me the KeyError because for some users there were no tweets in this time range, so the corresponding response had no data (it got stored as None) and its includes had no 'users' key. Because of this the for loop failed.
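A minimal guard along those lines (a sketch, assuming the response layout from the code above) is to skip the empty pages before touching includes['users']:
for response in csu_tweets:
    # Skip pages where a user had no tweets in the window:
    # response.data is None and includes has no 'users' key
    if not response.data or 'users' not in response.includes:
        continue
    # ... process response.includes['users'] and response.data as before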
I am trying to use the Tweepy Python package to get the actual replies to a tweet. I'll break down the process I have worked on so far.
First, I import my modules and configure the authentication variables with Tweepy:
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)
I then search for the hashtag I want, setting the parameters of the Tweepy Paginator, and append the results to an empty list with a for loop:
covid_tweets = []
for mytweets in tweepy.Paginator(client.search_all_tweets, query='#COVID lang:en',
                                 user_fields=['username', 'public_metrics', 'description', 'location'],
                                 tweet_fields=['created_at', 'geo', 'public_metrics', 'text'],
                                 expansions='author_id',
                                 start_time='2019-12-30T00:00:00Z', end_time='2020-01-15T00:00:00Z',
                                 max_results=10):
    time.sleep(2)
    covid_tweets.append(mytweets)
Then I convert this list into a DataFrame by extracting certain key fields (a user dictionary, user_object):
#Convert Covid-19 tweets to a DF
result = []
user_dict = {}
# Loop through each response object
for response in covid_tweets:
    for user in response.includes['users']:
        user_dict[user.id] = {'username': user.username,
                              'followers': user.public_metrics['followers_count'],
                              'tweets': user.public_metrics['tweet_count'],
                              'description': user.description,
                              'location': user.location
                              }
    for tweet in response.data:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        # check for condition: skip retweets
        if 'RT @' not in tweet.text:
            # Put all information we want to keep in a single dictionary for each tweet
            result.append({'author_id': tweet.author_id,
                           'tweet_id': tweet.id,
                           'username': author_info['username'],
                           'author_followers': author_info['followers'],
                           'author_tweets': author_info['tweets'],
                           'author_description': author_info['description'],
                           'author_location': author_info['location'],
                           'text': tweet.text,
                           'created_at': tweet.created_at,
                           'retweets': tweet.public_metrics['retweet_count'],
                           'replies': tweet.public_metrics['reply_count'],
                           'likes': tweet.public_metrics['like_count'],
                           'quote_count': tweet.public_metrics['quote_count']
                           })
# Change this list of dictionaries into a dataframe
df_1 = pd.DataFrame(result)
Now, from the DataFrame, I can see tweets and a non-zero reply_count for some of them (a screenshot of the DataFrame confirmed this).
I then checked how I can get the replies to those tweets. After some research, I wanted to follow this function's flow:
def get_all_replies(tweet, api, fout, depth=10, Verbose=False):
    global rep
    if depth < 1:
        if Verbose:
            print('Max depth reached')
        return
    user = tweet.user.screen_name
    tweet_id = tweet.id
    search_query = '@' + user
    # filter out retweets
    retweet_filter = '-filter:retweets'
    query = search_query + ' ' + retweet_filter
    try:
        myCursor = tweepy.Cursor(api.search_tweets, q=query,
                                 since_id=tweet_id,
                                 max_id=None,
                                 tweet_mode='extended').items()
        rep = [reply for reply in myCursor if reply.in_reply_to_status_id == tweet_id]
    except tweepy.TweepyException as e:
        sys.stderr.write(('Error get_all_replies: {}\n').format(e))
        time.sleep(60)
    if len(rep) != 0:
        if Verbose:
            if hasattr(tweet, 'full_text'):
                print('Saving replies to: %s' % tweet.full_text)
            elif hasattr(tweet, 'text'):
                print('Saving replies to: %s' % tweet.text)
            print("Output path: %s" % fout)
        # save to file
        with open(fout, 'a+') as f:
            for reply in rep:
                data_to_file = json.dumps(reply._json)
                f.write(data_to_file + '\n')
                # recursive call
                get_all_replies(reply, api, fout, depth=depth - 1, Verbose=False)
    return
So basically, with this function I loop through the DataFrame and pick the "tweet_id" and "screen_name" for each tweet, then build a search query. But the "rep" list comes back empty, and debugging line by line showed that in_reply_to_status_id differs from the tweet_id; that is the cause of the empty list, even though the DataFrame shows a non-zero reply count.
I know this is long, but I really wanted to show what I have done so far and explain each step. Thank you!
NB: I have access to the Academic Research API.
OK, so for everyone trying to clear this hurdle, I finally found a way to get the tweet replies. In my use case, I have the Academic Research API from Twitter.
The code provided by geduldig on his GitHub finally solved my issue, with a few tweaks. A heads-up: with the TwitterAPI package, if you omit the "start_time" or "end_time" parameter, you might get only the parent tweet, so structure it like this:
pager = TwitterPager(api, 'tweets/search/all',
                     {
                         'query': f'conversation_id:{CONVERSATION_ID}',
                         'start_time': '2019-12-30T00:00:00Z',
                         'end_time': '2021-12-31T00:00:00Z',
                         'expansions': 'author_id',
                         'tweet.fields': 'author_id,conversation_id,created_at,referenced_tweets'
                     },
                     hydrate_type=HydrateType.REPLACE)
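A usage sketch for walking the pager (assuming TwitterAPI's TwitterPager.get_iterator(); CONVERSATION_ID above is a placeholder for the parent tweet's ID):
from TwitterAPI import TwitterAPI, TwitterPager, HydrateType
# iterate the replies in the conversation, pausing between requests
for item in pager.get_iterator(wait=2):
    print(item['created_at'], item['text'])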
I hope this helps the community. Thanks.
I need to send a message to Slack with the name of the MongoDB collection whenever the query collection.find() finds a task_status of FAILURE in the DB. Below is what I've got so far, and it works, but I need the message to include the name of the collection, as this code is supposed to be used on a DB with a lot of collections. Can someone help me?
import pymongo
import requests
import json
from pymongo import MongoClient
def slackmessage():
    webhook_url = 'https://hooks.slack.com/*****'
    slack_data = {
        'text': "There is a problem in your mongodb collection, task is in FAILURE.",
        'username': 'MongodbAlert',
        'icon_emoji': ':fire:'
    }
    response = requests.post(webhook_url, data=json.dumps(slack_data), headers={'Content-Type': 'application/json'})
def mongodbfind():
    cluster = MongoClient("mongodb+srv://*****")
    db = cluster["nameofthemongodb"]
    collection = db["nameofthecollection"]
    mysearch = collection.find({"task_status": "FAILURE"})
    for x in mysearch:
        print(str(x))
        return True
if mongodbfind() == True:
    slackmessage()
Pass the collection name into the slackmessage() function, and concatenate it with the text in slack_data:
def slackmessage(collectionName):
    webhook_url = 'https://hooks.slack.com/*****'
    slack_data = {
        'text': "There is a problem in your mongodb collection - " + collectionName + ", task is in FAILURE.",
        'username': 'MongodbAlert',
        'icon_emoji': ':fire:'
    }
    response = requests.post(webhook_url, data=json.dumps(slack_data), headers={'Content-Type': 'application/json'})
# call the function with the collection name as argument
slackmessage(collectionName)
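Since the code is meant to run against a DB with many collections, here is a minimal sketch of checking every collection (assuming PyMongo's list_collection_names(); the connection string and DB name are placeholders as in your code):
def mongodbfind():
    cluster = MongoClient("mongodb+srv://*****")
    db = cluster["nameofthemongodb"]
    # check every collection in the database for failed tasks
    for name in db.list_collection_names():
        if db[name].find_one({"task_status": "FAILURE"}) is not None:
            slackmessage(name)  # report the failing collection by name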
I am trying to extract all tweets which contain a specific keyword, along with their geo locations.
For example, I want to download all the tweets in English which contain the keyword 'iphone' from 'France' and 'Singapore'.
My code:
import tweepy
import csv
import pandas as pd
import sys
# API credentials here
consumer_key = 'INSERT CONSUMER KEY HERE'
consumer_secret = 'INSERT CONSUMER SECRET HERE'
access_token = 'INSERT ACCESS TOKEN HERE'
access_token_secret = 'INSERT ACCESS TOKEN SECRET HERE'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
# Search word/hashtag value
HashValue = ""
# search start date value. the search will start from this date to the current date.
StartDate = ""
# getting the search word/hashtag and date range from user
HashValue = input("Enter the hashtag you want the tweets to be downloaded for: ")
StartDate = input("Enter the start date in this format yyyy-mm-dd: ")
# Open/Create a file to append data
csvFile = open(HashValue+'.csv', 'a')
#Use csv Writer
csvWriter = csv.writer(csvFile)
for tweet in tweepy.Cursor(api.search, q=HashValue, count=20, lang="en", since=StartDate, tweet_mode='extended').items():
    print(tweet.created_at, tweet.full_text)
    csvWriter.writerow([tweet.created_at, tweet.full_text.encode('utf-8')])
print ("Scraping finished and saved to "+HashValue+".csv")
#sys.exit()
How can this be done?
- Rahul
As I understand it, you are looking to get geo data off searched tweets rather than filter the search based on geocode.
Here is a code sample with the relevant fields you are interested in. These may or may not be provided, depending on the tweeter's privacy settings.
Note there is no "since" parameter on the search API:
https://tweepy.readthedocs.io/en/latest/api.html#help-methods
https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
Standard twitter api search goes back 7 days. The premium and enterprise APIs have 30 day search as well as Full Archive search, but you will pay $$$.
Unfortunately, Tweepy still hasn't documented its models:
https://github.com/tweepy/tweepy/issues/720
So if you want to look at the tweet object, you can use the pprint package and run:
from pprint import pprint
pprint(tweet.__dict__)
One difference I noticed was that the "text" field in the JSON becomes "full_text" in the object.
There's also information on the original tweet in there if the one you found was a quote tweet; it has the same info from what I could see.
Anyway, here's the code. I added a max tweet count for looping through the cursor while I was testing, to avoid blowing any API limits.
Let me know if you want the CSV code, but it looks like you can handle that already.
import tweepy
# API credentials here
consumer_key = 'your-info'
consumer_secret = 'your-info'
access_token = 'your-info'
access_token_secret = 'your-info'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
searchString = "iPhone"
cursor = tweepy.Cursor(api.search, q=searchString, count=20, lang="en", tweet_mode='extended')
maxCount = 1
count = 0
for tweet in cursor.items():
    print()
    print("Tweet Information")
    print("================================")
    print("Text: ", tweet.full_text)
    print("Geo: ", tweet.geo)
    print("Coordinates: ", tweet.coordinates)
    print("Place: ", tweet.place)
    print()
    print("User Information")
    print("================================")
    print("Location: ", tweet.user.location)
    print("Geo Enabled? ", tweet.user.geo_enabled)
    count = count + 1
    if count == maxCount:
        break
Will output something like this:
Tweet Information
================================
Text: NowPlaying : Hashfinger - Leaving
https://derp.com
#iPhone free app https://derp.com
#peripouwebradio
Geo: None
Coordinates: None
Place: None
User Information
================================
Location: Greece
Geo Enabled? True
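If you do want to filter the search by location instead of just reading the geo fields, the standard search accepts a geocode parameter of the form "lat,long,radius". A sketch under that assumption (the coordinates below are rough centres for France and Singapore, chosen purely for illustration):
# one search per country, since geocode describes a single circle
for name, geo in [("france", "46.2276,2.2137,500km"),
                  ("singapore", "1.3521,103.8198,50km")]:
    for tweet in tweepy.Cursor(api.search, q="iphone", lang="en",
                               geocode=geo, tweet_mode='extended').items(20):
        print(name, tweet.created_at, tweet.full_text)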