How to store only the text of a tweet using Tweepy - Python

I'm watching this series https://www.youtube.com/watch?v=wlnx-7cm4Gg&list=PL5tcWHG-UPH2zBfOz40HSzcGUPAVOOnu1 about mining tweets with Tweepy (Python). The author stores each tweet with everything in it (created_at, id, id_str, text, and so on) and then uses pandas DataFrames to keep only the text. Is this approach efficient? How can I store only the "text" field in the JSON file instead of all the other details?
The code:
ACCESS_TOKEN = "xxxxxxxxxxxxxxxxxxxxx"
ACCESS_TOKEN_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

import tweepy
import numpy as np
import pandas as pd
# import twitter_credentials

class TwitterAuthenticator():
    def authenticate_twitter_app(self):
        auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return auth

class TwitterStreamer():
    """
    Class for streaming and processing live tweets.
    """
    def __init__(self):
        self.twitter_authenticator = TwitterAuthenticator()

    def stream_tweets(self, fetched_tweets_filename, hash_tag):
        # This handles Twitter authentication and the connection to the Twitter Streaming API
        listener = TwitterListener(fetched_tweets_filename)
        auth = self.twitter_authenticator.authenticate_twitter_app()
        # api = tweepy.API(auth)
        stream = tweepy.Stream(auth, listener)
        stream.filter(track=hash_tag)

class TwitterListener(tweepy.StreamListener):
    """
    This is a basic listener class that just prints received tweets to stdout.
    """
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data %s" % str(e))
        return True

    def on_status(self, status):
        print(status)

    def on_error(self, status):
        if status == 420:
            # Returning False in on_data disconnects the stream if rate limiting occurs.
            return False
        print(status)

# public_tweets = api.home_timeline()
# for tweet in public_tweets:
#     print tweet.text

if __name__ == '__main__':
    hash_tag = ["python"]
    fetched_tweets_filename = "tweets.json"
    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename, hash_tag)
    # print stream.text
The tweet stored in the json file:
{"created_at":"Sun Nov 04 18:43:59 +0000 2018","id":1059154305498972160,"id_str":"1059154305498972160","text":"RT #hmason: When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn h\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":14858491,"id_str":"14858491","name":"Alexandra Lemus","screen_name":"nankyoku","location":"M\u00e9xico","url":null,"description":"Transitioning into the Permanent Beta state...","translator_type":"none","protected":false,"verified":false,"followers_count":173,"friends_count":585,"listed_count":18,"favourites_count":658,"statuses_count":572,"created_at":"Wed May 21 16:35:49 +0000 2008","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_link_color":"088253","profile_sidebar_border_color":"D3D2CF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/14858491\/1381524599","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sat Nov 03 17:36:24 +0000 2018","id":1058774912201035776,"id_str":"1058774912201035776","text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yoursel\u2026 https:\/\/t.co\/9F7SmlGfyf","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":765548,"id_str":"765548","name":"Hilary Mason","screen_name":"hmason","location":"NYC","url":"http:\/\/www.hilarymason.com","description":"GM for Machine Learning at #Cloudera. Founder at #FastForwardLabs. Data Scientist in Residence at #accel. 
I \u2665 data and cheeseburgers.","translator_type":"none","protected":false,"verified":true,"followers_count":111311,"friends_count":1539,"listed_count":5276,"favourites_count":12049,"statuses_count":17602,"created_at":"Sun Feb 11 21:22:24 +0000 2007","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"282F8A","profile_sidebar_border_color":"87BC44","profile_sidebar_fill_color":"AB892B","profile_text_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/765548\/1353033581","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn how it works, and then use a library to benefit from robust code.\n\nHere's one article showing this with neural networks in Python: https:\/\/t.co\/3ehO86NFKI","display_text_range":[0,280],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/3ehO86NFKI","expanded_url":"https:\/\/towardsdatascience.com\/how-to-build-your-own-neural-network-from-scratch-in-python-68998a08e4f6","display_url":"towardsdatascience.com\/how-to-build-y\u2026","indices":[257,280]}],"user_mentions":[],"symbols":[]}},"quote_count":14,"reply_count":8,"retweet_count":290,"favorite_count":1019,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/9F7SmlGfyf","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1058774912201035776","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"hmason","name":"Hilary Mason","id":765548,"id_str":"765548","indices":[3,10]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1541357039223"}
If the question is not clear, please leave a comment and I will try to edit the question.

If you want only the "text" field to be saved in the JSON file, you can tweak the definition of the TwitterListener.on_data method:

import json

def on_data(self, data):
    try:
        print(data)
        with open(self.fetched_tweets_filename, 'a') as tf:
            json_load = json.loads(data)
            text = {'text': json_load['text']}
            # One JSON object per line keeps the output file easy to parse later
            tf.write(json.dumps(text) + '\n')
        return True
    except BaseException as e:
        print("Error on_data %s" % str(e))
    return True
Fair warning: I don't have tweepy installed/set up, so I was only able to test a version of the above code using the JSON file you posted above. Let me know if you run into any bugs and I'll see what I can do.
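As a follow-up, with one JSON object per line (as written by the tweaked on_data above), you can load the saved file straight into a pandas DataFrame. A minimal sketch, assuming the file is named tweets.json as in your script:

import pandas as pd

# Each line of tweets.json holds one {"text": ...} object (JSON Lines format)
df = pd.read_json("tweets.json", lines=True)
print(df['text'].head())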

It looks like what you're getting from the API, and storing in your variable data, is Unicode text in JSON format, and you are just writing that text directly to a file. With the API call you're using, you will always receive the full payload, so it isn't that inefficient on the request side. If you just want to get/write the text of the tweet, parse the data with a JSON load and process it from there.
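For example, here is a minimal sketch of that approach. extract_text is a hypothetical helper; the retweeted_status and extended_tweet fallbacks are based on the fields visible in the JSON you posted (truncated tweets keep their full text under extended_tweet.full_text):

import json

def extract_text(raw):
    """Pull the most complete text out of a raw stream payload."""
    tweet = json.loads(raw)
    # Retweets carry the original tweet under 'retweeted_status'
    source = tweet.get('retweeted_status', tweet)
    # Truncated tweets hide their full text under 'extended_tweet'
    if source.get('truncated') and 'extended_tweet' in source:
        return source['extended_tweet']['full_text']
    return tweet.get('text', '')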

Related

How to automatically generate partition keys for messages (Kafka + Python)?

I'm trying to generate keys for every message in Kafka; for that purpose I want to create a key generator that joins the topic's first two characters and the tweet id.
Here is an example of the messages that get sent to Kafka:
{"data":{"created_at":"2022-03-18T09:51:12.000Z","id":"1504757303811231755","text":"#Danielog111 #POTUS #NATO #UNPeacekeeping #UN Yes! Not to minimize Ukraine at all, but to bring attention to a horrific crisis and Tigrayan genocide that targets 7M people, longer time frame, and is largely unacknowledged by western news agencies. And people are being eaten-literally! #maddow #JoyAnnReid help Ethiopians!"},"matching_rules":[{"id":"1502932028618072070","tag":"NATO"},{"id":"1502932021731115013","tag":"Biden"}]}'
And here is my code modified to try generating partition keys (I'm using PyKafka):
from dotenv import load_dotenv
import os
import json
import tweepy
from pykafka import KafkaClient

# Getting credentials:
load_dotenv()  # load the .env file so os.getenv can see BEARER_TOKEN
BEARER_TOKEN = os.getenv("BEARER_TOKEN")

# Setting up pykafka:
def get_kafka_client():
    return KafkaClient(hosts='localhost:9092,localhost:9093,localhost:9094')

def send_message(data, name_topic, id):
    client = get_kafka_client()
    topic = client.topics[name_topic]
    producer = topic.get_sync_producer()
    producer.produce(data, partition_key=f"{name_topic[:2]}{id}")

# Creating a Twitter stream listener:
class Listener(tweepy.StreamingClient):
    def on_data(self, data):
        print(data)
        message = json.loads(data)
        for rule in message['matching_rules']:
            send_message(data, rule['tag'], message['data']['id'].encode())
        return True

    def on_error(self, status):
        print(status)

# Start streaming:
Listener(BEARER_TOKEN).filter(tweet_fields=['created_at'])
And this is the error I'm getting:
File "/Users/mac/.local/share/virtualenvs/tweepy_step-Ck3DvAWI/lib/python3.9/site-packages/pykafka/producer.py", line 372, in produce
raise TypeError("Producer.produce accepts a bytes object as partition_key, "
TypeError: ("Producer.produce accepts a bytes object as partition_key, but it got '%s'", <class 'str'>)
I've also tried not encoding it and trying to fetch the id just using the data (that comes in bytes) but none of these options work.
I found the error: I should have been encoding the assembled partition key, not the JSON id:
def send_message(data, name_topic, id):
    client = get_kafka_client()
    topic = client.topics[name_topic]
    producer = topic.get_sync_producer()
    # Encode the whole assembled key; pykafka expects bytes for partition_key
    producer.produce(data, partition_key=f"{name_topic[:2]}{id}".encode())

# Creating a Twitter stream listener:
class Listener(tweepy.StreamingClient):
    def on_data(self, data):
        print(data)
        message = json.loads(data)
        for rule in message['matching_rules']:
            send_message(data, rule['tag'], message['data']['id'])
        return True

    def on_error(self, status):
        print(status)

Twitter Streaming - Find Top 10 trending topics | PySpark

I am doing a project to find the top 10 trending topics or hashtags on Twitter. I am creating a class with the code below:
class TweetsListener(StreamListener):
    def __init__(self, csocket):
        self.client_socket = csocket

    def on_data(self, data):
        try:
            msg = json.loads(data)
            print(msg['user']['screen_name'].encode('utf-8'))
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        print(status)
        return True
Below is the code for sending data:
def sendData(c_socket):
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    twitter_stream = Stream(auth, TweetsListener(c_socket))
    twitter_stream.filter(track=['india'])
Here twitter_stream.filter is filtering messages with the tag 'india'. I want to get all the messages from Twitter; in short, I do not want a filter to be applied. Is there a way to do that?
Any help appreciated.
P.S.: Novice in Spark Streaming and PySpark.
Twitter now offers a sample stream: https://developer.twitter.com/en/docs/tweets/sample-realtime/overview/GET_statuse_sample.html
It's fairly new, so I'm not sure if the wrappers have implemented it yet (it looks like you're using Tweepy), but it shouldn't be hard to interface with.
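If your version of Tweepy exposes it, the Stream object has a sample() method that connects to that endpoint. A minimal sketch, reusing the auth and TweetsListener from your question:

# Connect to the unfiltered random-sample stream instead of a filtered track
twitter_stream = Stream(auth, TweetsListener(c_socket))
twitter_stream.sample()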

How to stream tweets using tweepy from a start date to end date using python?

I am currently doing research using sentiment analysis on Twitter data regarding a certain topic (the topic isn't really important to this question) in Python, which I am a beginner at. I understand the Twitter streaming API limits users to the previous 7 days unless you apply for full enterprise search, which opens up the whole archive. I have recently been given access to the full archive for this research project, but I am unable to specify a start and end date for the tweets I would like to stream into a CSV file. This is my code:
import pandas as pd
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener

ckey = 'xxxxxxxxxxxxxxxxxxxxxxx'
csecret = 'xxxxxxxxxxxxxxxxxxxxxxx'
atoken = 'xxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxx'
asecret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxx'

# =============================================================================
# def sentimentAnalysis(text):
#     output = '0'
#     return output
# =============================================================================

class listener(StreamListener):
    def on_data(self, data):
        tweet = data.split(',"text":"')[1].split('","source')[0]
        saveMe = tweet + '::' + '\n'
        output = open('output.csv', 'a')
        output.write(saveMe)
        output.close()
        return True

    def on_error(self, status):
        print(status)

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["#weather"], languages=["en"])
Now this code streams Twitter data from the past 7 days perfectly. I tried changing the bottom line to
twitterStream.filter(track=["#weather"], languages=["en"], since=["2016-06-01"])
but this returns the error: filter() got an unexpected keyword argument 'since'.
What would be the correct way to filter by a given date range?
Tweepy does not provide a "since" argument for filter(), as you can check yourself here.
To achieve the desired output, you will have to use api.user_timeline, iterating through pages until the desired date is reached, e.g.:
import tweepy
import datetime
import time

# The consumer keys can be found on your application's Details
# page located at https://dev.twitter.com/apps (under "OAuth settings")
consumer_key = ""
consumer_secret = ""

# The access tokens can be found on your application's Details
# page located at https://dev.twitter.com/apps (located
# under "Your access token")
access_token = ""
access_token_secret = ""

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

page = 1
stop_loop = False
while not stop_loop:
    # 'username' is the screen name whose timeline you are paging through
    tweets = api.user_timeline(username, page=page)
    if not tweets:
        break
    for tweet in tweets:
        # The timeline comes back newest-first, so stop once we pass the cut-off date
        if tweet.created_at < datetime.datetime(YEAR, MONTH, DAY):
            stop_loop = True
            break
        # Do the tweet process here
    page += 1
    time.sleep(500)
Note that you will need to update the code to fit your needs, this is just a general solution.
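For what it's worth, a slightly more idiomatic sketch of the same paging loop uses tweepy.Cursor, which handles the page bookkeeping for you. username and the cut-off date are placeholders you must fill in, and this reaches no further back than user_timeline itself allows:

import datetime
import tweepy

cutoff = datetime.datetime(2016, 6, 1)  # placeholder start date

# Cursor pages through the timeline (newest first) until we pass the cut-off
for tweet in tweepy.Cursor(api.user_timeline, screen_name=username).items():
    if tweet.created_at < cutoff:
        break
    # Do the tweet process here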

Running Time Estimate for Stream Twitter with Location Filter in Tweepy

PROBLEM SOLVED, SEE SOLUTION AT THE END OF THE POST
I need help to estimate running time for my tweepy program calling Twitter Stream API with location filter.
After I kicked it off, it has been running for over 20 minutes, which is longer than I expected. I am new to the Twitter Stream API and have only worked with the REST API for a couple of days. It looks to me like the REST API will give me 50 tweets in a few seconds, easy, but this Stream request is taking a lot more time. My program hasn't died on me or given any error, so I don't know if there's anything wrong with it. If so, please do point it out.
In conclusion, if you think my code is correct, could you provide an estimate for the running time? If you think my code is wrong, could you help me to fix it?
Thank you in advance!
Here's the code:
# Import Tweepy, sys, sleep, credentials.py
import tweepy, sys
from time import sleep
from credentials import *

# Access and authorize our Twitter credentials from credentials.py
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

box = [-86.33, 41.63, -86.20, 41.74]

class CustomStreamListener(tweepy.StreamListener):
    def on_error(self, status_code):
        print >> sys.stderr, 'Encountered error with status code:', status_code
        return True  # Don't kill the stream

    def on_timeout(self):
        print >> sys.stderr, 'Timeout...'
        return True  # Don't kill the stream

stream = tweepy.streaming.Stream(auth, CustomStreamListener()).filter(locations=box).items(50)
stream
I tried the method from http://docs.tweepy.org/en/v3.4.0/auth_tutorial.html#auth-tutorial but apparently it is not working for me. Here is my code below; would you mind giving any input? Let me know if you have some working code. Thanks!
# Import Tweepy, sys, sleep, credentials.py
import tweepy, sys
from time import sleep
from credentials import *

# Access and authorize our Twitter credentials from credentials.py
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Assign coordinates to the variable
box = [-74.0, 40.73, -73.0, 41.73]

# Override tweepy.StreamListener to add logic to on_status
class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print(status.text)

    def on_error(self, status_code):
        if status_code == 420:
            # Returning False in on_data disconnects the stream
            return False

myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth=api.auth, listener=myStreamListener())
myStream.filter(track=['python'], locations=(box), async=True)
Here is the error message:
Traceback (most recent call last):
  File "test.py", line 26, in <module>
    myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener())
TypeError: 'MyStreamListener' object is not callable
PROBLEM SOLVED! SEE SOLUTION BELOW
After another round of debugging, here is the solution for anyone interested in the same topic:
# Import Tweepy, sys, sleep, credentials.py
try:
    import json
except ImportError:
    import simplejson as json
import tweepy, sys
from time import sleep
from credentials import *

# Access and authorize our Twitter credentials from credentials.py
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Assign coordinates to the variable
box = [-74.0, 40.73, -73.0, 41.73]

# Override tweepy.StreamListener to add logic to on_status
class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print(status.text.encode('utf-8'))

    def on_error(self, status_code):
        if status_code == 420:
            # Returning False in on_data disconnects the stream
            return False

myStreamListener = MyStreamListener()
myStream = tweepy.Stream(api.auth, listener=myStreamListener)
myStream.filter(track=['NYC'], locations=(box), async=True)
Core Problem:
I think you're misunderstanding what the Stream is here.
Tl;dr: Your code is working, you're just not doing anything with the data that gets back.
The REST API call is a single request for information: you make a request, and Twitter sends back some data, which gets assigned to your variable. The Stream object (which you've created as stream) from Tweepy opens a connection to Twitter with your search parameters, and Twitter, well, streams tweets to it. Forever.
From the Tweepy docs:
The streaming api is quite different from the REST api because the REST api is used to pull data from twitter but the streaming api pushes messages to a persistent session. This allows the streaming api to download more data in real time than could be done using the REST API.
So, you need to build a handler (a StreamListener, in Tweepy's terminology), like the one sketched below, which prints out the tweets as they arrive.
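A minimal sketch of such a handler, reusing the auth and box you already have:

import tweepy

class PrintingListener(tweepy.StreamListener):
    def on_status(self, status):
        # Called once for every tweet Twitter pushes down the connection
        print(status.text)

    def on_error(self, status_code):
        if status_code == 420:  # rate-limited
            return False  # returning False disconnects the stream

stream = tweepy.Stream(auth, PrintingListener())
stream.filter(locations=box)  # blocks here, receiving tweets until disconnected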
Additional
Word of warning, from bitter experience: if you're going to try to save the tweets to a database, Twitter can, and will, stream objects to you much faster than you can save them to the database. This will result in your Stream being disconnected, because the tweets back up at Twitter, and over a certain level of backed-up-ness (not an actual phrase), they'll just disconnect you.
I handled this by using django-rq to put save jobs into a job queue; this way, I could handle hundreds of tweets a second (at peak), and it would smooth out. Python-rq would also work if you're not using Django as a framework around it. The read_both method below is just a function that reads the tweet and saves it to a Postgres database. In my specific case, I did that via the Django ORM, using the django_rq.enqueue function.
__author__ = 'iamwithnail'

from django.core.management.base import BaseCommand, CommandError
from django.db.utils import DataError
from harvester.tools import read_both
import django_rq

class Command(BaseCommand):
    args = '<search_string search_string>'
    help = "Opens a listener to the Twitter stream, and tracks the given string or list" \
           "of strings, saving them down to the DB as they are received."

    def handle(self, *args, **options):
        try:
            import urllib3.contrib.pyopenssl
            urllib3.contrib.pyopenssl.inject_into_urllib3()
        except ImportError:
            pass

        consumer_key = '***'
        consumer_secret = '****'
        access_token = '****'
        access_token_secret_var = '****'

        import tweepy
        import json

        # This is the listener, responsible for receiving data
        class StdOutListener(tweepy.StreamListener):
            def on_data(self, data):
                decoded = json.loads(data)
                try:
                    if decoded['lang'] == 'en':
                        django_rq.enqueue(read_both, decoded)
                    else:
                        pass
                except KeyError, e:
                    print "Error on Key", e
                except DataError, e:
                    print "DataError", e
                return True

            def on_error(self, status):
                print status

        l = StdOutListener()
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret_var)
        stream = tweepy.Stream(auth, l)
        stream.filter(track=args)
Edit: Your subsequent problem is caused by calling the listener wrongly.
myStreamListener = MyStreamListener() #creates an instance of your class
Where you have this:
myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener())
You're trying to call the listener as a function when you use the (). So it should be:
myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)
And in fact, can probably just be more succinctly written as:
myStream = tweepy.Stream(api.auth,myStreamListener)

Is there a way for me to download all the tweets made by all twitter users in a particular region?

Is there a way for me to download all the tweets made by all Twitter users in a particular region (say the USA) over a particular time period (say a week starting Nov. 15th and ending Nov. 22nd) using Python? This is for an NLP task. Right now I am able to download tweets related to certain topics I search for, but only tweets made while the program is running. I want to be able to get past tweets for a data mining/NLP task, regardless of the topic.
Yes! You can.
Use Tweepy
import tweepy

consumer_key = ''
consumer_secret = ''
access_token_key = ''
access_token_secret = ''

auth1 = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth1.set_access_token(access_token_key, access_token_secret)

class StreamListener(tweepy.StreamListener):
    def on_status(self, tweet):
        print 'Ran on_status'

    def on_error(self, status_code):
        print 'Error: ' + repr(status_code)
        return False

    def on_data(self, data):
        print 'Ok, this is actually running'

l = StreamListener()
streamer = tweepy.Stream(auth=auth1, listener=l)
setTerms = ['twitter']
streamer.filter(track=setTerms)
In stream.filter() you can specify the region. For more detail:
stream.filter(locations=[ "here you can define a region by listing the long/lat bounding box" ], track=terms)
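For example, a rough bounding box for the continental USA (longitude/latitude pairs, south-west corner first; these coordinates are approximate):

# Approximate continental-USA box: SW corner (lon, lat), then NE corner
usa_box = [-125.0, 24.0, -66.0, 50.0]
streamer.filter(locations=usa_box, track=setTerms)

Keep in mind that when you pass both locations and track, the streaming API matches tweets that satisfy either condition (an OR, not an AND), so checking coordinates in the listener, as below, can still be useful.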
If you have a specific region defined, you can check each tweet in the listener:

def on_status(self, status):
    # 'coordinates', when present, is a GeoJSON point: {'type': 'Point', 'coordinates': [lon, lat]}
    if status.coordinates is not None:
        lon, lat = status.coordinates['coordinates']
        # ...check whether (lon, lat) falls inside your region here...
