I want to analyse Twitter data. I have downloaded some tweets and saved them in a .txt file.
When I tried to extract useful information from the tweet data, I could not make any progress, because for a beginner like me it seemed very difficult to extract the tweet text, location, etc.
While googling I found that converting the JSON into a dictionary makes the information easy to extract.
Now I want to convert my JSON data to Python dictionaries, but I don't know how to proceed.
Here is the code I used to save the tweets:
import sys
import tweepy
import json
import jsonpickle

consumer_key = "*********"
consumer_secret = "*******"
access_token = "************"
access_token_secret = "**********"

# AppAuthHandler does application-only auth and takes no user tokens;
# OAuthHandler is the handler that accepts an access token and secret.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Make the Tweepy API calls wait (sleep) automatically when the rate limit
# is hit and continue once the window expires.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if not api:
    print("Can't Authenticate")
    sys.exit(-1)

searchQuery = 'SomeHashtag'
maxTweets = 10000000  # some arbitrarily large number
tweetsPerQry = 100    # the maximum the API allows per request
fName = 'file.txt'
sinceId = None
max_id = -1  # -1 to start from the latest tweet; otherwise the ID of the latest tweet already fetched
tweetCount = 0

print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'a') as f:
    while tweetCount < maxTweets:
        try:
            if max_id <= 0:
                if not sinceId:
                    new_tweets = api.search(q=searchQuery, lang="en", count=tweetsPerQry)
                else:
                    new_tweets = api.search(q=searchQuery, lang="en", count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if not sinceId:
                    new_tweets = api.search(q=searchQuery, lang="en", count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = api.search(q=searchQuery, lang="en", count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            # just exit on any error
            print("some error : " + str(e))
            break

print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))
It seems you can just read your file line by line and decode each line with the jsonpickle.decode method:
tweets = []
with open(filename) as f:
    for line in f:
        tweets.append(jsonpickle.decode(line))
And I think you can bypass the third-party library altogether:
import json

# writing: dump the raw JSON of each status, one object per line
# (tweet._json, as in the saving code above; the Status object itself
# is not JSON-serializable)
with open(filename, 'w') as f:
    for tweet in new_tweets:
        f.write(json.dumps(tweet._json) + '\n')

# reading: each line parses back into a plain dictionary
tweets = []
with open(filename) as f:
    for line in f:
        tweets.append(json.loads(line))
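Once each line parses into a plain dictionary, the fields mentioned in the question are ordinary key lookups. A minimal sketch, assuming the standard v1.1 status payload saved by the code above ('file.txt' as in fName):

import json

tweets = []
with open('file.txt') as f:
    for line in f:
        tweets.append(json.loads(line))

for t in tweets:
    text = t['text']                       # the tweet text
    author = t['user']['screen_name']      # who posted it
    location = t['user']['location']       # free-form profile location, often empty
    print(author, '|', location, '|', text)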
Related
I'm in need of some advice with my Twitter sentiment analysis.
I'm trying to do a fairly common sentiment analysis, though not on random tweets from a Twitter search but on the tweets of selected users.
What I've tried so far: I read in a CSV of the users, then iterate over that list and run the tweet analysis user by user.
I'll put my write_tweets function here, just so it can get some feedback maybe :)
def write_tweets(users_df, file):
    # If the file exists, read the existing data from the CSV file.
    if os.path.exists(file):
        df = pd.read_csv(file, header=0)
    else:
        df = pd.DataFrame(columns=COLS)
    # page attribute in tweepy.Cursor and iteration
    for user in users_df[0]:
        print(user)
        try:
            for status in tweepy.Cursor(api.user_timeline, screen_name=user,
                                        count=1, tweet_mode="extended").items(1):
                new_entry = []
                status = status._json
                # skip tweets older than the start date
                if to_datetime(status['created_at']) < startDate:
                    continue
                # check whether the tweet is in English, or skip to the next tweet
                if status['lang'] != 'en':
                    continue
                # basic preprocessing
                clean_text = clean(status['full_text'])
                # call the clean_tweets method for extra preprocessing
                filtered_tweet = clean_tweets(clean_text)
                # pass to TextBlob for sentiment calculations
                blob = TextBlob(filtered_tweet)
                blob_2 = TextBlob(filtered_tweet, analyzer=NaiveBayesAnalyzer())
                Sentiment = blob.sentiment
                Sentiment_2 = blob_2.sentiment
                # separate polarity and subjectivity into two variables
                polarity = Sentiment.polarity
                subjectivity = Sentiment.subjectivity
                positivity = Sentiment_2.p_pos
                negativity = Sentiment_2.p_neg
                # append the new entry
                new_entry += [status['id'], status['created_at'],
                              status['source'],
                              filtered_tweet, Sentiment, polarity, subjectivity,
                              positivity, negativity, status['lang'],
                              status['favorite_count'], status['retweet_count']]
                # append the original author of the tweet
                new_entry.append(status['user']['screen_name'])
                try:
                    is_sensitive = status['possibly_sensitive']
                except KeyError:
                    is_sensitive = None
                new_entry.append(is_sensitive)
                # hashtags and mentions are saved comma separated
                hashtags = ", ".join([hashtag_item['text'] for hashtag_item in status['entities']['hashtags']])
                new_entry.append(hashtags)
                mentions = ", ".join([mention['screen_name'] for mention in status['entities']['user_mentions']])
                new_entry.append(mentions)
                # get the location of the tweet if possible
                try:
                    location = status['user']['location']
                except TypeError:
                    location = ''
                new_entry.append(location)
                try:
                    coordinates = [coord for loc in status['place']['bounding_box']['coordinates'] for coord in loc]
                except TypeError:
                    coordinates = None
                new_entry.append(coordinates)
                single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
                df = df.append(single_tweet_df, ignore_index=True)
                csvFile = open(file, 'a', encoding='utf-8')
        except Exception as e:  # note: "except Exception, e" is Python 2 syntax
            pass
    df.to_csv(csvFile, mode='a', columns=COLS, index=False, encoding="utf-8")

write_tweets(users_list, test_file)
Output is a few indicators of sentiment, like positivity, negativity, polarity, subjectivity, etc.
My question is: maybe some of you have done this kind of thing already and can give me some recommendations about it? My version seems very slow and not very efficient (to me, at least).
Thanks in advance.
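For what it's worth, one likely bottleneck is that df.append copies the entire DataFrame on every tweet, and the CSV file handle is reopened inside the loop. A hedged sketch of the usual fix, collecting plain rows in a list and building the DataFrame once at the end; write_tweets_fast, the reduced column set, and the users/api/out_file parameters are illustrative, and the question's filtering and sentiment columns would slot into rows.append:

import pandas as pd
import tweepy

def write_tweets_fast(users, api, out_file):
    # Appending to a plain list is O(1); df.append copies the whole frame each call.
    rows = []
    for user in users:
        for status in tweepy.Cursor(api.user_timeline, screen_name=user,
                                    tweet_mode="extended").items(200):
            s = status._json
            if s.get('lang') != 'en':
                continue
            rows.append([s['id'], s['created_at'], s['full_text'],
                         s['user']['screen_name']])
    # Build the DataFrame once and write the CSV once.
    df = pd.DataFrame(rows, columns=['id', 'created_at', 'text', 'user'])
    df.to_csv(out_file, index=False, encoding='utf-8')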
I've got a Python script intended to scrape tweets from Twitter and append them to a CSV file. I'm using the tweepy module, however it is only returning 1 tweet. Is this a problem with my for loop, or with the call to the Twitter API?
for status in tweepy.Cursor(twitterapi.search, q="labour party", since="2018-05-01", until="2018-05-10").items(200):
    if 'RT' not in status.text:
        with open('C:/Users/User/Desktop/twittersentiment.csv', 'wb') as f:
            w = csv.writer(f)
            favourites = status.user.favourites_count
            location = status.user.location.encode('utf8')
            tweet_text = ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", status.text.encode('utf8')).split())
            date = status.created_at.strftime('%m/%d/%Y')
            a = [location]
            b = [favourites]
            c = [tweet_text]
            d = [date]
            zip(a, b, c, d)
            w.writerow(zip(a, b, c, d))
You should open the file before you start iterating the tweepy.Cursor otherwise each iteration of the cursor will create a new file with one entry, overwriting the previous file.
with open('C:/Users/User/Desktop/twittersentiment.csv', 'wb') as f:
    w = csv.writer(f)
    for status in tweepy.Cursor(twitterapi.search, q="labour party", since="2018-05-01", until="2018-05-10").items(200):
        if 'RT' not in status.text:
            favourites = status.user.favourites_count
            location = status.user.location.encode('utf8')
            tweet_text = ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", status.text.encode('utf8')).split())
            date = status.created_at.strftime('%m/%d/%Y')
            # write one row per tweet; no need to wrap each value in a
            # single-element list and zip them
            w.writerow([location, favourites, tweet_text, date])
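One caveat, in case this is run under Python 3: there the csv module expects text mode, so 'wb' would fail. A minimal variant, assuming Python 3:

import csv

# In Python 3, open csv files in text mode with newline='' and let the
# encoding argument handle the bytes; no .encode() calls are needed.
with open('C:/Users/User/Desktop/twittersentiment.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(['location', 'favourites', 'text', 'date'])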
When I run the code, I want the saved file to look like the printed output, but my code stores it differently. How do I fix it? Thank you for your advice.
Run result:
1 : {'text': 'Today is sunday! https//abcd'}
2 : {'text': 'hi!!!\nhi!!!\nhi!!! https//abcd'}
Text file saving result:
Today is sunday! https//abcd
hi!!!
hi!!!
hi!!! https//abcd
import tweepy
import time

search_term = ''
lat = ""
lon = ""
radius = ""
API_key = ""
API_secret = ""
Access_token = ""
Access_token_secret = ""

location = "%s,%s,%s" % (lat, lon, radius)

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

c = tweepy.Cursor(api.search,
                  q="{}".format(search_term),
                  rpp=100,
                  geocode=location,
                  include_entities=False)

wfile = open("test1.txt", mode='w', encoding='utf8')
data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    wfile.write(data['text'] + '\n')
    time.sleep(0.35)
    i += 1
wfile.close()
You need to clarify your question; I guess nobody really knows exactly what you want to achieve. Do you want a JSON-like dump of a dictionary in your text file? Give us an example of how your file's content should look.
My guess is that you just want the string representation of your dict, which you get by calling str(data) (equivalently, data.__str__()).
wfile = open("test1.txt", mode='w', encoding='utf8')
data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    wfile.write(str(data) + '\n')
    time.sleep(0.35)
    i += 1
wfile.close()
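If a parseable JSON object per line is the goal rather than Python's dict repr, json.dumps is the safer choice; a sketch under that assumption, reusing the cursor c and the time import from the question:

import json

with open("test1.txt", mode='w', encoding='utf8') as wfile:
    for i, tweet in enumerate(c.items(), start=1):
        data = {'text': tweet.text}
        print(i, ":", data)
        wfile.write(json.dumps(data) + '\n')  # one JSON object per line
        time.sleep(0.35)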
I have a list of lists
[["Due to the storms this weekend, we have rescheduled the Blumenfield Bike Ride for Feb 26. Hope to see you there.\xe2\x80\xa6 '"], ['Lots of sun this weekend, take advantage of the Beach Bus that gets you from Woodland Hills to the beach for just $\xe2\x80\xa6 '], ["RT #LHansenLA: Yesterday got a peek inside #LAPPL #EagleandBadge new rig for End of Watch Memorial Wall. Moving tribute to fallen #LAPD w/\xe2\x80\xa6'"], ["Happy to join Art Sherman and Wings Over #Wendys to honor veterans & 15 years of weekly meetings hosted by Ron and\xe2\x80\xa6 '"], ["Join me for the 4th Annual Blumenfield Bike Ride. Enjoy the West Valley on 2 wheels. RSVP:'"]]
As you can see, the lists are unfortunately displaying literal UTF-8 byte escapes instead of the characters themselves. At some point in my code, I encode into UTF-8:
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
outtweets = [[stuff.replace("b\'", "")] for sublist in outtweets for stuff in sublist]
outtweets = [[stuff.replace('b\"', "")] for sublist in outtweets for stuff in sublist]
The above code is all necessary to remove the b prefixes. These cannot be in my tweets because I am doing machine-learning analysis, and having the b's in there affects it.
My Question
How do I replace the UTF-8 escape sequences with the actual characters?
I need to handle the encoding somehow, because I am pulling tweets from (3 cities) x (50 officials) x (12 months of tweets each), so it would be impossibly inefficient to replace them manually.
Code
import tweepy  # https://github.com/tweepy/tweepy
import csv
from datetime import datetime
import os.path

# Twitter API credentials
consumer_key = "insert key here"
consumer_secret = "insert key here"
access_key = "insert key here"
access_secret = "insert key here"

# authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

failed_accounts = []
# save_location is assumed to be defined elsewhere (it was in the original notebook)

def get_all_tweets(screen_name, mode):
    # Twitter only allows access to a user's most recent 3240 tweets with this method
    # initialize a list to hold all the tweepy Tweets
    alltweets = []
    # make the initial request for the most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    # save the most recent tweets
    alltweets.extend(new_tweets)
    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    num_req = 0
    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)
        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        print("...%s tweets downloaded so far" % (len(alltweets)))
        num_req = num_req + 1
        # make further requests only if the batch doesn't contain tweets beyond the oldest limit
        oldest_limit = datetime(2016, 1, 20, 0, 0, 0)
        x = 0
        for tweet in new_tweets:
            if tweet.created_at < oldest_limit:
                x = 1
        if x == 1:
            break
    print("Number of Tweet Request Rounds: %s" % num_req)
    # keep only the tweets between the two date limits
    newest_limit = datetime(2017, 1, 20, 0, 0, 0)
    oldest_limit = datetime(2016, 1, 20, 0, 0, 0)
    correct_date_tweet = []
    for tweet in alltweets:
        if oldest_limit < tweet.created_at < newest_limit:
            correct_date_tweet.append(tweet)
    # transform the tweepy tweets into a 2D array that will populate the csv
    # (the original condition `mode == "tweets only" or "instance file"` was
    # always true; a membership test is what was meant)
    if mode in ("tweets only", "instance file"):
        outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
        outtweets = [[stuff.replace("b\'", "")] for sublist in outtweets for stuff in sublist]
        outtweets = [[stuff.replace('b\"', "")] for sublist in outtweets for stuff in sublist]
        outtweets = [["1 ", stuff.replace('"', "")] for sublist in outtweets for stuff in sublist]
    else:
        outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"),
                      tweet.retweet_count, tweet.favorite_count,
                      len(tweet.entities.get("hashtags")), len(tweet.entities.get("urls")),
                      len(tweet.entities.get("user_mentions"))] for tweet in correct_date_tweet]
    # write the csv
    if mode == "instance file":
        with open(os.path.join(save_location, '%s.instance' % screen_name), mode='w') as f:
            writer = csv.writer(f)
            writer.writerows(outtweets)
    else:
        with open(os.path.join(save_location, '%s.csv' % screen_name), 'w', encoding='utf-8') as f:
            writer = csv.writer(f)
            if mode != "tweets only":
                writer.writerow(["id", "created_at", "text", "retweets", "favorites",
                                 "hashtags", "urls", "mentions"])
            writer.writerows(outtweets)
    print("Done with %s" % screen_name)

get_all_tweets("BobBlumenfield", "instance file")
Update
Based on an answer, I tried changing one of the lines to outtweets = [[tweet.text] for tweet in correct_date_tweet], but this didn't work because it yields:
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-12-a864b5efe8af> in <module>()
----> 1 get_all_tweets("BobBlumenfield","instance file")
<ipython-input-9-d0b9b37c7261> in get_all_tweets(screen_name, mode)
104 with open(os.path.join(save_location,'%s.instance' % screen_name), mode ='w') as f:
105 writer = csv.writer(f)
--> 106 writer.writerows(outtweets)
107 else:
108 with open(os.path.join(save_location,'%s.csv' % screen_name), 'w',encoding='utf-8') as f:
C:\Users\Stan Shunpike\Anaconda3\lib\encodings\cp1252.py in encode(self, input, final)
17 class IncrementalEncoder(codecs.IncrementalEncoder):
18 def encode(self, input, final=False):
---> 19 return codecs.charmap_encode(input,self.errors,encoding_table)[0]
20
21 class IncrementalDecoder(codecs.IncrementalDecoder):
UnicodeEncodeError: 'charmap' codec can't encode characters in position 64-65: character maps to <undefined>
Remove the following line:
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
Here's why:
You're encoding to a byte string. Hence the b.
You're using str without an encoding defined. In that mode you get the repr of the bytes object, type prefix included, again hence the b and the UTF-8 escaping.
There's no need to encode in the middle of your code. Only encode when writing to a file or the network (not when printing). You rarely have to call .encode() yourself if you use open()'s built-in encoder.
When using open() in text mode, as you are doing, always specify the encoding, since the default differs per platform.
Remove all other uses of .encode() from your code.
You can now remove the other lines that were trying to correct this error.
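Putting that together, a minimal sketch of the corrected instance-file branch, using the names from the question's code:

# keep tweet text as str; no .encode() anywhere in the pipeline
outtweets = [[tweet.text] for tweet in correct_date_tweet]

# specify the encoding explicitly so the platform default (cp1252 in the
# traceback above) is not used
with open(os.path.join(save_location, '%s.instance' % screen_name), 'w',
          encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(outtweets)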
I'm using Tweepy to collect tweets from the Twitter API by their tweet ID.
I'm trying to read in a file full of the IDs, get the previous tweet from the conversation stream, then store that tweet and its author's screen name etc. in a text file. Some of the tweets have been deleted or the user's profile has been set to private, in which case I want to ignore that tweet and move on to the next. However, for some reason, I'm not collecting all the accessible tweets. It's storing maybe 3/4 of the tweets that aren't private and haven't been deleted. Any ideas why it's not catching everything?
Thanks in advance.
from time import sleep

# api is assumed to be an authenticated tweepy.API instance,
# set up as in the earlier snippets
def getTweet(tweetID, tweetObj, callTweetObj, i):
    tweet = callTweetObj.text.encode("utf8")
    callUserName = callTweetObj.user.screen_name
    callTweetID = tweetObj.in_reply_to_status_id_str
    with open("call_tweets.txt", "a") as calltweets:
        output = (callTweetObj.text.encode('utf-8') + "\t" + callTweetID + "\t" + tweetID)
        calltweets.write(output)
        print output
    with open("callauthors.txt", "a") as callauthors:
        cauthors = (callUserName + "\t" + "\t" + callTweetID + "\n")
        callauthors.write(cauthors)
    with open("callIDs.txt", "a") as callIDs:
        callIDs.write(callTweetID + "\n")
    with open("newResponseIDs.txt", "a") as responseIDs:
        responseIDs.write(tweetID)

count = 0
file = "Response_IDs.txt"
with open(file, 'r+') as f:
    lines = f.readlines()
    for i in range(0, len(lines)):
        tweetID = lines[i]
        sleep(5)
        try:
            tweetObj = api.get_status(tweetID)
            callTweetID = tweetObj.in_reply_to_status_id_str
            callTweetObj = api.get_status(callTweetID)
            getTweet(tweetID, tweetObj, callTweetObj, i)
            count = count + 1
            print count
        except:
            # note: this bare except silently swallows every failure,
            # including rate-limit errors (see the answer below)
            pass
You haven't specified any information about the response coming back from api.get_status, so it's hard to tell what the error is.
However, it might be that you have reached the rate limit for the statuses/show/:id request. The API specifies this request is limited to 180 calls per window.
You can use Tweepy to call application/rate_limit_status:
response = api.rate_limit_status()
remaining = response['resources']['statuses']['/statuses/show/:id']['remaining']
assert remaining > 0
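Building on that, a hedged sketch of a collection loop that treats rate limiting and missing tweets differently instead of using a bare except; tweepy.RateLimitError is available in tweepy 3.x (older versions raise TweepError with a 429 status), and constructing the API with wait_on_rate_limit=True is an even simpler alternative:

import time
import tweepy

# lines, api and getTweet as in the question's code
for line in lines:
    tweet_id = line.strip()
    while True:
        try:
            tweet_obj = api.get_status(tweet_id)
            call_obj = api.get_status(tweet_obj.in_reply_to_status_id_str)
            getTweet(tweet_id, tweet_obj, call_obj, 0)
            break
        except tweepy.RateLimitError:
            # out of calls for this window: sleep until it resets, then retry this ID
            time.sleep(15 * 60)
        except tweepy.TweepError as e:
            # deleted or protected tweets land here; log them instead of silently dropping
            print("skipped %s: %s" % (tweet_id, e))
            break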