I'm trying to scrape Twitter data for my thesis, but in the code below the dataframe isn't working: nothing shows up at the output line. How can I modify this code so that the dataframe is built? A second problem: I want to filter the scraped tweets by location. How can I do that?
import tweepy
import re
import pandas as pd
import itertools
import collections
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from textblob import TextBlob
import os
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
latitude = 56.130367 # geographical centre of search
longitude = -106.346771 # geographical centre of search
max_range = 1
tweets = tweepy.Cursor(api.search,
                       q="Shopify" + " -filter:retweets",
                       #geocode="%f,%f,%dkm" % (latitude, longitude, max_range),
                       lang="en",
                       since="2020-01-01").items(10)
for tweet in tweets:
    print(tweet.text)
    analysis = TextBlob(tweet.text)
    print('Date=', tweet.created_at, 'Location=', tweet.user.location)
    print(analysis.sentiment)
    if analysis.sentiment[0] > 0:
        print('Positive')
    elif analysis.sentiment[0] < 0:
        print('Negative')
    else:
        print('Neutral')
    print('====================================================================')
    print()
user_data = [[tweet.created_at, remove_characters(tweet.user.name), tweet.user.location,
              remove_characters(tweet.text), TextBlob(tweet.text).sentiment[0],
              'Positive' if TextBlob(tweet.text).sentiment[0] > 0
              else 'Negative' if TextBlob(tweet.text).sentiment[0] < 0
              else 'Neutral']
             for tweet in tweets]
tweet_df = pd.DataFrame(data=user_data,
                        columns=['Created At', 'User', 'Location', 'Text', 'Sentiment', 'Polarity', 'favorite_count'])
tweet_df.head(10)
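Two things are likely at play here; a hedged sketch of a fix follows. First, .items(10) returns a one-shot iterator: the first for loop consumes it, so the list comprehension afterwards sees no tweets and the dataframe comes out empty (in a plain script a bare tweet_df.head(10) also prints nothing; wrap it in print()). Second, for the location filter, uncommenting the geocode argument restricts results to a radius around a point. The sketch below builds the rows in a single pass, keeps the row length in step with the seven column names, assumes the api, latitude, longitude and max_range defined above, and drops the undefined remove_characters helper:

rows = []  # collect one row per tweet during a single pass over the cursor
tweets = tweepy.Cursor(api.search,
                       q="Shopify -filter:retweets",
                       # "lat,long,radius" restricts results to a circle, e.g. "56.13,-106.35,1km"
                       geocode="%f,%f,%dkm" % (latitude, longitude, max_range),
                       lang="en",
                       since="2020-01-01").items(10)
for tweet in tweets:
    polarity = TextBlob(tweet.text).sentiment.polarity
    label = 'Positive' if polarity > 0 else 'Negative' if polarity < 0 else 'Neutral'
    rows.append([tweet.created_at, tweet.user.name, tweet.user.location,
                 tweet.text, polarity, label, tweet.favorite_count])

# the column list must match the row length, otherwise pandas raises a ValueError
tweet_df = pd.DataFrame(rows, columns=['Created At', 'User', 'Location', 'Text',
                                       'Sentiment', 'Polarity', 'favorite_count'])
print(tweet_df.head(10))

Note that geocode filters on the tweet's own geo information; tweet.user.location is just free-form profile text, so the two won't always agree.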
I am scraping tweet data from Twitter. Since Twitter limits requests, I scrape 2500 tweets every 15 minutes; however, each run after 15 minutes returns the same tweets. Is there any way to skip the previously scraped tweet data using some kind of offset?
Thank You!
Here is my code:
# Import libraries
from tweepy import OAuthHandler
#from tweepy.streaming import StreamListener
import tweepy
import csv
import pandas as pd
#import re
#from textblob import TextBlob
#import string
#import preprocessor as p
#import os
import time

# Twitter credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# Pass your twitter credentials to tweepy via its OAuthHandler
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

def extract_tweets(search_words, date_since, numTweets):
    return tweepy.Cursor(api.search, q=search_words, lang="en",
                         since=date_since, tweet_mode='extended').items(numTweets)

def scrapetweets(search_words, date_since, numTweets, numRuns):
    # Define a pandas dataframe to store the data:
    db_tweets = pd.DataFrame(columns=['username', 'acctdesc', 'location', 'following',
                                      'followers', 'totaltweets', 'usercreatedts',
                                      'tweetcreatedts', 'retweetcount', 'text', 'hashtags'])
    #db_tweets = pd.DataFrame()
    for i in range(numRuns):
        tweets = extract_tweets(search_words, date_since, numTweets)
        # Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
        print(len(tweet_list))
        noTweets = 0
        for tweet in tweet_list:
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            lst = []
            for h in hashtags:
                lst.append(h['text'])
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a Retweet
                text = tweet.full_text
            itweet = [username, acctdesc, location, following, followers, totaltweets,
                      usercreatedts, tweetcreatedts, retweetcount, text, lst]
            db_tweets.loc[len(db_tweets)] = itweet
            noTweets += 1
            print(noTweets, itweet)
            #filename = "tweets.csv"
            #with open(filename, "a", newline='') as fp:
            #    wr = csv.writer(fp, dialect='excel')
            #    wr.writerow(itweet)
        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        if i + 1 != numRuns:
            time.sleep(920)
    filename = "tweets.csv"
    # Store dataframe in csv
    db_tweets.to_csv(filename, mode='a', index=False)

# Initialise these variables:
search_words = "#India OR #COVID-19"
date_since = "2020-04-29"
#date_until = "2020-05-01"
numTweets = 2500
numRuns = 10
# Call the function scrapetweets
program_start = time.time()
scrapetweets(search_words, date_since, numTweets, numRuns)
program_end = time.time()
print('Scraping has completed!')
print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))
I referred to a blog on Medium for this purpose.
You can keep a validator variable and store it in a file, say tweetid.txt. Each time you run the script, open tweetid.txt; if a tweet's ID is already in the file, skip that tweet.
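A minimal sketch of that idea, assuming the tweet objects come from the tweet_list built inside the scrapetweets loop above, and using tweetid.txt as suggested:

import os

SEEN_FILE = "tweetid.txt"  # one tweet ID per line, persisted across runs

def load_seen_ids(path=SEEN_FILE):
    # read the IDs scraped by earlier runs, if the file exists yet
    if not os.path.exists(path):
        return set()
    with open(path) as f:
        return {line.strip() for line in f if line.strip()}

def mark_seen(tweet_id, path=SEEN_FILE):
    # append a newly scraped tweet ID so later runs skip it
    with open(path, "a") as f:
        f.write(str(tweet_id) + "\n")

seen = load_seen_ids()
for tweet in tweet_list:  # tweet_list as built inside scrapetweets
    if str(tweet.id) in seen:
        continue  # already scraped in an earlier run
    # ... build and store itweet as before ...
    mark_seen(tweet.id)
    seen.add(str(tweet.id))

Alternatively, api.search also accepts a since_id parameter, which asks Twitter itself to return only tweets newer than a given ID, so duplicates are never downloaded in the first place.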
I am using tweepy to download tweets about a particular topic, but no matter which tutorial I follow I cannot get the tweet to output as a full tweet. There is always an ellipsis that cuts it off after a certain number of characters.
Here is the code I am using
import json
import tweepy
from tweepy import OAuthHandler
import csv
import sys
from twython import Twython

nonBmpMap = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

with open('Twitter_Credentials.json') as cred_data:
    info = json.load(cred_data)
consumer_Key = info['Consumer_Key']
consumer_Secret = info['Consumer_Secret']
access_Key = info['Access_Key']
access_Secret = info['Access_Secret']

maxTweets = int(input('Enter the Number of tweets that you want to extract '))
userTopic = input('What topic do you want to search for ')
topic = ('"' + userTopic + '"')
tweetCount = 0

auth = OAuthHandler(consumer_Key, consumer_Secret)
auth.set_access_token(access_Key, access_Secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

tweets = api.search(q=topic, count=maxTweets, tweet_mode='extended')
for tweet in tweets:
    tweetCount = tweetCount + 1
    with open('TweetsAbout' + userTopic, 'a', encoding='utf-8') as the_File:
        print(tweet.full_text.translate(nonBmpMap))
        tweet = (str(tweet.full_text).translate(nonBmpMap).replace(',', '').replace('|', '')
                 .replace('\n', '').replace('’', '\'').replace('…', "end"))
        the_File.write(tweet + "\n")
print('Extracted ' + str(tweetCount) + ' tweets about ' + topic)
Try this, see if it works!
try:
    specific_tweets = tweepy.Cursor(api.search, tweet_mode='extended',
                                    q=<your_query_string> + " -filter:retweets",
                                    lang='en').items(500)
except tweepy.error.TweepError:
    pass
for tweet in specific_tweets:
    extracted_text = tweet.full_text
All the text you're trying to extract should be in extracted_text. Good luck!!
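One caveat: even with tweet_mode='extended', retweets still come back truncated, because the untruncated text sits on the nested retweeted_status object. The query above already excludes retweets with -filter:retweets, but if you ever drop that operator, a sketch of the usual workaround:

for tweet in specific_tweets:
    try:
        # for retweets, the full text is on the original status
        extracted_text = tweet.retweeted_status.full_text
    except AttributeError:
        # not a retweet; full_text is already complete
        extracted_text = tweet.full_text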
I am running a sentiment analysis using Twitter and I am having some difficulty with counting how many 'Positive', 'Negative' and 'Neutral' results I have. Any help will be more than appreciated.
Please take a look at my code:
import tweepy
from textblob import TextBlob

consumer_key = ''
consumer_key_secret = ''
access_token = ''
access_token_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

public_tweets = api.search('stackoverflow')
for tweet in public_tweets:
    print(tweet.text)
    analysis = TextBlob(tweet.text)
    print(analysis.sentiment)
    if analysis.sentiment[0] > 0:
        print('Positive')
    elif analysis.sentiment[0] < 0:
        print('Negative')
    else:
        print('Neutral')
I think you can just create variables that keep track of how many labels were in the data. Something like this:
pos, neg, neu = 0, 0, 0
for tweet in public_tweets:
    analysis = TextBlob(tweet.text)
    if analysis.sentiment[0] > 0:
        pos += 1
    elif analysis.sentiment[0] < 0:
        neg += 1
    else:
        neu += 1
print("positive: {}\nnegative: {}\nneutral: {}".format(pos, neg, neu))
Regarding the result dataframe, I was not sure what kind of data you want to save, so I could not give a good answer for that.
I am trying to use the tweepy library to collect some data from Twitter to conduct some sentiment analysis.
Here is a sample of the script I am running:
import tweepy
import pandas as pd
import numpy as np
import time

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

# Set search query
searchquery = '"atiku" -filter:retweets'

data = api.search(q=searchquery, count=100, lang='en', result_type='mixed')
data_all = list(data.values())[0]
#(data.values())[1])
while (len(data_all) <= 20000):
    time.sleep(5)
    last = data_all[1]['id']
    data = api.search(q=searchquery, count=100, lang='en', result_type='mixed', max_id=last)
    data_all += list(data.values())[1][1:]
    print(data_all)
I have hit a road block in my code, as when I run it I get this error:
TypeError: unhashable type: 'slice'
I would appreciate any pointers on this.
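A hedged guess at the cause, based on the shape of the v1.1 search response: with the JSONParser, api.search returns a dict with the keys 'statuses' (the list of tweets) and 'search_metadata'. list(data.values())[1] therefore picks the metadata dict, and slicing a dict with [1:] raises TypeError: unhashable type: 'slice'. Indexing 'statuses' explicitly sidesteps this; a minimal sketch, using the usual max_id pagination pattern (max_id is inclusive, so subtract 1 to avoid repeating the boundary tweet):

data = api.search(q=searchquery, count=100, lang='en', result_type='mixed')
data_all = data['statuses']  # the list of tweets, not the metadata dict

while len(data_all) <= 20000:
    time.sleep(5)
    last = data_all[-1]['id']  # id of the last tweet in the batch (oldest for recent results)
    data = api.search(q=searchquery, count=100, lang='en',
                      result_type='mixed', max_id=last - 1)
    new_tweets = data['statuses']
    if not new_tweets:
        break  # no older tweets available
    data_all += new_tweets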
Here is my code. I want to extract tweets from Twitter with some keywords. My code doesn't give any errors, but the output file is never generated. Please help me.
import re
import csv
import tweepy
from tweepy import OAuthHandler
#TextBlob perform simple natural language processing tasks.
from textblob import TextBlob

def search():
    #text = e.get() **************************
    consumer_key = ''
    consumer_secret = ''
    access_token = ' '
    access_token_secret = ' '
    # create OAuthHandler object
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # set access token and secret
    auth.set_access_token(access_token, access_token_secret)
    # create tweepy API object to fetch tweets
    api = tweepy.API(auth)

    def get_tweets(query, count=300):
        # empty list to store parsed tweets
        tweets = []
        target = open("tweets.txt", 'w', encoding="utf-8")
        t1 = open("review.txt", 'w', encoding="utf-8")
        # call twitter api to fetch tweets
        q = str(query)
        a = str(q + " sarcasm")
        b = str(q + " sarcastic")
        c = str(q + " irony")
        fetched_tweets = api.search(a, count=count) + api.search(b, count=count) + api.search(c, count=count)
        # parsing tweets one by one
        print(len(fetched_tweets))
        for tweet in fetched_tweets:
            # empty dictionary to store required params of a tweet
            parsed_tweet = {}
            # saving text of tweet
            parsed_tweet['text'] = tweet.text
            if "http" not in tweet.text:
                line = re.sub("[^A-Za-z]", " ", tweet.text)
                target.write(line + "\n")
                t1.write(line + "\n")
        return tweets

    # creating object of TwitterClient Class
    # calling function to get tweets
    tweets = get_tweets(query=text, count=20000)

root.mainloop()
With this code I am not getting the output file generated. Can anyone tell me what I am doing wrong?
Thanks in advance!
I just made some slight changes and it was working perfectly for me. I removed or commented out some unnecessary statements (like the review file) and changed the open function to io.open since I have Python version 2.7. Here is the running code, hope it helps!
import re
import io
import csv
import tweepy
from tweepy import OAuthHandler
#TextBlob perform simple natural language processing tasks.
#from textblob import TextBlob

consumer_key = 'sz6x0nvL0ls9wacR64MZu23z4'
consumer_secret = 'ofeGnzduikcHX6iaQMqBCIJ666m6nXAQACIAXMJaFhmC6rjRmT'
access_token = '854004678127910913-PUPfQYxIjpBWjXOgE25kys8kmDJdY0G'
access_token_secret = 'BC2TxbhKXkdkZ91DXofF7GX8p2JNfbpHqhshW1bwQkgxN'

# create OAuthHandler object
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# set access token and secret
auth.set_access_token(access_token, access_token_secret)
# create tweepy API object to fetch tweets
api = tweepy.API(auth)

def get_tweets(query, count=300):
    # empty list to store parsed tweets
    tweets = []
    target = io.open("mytweets.txt", 'w', encoding='utf-8')
    # call twitter api to fetch tweets
    q = str(query)
    a = str(q + " sarcasm")
    b = str(q + " sarcastic")
    c = str(q + " irony")
    fetched_tweets = api.search(a, count=count) + api.search(b, count=count) + api.search(c, count=count)
    # parsing tweets one by one
    print(len(fetched_tweets))
    for tweet in fetched_tweets:
        # empty dictionary to store required params of a tweet
        parsed_tweet = {}
        # saving text of tweet
        parsed_tweet['text'] = tweet.text
        if "http" not in tweet.text:
            line = re.sub("[^A-Za-z]", " ", tweet.text)
            target.write(line + "\n")
    return tweets

# creating object of TwitterClient Class
# calling function to get tweets
tweets = get_tweets(query="", count=20000)
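A small usage note: query="" here searches only for the bare keywords ("sarcasm", "sarcastic", "irony"); passing a topic narrows the search. Also, the v1.1 search endpoint caps a single request at 100 results, so count=20000 still yields at most 100 tweets per api.search call. A hypothetical example (the topic string is made up):

# search for sarcastic tweets about a specific topic;
# counts above 100 are capped per request by the search endpoint
tweets = get_tweets(query="covid", count=100)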