I'm working on a Sentiment Analysis project using Twitter Data, and I've encountered a small problem regarding Dates. The code itself runs fine, but I don't know how to build custom time blocks for grouping my final data. Right now, it is defaulting to grouping them by the second, which is not very useful. I want to be able to group them in half-hour, hour, and day segments...
Feel free to skip to the bottom of the code to see where the issue lies!
Here is the code:
import tweepy
API_KEY = "XXXXX"
API_SECRET = XXXXXX"
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)
import sklearn as sk
import pandas as pd
import got3
#"Get Old Tweets" to find older data
tweetCriteria = got3.manager.TweetCriteria()
tweetCriteria.setQuerySearch("Kentucky Derby")
tweetCriteria.setSince("2016-05-07")
tweetCriteria.setUntil("2016-05-08")
tweetCriteria.setMaxTweets(1000)
TweetCriteria = got3.manager.TweetCriteria()
KYDerby_tweets = got3.manager.TweetManager.getTweets(tweetCriteria)
from afinn import Afinn
afinn = Afinn()
#getting afinn library to use for sentiment polarity analysis
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    Id = x.id
    print(Text)
AllText = []
AllRetweets = []
AllFavorites = []
AllDates = []
AllIDs = []
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    Id = x.id
    AllText.append(Text)
    AllRetweets.append(Retweets)
    AllFavorites.append(Favorites)
    AllDates.append(Date)
    AllIDs.append(Id)
data_set = [[x.id, x.date, x.text, x.retweets, x.favorites]
            for x in KYDerby_tweets]
df = pd.DataFrame(data=data_set, columns=["Id", "Date", "Text", "Retweets", "Favorites"])
#I now have a DataFrame with my basic info in it
pscore = []
for x in KYDerby_tweets:
    pscore.append(afinn.score(x.text))
df['P Score'] = pscore
#I now have the pscores for each Tweet in the DataFrame
nrc = pd.read_csv('C:\\users\\andrew.smith\\downloads\\NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', sep="\t", names=["word", "emotion", "association"], skiprows=45)
#import NRC emotion lexicon
nrc = nrc[nrc["association"]==1]
nrc = nrc[nrc["emotion"].isin(["positive", "negative"]) == False]
#cleaned it up a bit
from nltk import TweetTokenizer
tt = TweetTokenizer()
tokenized = tt.tokenize(df.iloc[0]['Text'])  # tokenize one sample tweet so the counting walkthrough below has input
tokenized = [x.lower() for x in tokenized]
#built my Tweet-specific, NRC-ready tokenizer
emotions = list(set(nrc["emotion"]))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
cv = [0] * len(emotions)
#built indices showing locations of emotions
for token in tokenized:
    sub = nrc[nrc['word'] == token]
    token_emotions = sub['emotion']
    for e in token_emotions:
        position_index = emotion2index[e]
        cv[position_index] += 1
emotions = list(set(nrc['emotion']))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
def makeEmoVector(tweettext):
    cv = [0] * len(emotions)
    tokenized = tt.tokenize(tweettext)
    tokenized = [x.lower() for x in tokenized]
    for token in tokenized:
        sub = nrc[nrc['word'] == token]
        token_emotions = sub['emotion']
        for e in token_emotions:
            position_index = emotion2index[e]
            cv[position_index] += 1
    return cv
tweettext = df.iloc[14,:]['Text']
emotion_vectors = []
for text in df['Text']:
    emotion_vector = makeEmoVector(text)
    emotion_vectors.append(emotion_vector)
ev = pd.DataFrame(emotion_vectors, index=df.index, columns=emotions)
#Now I have a DataFrame with all of the emotion counts for each tweet
Date_Group = df.groupby("Date")
Date_Group[emotions].agg("sum")
#Finally, we arrive at the problem! When I run this, I end up with tweets that are grouped by the second. What I want is to be able to group them: a) by the half-hour, b) by the hour, and c) by the day
The default date format for tweets retrieved with the Tweepy API is "2017-04-14 18:41:56". To get tweets grouped by the hour, you can do something as simple as this:
# This will get the time parameter
time = [item.split(" ")[1] for item in df['date'].values]
# This will get the hour parameter
hour = [item.split(":")[0] for item in time]
df['time'] = hour
grouped_tweets = df[['time', 'number_tweets']].groupby('time')
tweet_growth_hour = grouped_tweets.sum()
tweet_growth_hour['time']= tweet_growth_hour.index
print(tweet_growth_hour)
To group by date, you can do something similar:
days = [item.split(" ")[0] for item in df['date'].values]
df['days'] = days
grouped_tweets = df[['days', 'number_tweets']].groupby('days')
tweet_growth_days = grouped_tweets.sum()
tweet_growth_days['days']= tweet_growth_days.index
print(tweet_growth_days)
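If you keep the Date column as real datetimes instead of strings, pandas can also do the binning for you with pd.Grouper, which covers the half-hour case directly. A minimal sketch, assuming the df and emotions list built in the question (only the freq strings are new):

df['Date'] = pd.to_datetime(df['Date'])  # make sure the column holds datetimes, not strings

# a) half-hour bins
half_hour = df.groupby(pd.Grouper(key='Date', freq='30min'))[emotions].sum()

# b) hourly bins
hourly = df.groupby(pd.Grouper(key='Date', freq='H'))[emotions].sum()

# c) daily bins
daily = df.groupby(pd.Grouper(key='Date', freq='D'))[emotions].sum()

Setting Date as the index and calling df.set_index('Date').resample('30min')[emotions].sum() gives the same half-hour result.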
Related
I am trying to apply TF-IDF (using the code by Dr. W.J.B. Mattingly: https://github.com/wjbmattingly/topic_modeling_textbook/blob/main/lessons/02_tf_idf_official.py) to my data - descriptions of startups from the Startup Blink website.
I cannot work out how to extract the individual words properly: right now the output is a single string with all the words run together, like the example below, and you will also notice lots of empty lists inside:
[['qualitygeotechnicalinvestigationtestinggeotechnicalreportspreconditiondevelopmentideasnewprojectimplementationintensivefieldlaboratorytestingsnecessaryobtaininputdatasoillayerscapacitysettlementcategorizationqualitymaterials']
import requests, string, numpy
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
s = requests.Session()
df = pd.DataFrame()
for p in tqdm(range(2000)):
    r = s.get(f'https://www.startupblink.com/api/entities?entity=startups&page={p}')
    d = pd.json_normalize(r.json()['page'])
    df = pd.concat([df, d], axis=0, ignore_index=True)
df.to_csv('World_startups.csv')
# selecting only ESG related startups
esg = df[df['subindustry_name'].isin(['Energy', 'Energy & Environment-Other', 'Smart Cities', 'Smart Home', 'Public Transportation', 'Sustainability',
'Transportation-Other','Waste Management'])]
esg = esg[['title', 'description', 'subindustry_name']]
description = esg.description.tolist()
#description = description.remove(np.nan)
def remove_stopwords(text, stops):
    words = text.split()
    final = []
    for word in words:
        if word not in stops:
            final.append(word)
    final = "".join(final)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return final
def clean_docs(docs):
    stops = stopwords.words('english')
    final = []
    for doc in docs:
        clean_doc = remove_stopwords(doc, stops)
        final.append(clean_doc)
    return final
cleaned_docs = clean_docs(description)
vectorizer = TfidfVectorizer(lowercase=True,
                             max_features=100,
                             # max_df=.9,  # percentage
                             # min_df=2,   # count
                             ngram_range=(1, 3),  # up to trigrams
                             stop_words='english')
vectors = vectorizer.fit_transform(cleaned_docs)
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
# Printing all unique dense values to mid-check
densearray = numpy.array(denselist)
print(numpy.unique(densearray))
all_keywords = []
for d in denselist:
    x = 0
    keywords = []
    for word in d:
        if word > 0:
            keywords.append(feature_names[x])
        x = x + 1
    all_keywords.append(keywords)
all_keywords[7]
print(len(all_keywords))
# the list contains lots of empty lists inside - will remove them
all_keywords = [ele for ele in all_keywords if ele != []]
print('')
print(len(all_keywords))
print(all_keywords[7])
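A likely cause of the run-together words is the "".join(final) inside remove_stopwords: joining the surviving words with an empty separator glues them into one long token, so the vectorizer only ever sees that single string per document. A minimal sketch of the corrected function, assuming everything else stays as above (only the word-level join separator changes):

def remove_stopwords(text, stops):
    words = text.split()
    final = [word for word in words if word not in stops]
    final = " ".join(final)  # join with a space so individual words are preserved
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])  # this join is per-character, so "" is fine here
    while "  " in final:
        final = final.replace("  ", " ")
    return final

With the words separated again, the empty keyword lists should also become rarer, since max_features=100 can then pick up real tokens instead of one unique mega-string per document.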
I created the following method
import numpy as np
import re
from nltk.corpus import stopwords
def clean_tweet(tweet):
    if type(tweet) == np.float:
        return ""
    temp = tweet.lower()
    temp = re.sub("'", "", temp)  # to avoid removing contractions in english
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub(r'\[.*?\]', ' ', temp)
    temp = re.sub("[^a-z0-9]", " ", temp)
    temp = temp.split()
    temp = [w for w in temp if not w in stopwords]
    temp = " ".join(word for word in temp)
    return temp
and I have a pandas dataframe with 1000 tweets to clean
If I try this:
df['cleantweet'] = df.apply(lambda row : clean_tweet(row['Tweet']), axis = 1)
I get this error:
TypeError: argument of type 'WordListCorpusReader' is not iterable
Update:
How did I fill the dataframe
paginator = tweepy.Paginator(
    client.search_recent_tweets,   # The method you want to use
    "#GunControlNow -is:retweet",  # Some argument for this method
    max_results=100                # How many tweets asked per request
)
import pandas as pd
tweets = []
for tweet in paginator.flatten(limit=10000):  # Total number of tweets to retrieve
    tweets.append(tweet.text)
df = pd.DataFrame(tweets, columns=['Tweet'])
df
from azureml.core import Workspace, Dataset
subscription_id = 'x'
resource_group = 'x'
workspace_name = 'x'
workspace = Workspace(subscription_id, resource_group, workspace_name)
from azureml.core import Datastore, Dataset
datastore = Datastore.get(workspace, 'workspaceblobstore')
dataset = Dataset.Tabular.register_pandas_dataframe(df, datastore, "tweets", show_progress=True)
Refer to the following: WordListCorpusReader is not iterable
You just need to define a variable for the stopwords that reads from the stopwords object that you import from nltk corpus:
stopwords = set(stopwords.words("english"))
I am scraping api data and totaling counts of different values into a dictionary 'building':'count' for each player (row). I would like to be able to analyze it further. An easy solution would be to pull the different unique 'buildings' (dictionary keys within the row) as dataframe columns and then do the equivalent of an index/match/match on them. The script currently gets the data, and I can extract the unique keys, but I am lost at how to make them into DF columns and then how to do the index/match/match. There may be a better approach from even before running the 'count' part of the script.
You should be able to run the script, no credentials are required to GET against the API. If you see the ranklist DF column with the building counts you will see what I am referencing.
Thank you for any guidance!
import requests
import pandas as pd
from datetime import datetime
from datetime import date
from datetime import timedelta
import operator
from time import sleep
ranklist = pd.DataFrame()
for i in range(430):
    baserank_url = 'https://www.simcompanies.com/api/v3/encyclopedia/ranking/' + str(i) + '/'
    r = requests.get(baserank_url)
    rank_json = r.json()
    df = pd.DataFrame.from_dict(rank_json)
    df = df.filter(['company', 'id', 'rank', 'value'])
    ranklist = ranklist.append(df)
ranklist.to_csv(r'rank_dataframe.csv',index=False)
print('Ranking list started successfully!')
levellist=[]
bcolist=[]
today= date.today()
for row in ranklist.itertuples():
    attempt = 0
    while True:
        if attempt == 6:
            break
        try:
            print(str(row.rank + 1) + ' ' + str(attempt))
            account_url = 'https://www.simcompanies.com/api/v2/players/' + str(row.id) + '/'
            r = requests.get(account_url)
            account_json = r.json()
            playerid = account_json.get("player").get("id")
            playerlevel = account_json.get("player").get("level")
            datestart = datetime.strptime(account_json.get("player").get("dateJoined")[:10], '%Y-%m-%d').date()
            yearsactive = round((today - datestart) / timedelta(days=365.2425), 2)
            buildings = account_json.get("buildings")
            certificates = account_json.get("certificates")
            bnames = [d['name'] for d in buildings]
            bnames = [n.replace('Park', 'Recreation').replace('Lake', 'Recreation').replace('Castle', 'Recreation') for n in bnames]
            cnames = [d['name'] for d in certificates]
            sptr = 'Yes' if 'Supporter' in cnames else 'No'
            dictOfElems = dict()
            for elem in bnames:
                if elem in dictOfElems:
                    dictOfElems[elem] += 1
                else:
                    dictOfElems[elem] = 1
            blist = {key: value for key, value in dictOfElems.items()}
            blist = dict(sorted(blist.items(), key=operator.itemgetter(1), reverse=True))
            bcolist.append([blist.keys()])
            levellist.append([playerid, playerlevel, sptr, datestart, yearsactive, blist])
        except:
            sleep(20)
            attempt += 1
            continue
        break
#get unique building values
bcodf= pd.DataFrame(bcolist,columns=['buildings'])
bcouni = list(set([a for b in bcodf.buildings.tolist() for a in b]))
print(bcouni)
leveldf = pd.DataFrame(levellist,columns=['id','level','sptr','datestart','yearsactive','blist'])
#clist = list(set([a for b in leveldf.cnames.tolist() for a in b]))
#print(leveldf[blist])
#bul = leveldf[blist].keys()
#buniq = list(set([a for b in leveldf.bul.tolist() for a in b]))
#print(bul)
ranklist = ranklist.merge(leveldf, on='id', how='left')
ranklist['rank'] +=1
ranklist.to_csv(r'rank_dataframe.csv',index=False)
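One way to turn the per-player building counts into DataFrame columns is to expand the blist dictionary column after leveldf is built. A minimal sketch, assuming the leveldf frame from the script above (buildings a player does not own become 0):

# one column per building, rows aligned with leveldf
building_cols = leveldf['blist'].apply(pd.Series).fillna(0).astype(int)

# attach the counts to the player rows and drop the raw dict column
leveldf_wide = pd.concat([leveldf.drop(columns=['blist']), building_cols], axis=1)

# the index/match/match-style lookup then becomes a plain .loc, e.g.
# leveldf_wide.set_index('id').loc[some_player_id, 'Recreation']   # some_player_id is a placeholder

The unique building names you were collecting in bcolist simply fall out as building_cols.columns.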
I am trying to extract audio features from Spotify using track URIs. I have a list of 500k URIs and would like to extract audio features for all of them. I have workable code below and can extract features for 80 songs. I need some help in modifying the code below to extract 80 at a time so I don't run afoul of the Spotify limit. An example of the list is below:
['spotify:track:2d7LPtieXdIYzf7yHPooWd',
'spotify:track:0y4TKcc7p2H6P0GJlt01EI',
'spotify:track:6q4c1vPRZREh7nw3wG7Ixz',
'spotify:track:54KFQB6N4pn926IUUYZGzK',
'spotify:track:0NeJjNlprGfZpeX2LQuN6c']
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials

client_id = 'xxx'
client_secret = 'xxx'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
def get_audio_features(saved_uris):
    artist = []
    track = []
    danceability = []
    energy = []
    key = []
    loudness = []
    mode = []
    speechiness = []
    acousticness = []
    instrumentalness = []
    liveness = []
    valence = []
    tempo = []
    duration_ms = []
    for uri in saved_uris:
        x = sp.audio_features(uri)
        y = sp.track(uri)
        for audio_features in x:
            danceability.append(audio_features['danceability'])
            energy.append(audio_features['energy'])
            key.append(audio_features['key'])
            loudness.append(audio_features['loudness'])
            mode.append(audio_features['mode'])
            speechiness.append(audio_features['speechiness'])
            acousticness.append(audio_features['acousticness'])
            instrumentalness.append(audio_features['instrumentalness'])
            liveness.append(audio_features['liveness'])
            valence.append(audio_features['valence'])
            tempo.append(audio_features['tempo'])
            duration_ms.append(audio_features['duration_ms'])
        artist.append(y['album']['artists'][0]['name'])
        track.append(y['name'])
    df = pd.DataFrame()
    df['artist'] = artist
    df['track'] = track
    df['danceability'] = danceability
    df['energy'] = energy
    df['key'] = key
    df['loudness'] = loudness
    df['mode'] = mode
    df['speechiness'] = speechiness
    df['acousticness'] = acousticness
    df['instrumentalness'] = instrumentalness
    df['liveness'] = liveness
    df['valence'] = valence
    df['tempo'] = tempo
    df['duration_ms'] = duration_ms
    df.to_csv('data/xxx.csv')
    return df
My output is a dataframe and it looks like this (I have cut some columns for readability):
artist track danceability energy key loudness
Sleeping At Last Chasing Cars 0.467 0.157 11
This code will return the dataframe that you require.
import spotipy
import time
import random
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials  # to access authorised Spotify data
client_id = 'paste client_id here'
client_secret = 'paste client_secret here'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
sp.trace=False
#your uri list goes here
s_list = ['spotify:track:2d7LPtieXdIYzf7yHPooWd','spotify:track:0y4TKcc7p2H6P0GJlt01EI','spotify:track:6q4c1vPRZREh7nw3wG7Ixz','spotify:track:54KFQB6N4pn926IUUYZGzK','spotify:track:0NeJjNlprGfZpeX2LQuN6c']
#put uri to dataframe
df = pd.DataFrame(s_list)
df.columns = ['URI']
df['energy'] = ''*df.shape[0]
df['loudness'] = ''*df.shape[0]
df['speechiness'] = ''*df.shape[0]
df['valence'] = ''*df.shape[0]
df['liveness'] = ''*df.shape[0]
df['tempo'] = ''*df.shape[0]
df['danceability'] = ''*df.shape[0]
for i in range(0, df.shape[0]):
    time.sleep(random.uniform(3, 6))
    URI = df.URI[i]
    features = sp.audio_features(URI)
    df.loc[i, 'energy'] = features[0]['energy']
    df.loc[i, 'speechiness'] = features[0]['speechiness']
    df.loc[i, 'liveness'] = features[0]['liveness']
    df.loc[i, 'loudness'] = features[0]['loudness']
    df.loc[i, 'danceability'] = features[0]['danceability']
    df.loc[i, 'tempo'] = features[0]['tempo']
    df.loc[i, 'valence'] = features[0]['valence']
uri=0
Hope this solves your problem.
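Since the real list has 500k URIs, it is worth noting that sp.audio_features accepts a list of up to 100 URIs per call, so you can request features in chunks of 80 (as asked) rather than one track per request. A minimal sketch, assuming saved_uris is the full URI list and sp is the authenticated client from above:

import time

all_features = []
batch_size = 80  # stays under the 100-URI limit per audio_features call
for start in range(0, len(saved_uris), batch_size):
    batch = saved_uris[start:start + batch_size]
    results = sp.audio_features(batch)            # one request covers the whole batch
    all_features.extend(f for f in results if f)  # skip URIs that come back as None
    time.sleep(1)                                 # small pause between requests

features_df = pd.DataFrame(all_features)

Artist and track names would still need separate calls; sp.tracks also accepts batches (up to 50 IDs), so the same chunking pattern applies there.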
I am using the tweepy and geocoding packages to convert zip codes to latitude and longitude and then pull from the Twitter API using tweepy, but I am not getting anything back. I have gone through and executed my code line by line, and it gets stuck on api.search every time, returning nothing.
import tweepy as tp
import pandas as pd
import time as t
from geopy.geocoders import Nominatim

query = 'stack'
radius = 1000
DataSet = pd.DataFrame()
loopCount = 0
appended_data = []
appendData = []
def toDataFrame(tweets):
    DataSet = pd.DataFrame()
    DataSet['tweetID'] = [tweet.id for tweet in tweets]
    DataSet['tweetText'] = [tweet.text for tweet in tweets]
    DataSet['tweetRetweetCt'] = [tweet.retweet_count for tweet in tweets]
    DataSet['tweetFavoriteCt'] = [tweet.favorite_count for tweet in tweets]
    DataSet['tweetSource'] = [tweet.source for tweet in tweets]
    DataSet['tweetCreated'] = [tweet.created_at for tweet in tweets]
    DataSet['userID'] = [tweet.user.id for tweet in tweets]
    DataSet['userScreen'] = [tweet.user.screen_name for tweet in tweets]
    DataSet['userName'] = [tweet.user.name for tweet in tweets]
    DataSet['userCreateDt'] = [tweet.user.created_at for tweet in tweets]
    DataSet['userDesc'] = [tweet.user.description for tweet in tweets]
    DataSet['userFollowerCt'] = [tweet.user.followers_count for tweet in tweets]
    DataSet['userFriendsCt'] = [tweet.user.friends_count for tweet in tweets]
    DataSet['userLocation'] = [tweet.user.location for tweet in tweets]
    DataSet['userTimezone'] = [tweet.user.time_zone for tweet in tweets]
    return DataSet
def location(zip):
    geolocator = Nominatim()
    location = geolocator.geocode(zip)
    cordinates = (location.latitude, location.longitude)
    cordinates = str(cordinates)
    cordinates = cordinates.replace("(", "")
    cordinates = cordinates.replace(")", "")
    return cordinates
def lookUp(results):
    for result in results:
        DataSet = pd.DataFrame(results)
    print(DataSet)
    return DataSet
##hidden for SO
auth = tp.OAuthHandler('','')
auth.set_access_token('', '')
api = tp.API(auth)
for zip in zips:
    #for row, zip in zips.iterrows():
    if (loopCount == 15):
        t.sleep(960)
        loopCount = 0
    loopCount = loopCount + 1
    cordinates = location(zip)
    inputCode = cordinates + ', ' + str(radius)
    results = api.search(geocode=inputCode, count=100, q=query)
    DataSet = lookUp(results)
    appendData.append(DataSet)
appended_data = pd.concat(appendData, axis=1)
Be careful not to pass spaces in the geocode, and also add the units. For example, using your function location,
In [5]:
zip = 28039
cordinates = location(zip)
In [23]:
radius = '1km'
inputCode = cordinates + ', ' + str(radius)
inputCode = inputCode.replace(' ', '')
inputCode
Out[23]:
'40.4604043354592,-3.70401484102134,1km'
In [24]:
query = 'a'
results = api.search(geocode=inputCode, count=100, q=query)
In [25]:
len(results)
Out[25]:
100
Reference twitter docs:
The parameter value is specified by “latitude,longitude,radius”, where
radius units must be specified as either “mi” (miles) or “km”
(kilometers).
Hope it helps.
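Applied back to the loop in the question, the geocode argument could be built like this; a minimal sketch assuming the location() helper above, with the radius expressed in kilometres purely as an example:

radius = '1km'  # Twitter expects units, e.g. "1km" or "1mi"
for zip in zips:
    cordinates = location(zip)                                 # "lat, lon" from the helper above
    inputCode = (cordinates + ',' + radius).replace(' ', '')   # "lat,lon,1km" with no spaces
    results = api.search(geocode=inputCode, count=100, q=query)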