How can I add the sentiment results as rows to the datatable? - python

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def sentiment_Vader(review):
    over_all_polarity = sid.polarity_scores(review)
    if over_all_polarity['compound'] >= 0.05:
        return "positive"
    elif over_all_polarity['compound'] <= -0.05:
        return "negative"
    else:
        return "neutral"
sid = SentimentIntensityAnalyzer()
metin = pd.read_excel('/content/eng kaunos 1.xlsx')
metin['sentiment_vader'] = metin['review'].apply(lambda x: sentiment_Vader(x))
metin.head(35)
I want to add the polarity results as a column to the dataset I'm doing sentiment analysis on. What code can I do this with?
I tried some code, but got no result.
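If the goal is to keep the full VADER polarity scores (neg, neu, pos, compound) next to the label, one option is to expand the dictionary returned by polarity_scores into separate columns. This is only a sketch, assuming the same metin frame and 'review' column used above:

# expand each polarity_scores dict into its own columns (neg, neu, pos, compound)
polarity = metin['review'].apply(sid.polarity_scores).apply(pd.Series)
metin = pd.concat([metin, polarity], axis=1)
metin['sentiment_vader'] = metin['review'].apply(sentiment_Vader)
metin.head(35)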

Related

AttributeError: 'function' object has no attribute 'iterrows'

import pandas as pd
from pandas import DataFrame
import nltk
import numpy as np
from sklearn.metrics import accuracy_score
**this is the library**
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
**the dataframe**
def get_feature(text):
    if len(text) == 2:
        return {'ham': text[-3]}
    elif len(text) >= 1:
        return {'spam': text[-3]}
    else:
        return {'ham': '', 'spam': ''}
**get feature**
def get_feature_text(text):
    if len(text) == 2:
        return {'spam': text[-2], 'ham': text[-1]}
    else:
        return {'spam': DataFrame.rename(text[-2])[0], 'ham': DataFrame.rename(text[-1])[0]}
**get feature of text**
def get_data(df, get_feature=get_feature):
    featrues = []
    for i, row in df.iterrows():
        text = row['v1']; type = row['v2']
        if isinstance(text, str):
            if ' ' in text:
                text = text.replace(' ', '')
            if '(' not in text:
                featrues.append((get_feature(text), type.strip('() ')))
            else:
                text = text.partition('(')[0]
                featrues.append((get_feature(text), type.strip('() ')))
    return featrues
**get data all**
def get_train_test(featrues, ratio=0.9):
    N = len(featrues)
    T = int(N * ratio)
    train = featrues[:T]
    test = featrues[T:]
    return train, test
**train and test**
def text_classifier(df, f=get_feature):
    data = get_data(df, f)
    train, test = get_train_test(data)
    classifier = nltk.NaiveBayesClassifier.train(train)
    acc = nltk.classify.accuracy(classifier, test)
    return classifier, acc
**text classifier**
def show_type_of_text(text, texts=False, show_acc=False):
    f = get_feature_text if texts else get_feature
    classifier, acc = text_classifier(df, f)
    if show_acc:
        print("The accuracy of prediction is: ", acc)
    clf = classifier.classify(f(text))
    print(f'{text}: {clf}')
    classifier.show_most_informative_features(10)
**show type of text**
def give_type(type1='spam', type2='ham'):
    data = get_data(df, get_feature)
    classifier = nltk.NaiveBayesClassifier.train(data)
    following = classifier.prob_classify({'ham': type2, 'spam': type1})
    x = following.generate()
    print(f'{type2}: {type1}{x}')
**give type**
if __name__ == '__main__':
    print('-wait a minute-')
    show_type_of_text("spam")
    print((text_classifier(accuracy_score)))
**I expect this to print the score.**
From this script I want to score the accuracy of the classifier I made, but the problem is iterrows: what should I do? I've read the documentation about accuracy_score, but the same result still comes out.
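A hedged reading of the traceback, based only on the code above: text_classifier expects a DataFrame as its first argument, but the last line passes the accuracy_score function instead, so get_data ends up calling accuracy_score.iterrows() and raises AttributeError: 'function' object has no attribute 'iterrows'. A minimal sketch of the call, reusing the df already loaded above:

if __name__ == '__main__':
    print('-wait a minute-')
    show_type_of_text("spam", show_acc=True)
    # pass the DataFrame, not the accuracy_score function
    classifier, acc = text_classifier(df)
    print(acc)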

How do i put this type of dataframe into a TKinter graph?

I'm an undergrad new to Python Tkinter. I'm trying to replicate this graph in a frame called frame_info. What I don't understand is that, unlike other Tkinter examples, the ax is a subplot.
The code below reads the data from a CSV into a dataframe and sorts the data according to its columns.
def getcompreddata(self, hotel_name1, hotel_name2):
    # Back-end read data
    # BE_reviews_df = pd.read_csv("i removed this file location",
    #                             skip_blank_lines=True, header=0)
    BE_reviews_df = pd.read_csv("C:/UNI/PROJECTS/pythonProject/INF1002_project/Input/updatedcsv_hotelsg.csv",
                                skip_blank_lines=True, header=0)

    ####################################### Data frame for Hotel 1 ##########################################
    def dataframe1(hotelname1):
        BE_reviews_df_1 = BE_reviews_df[BE_reviews_df['name'] == hotelname1]
        pd.set_option('display.max_columns', 10)
        # create the label
        BE_reviews_df_1["review.rating"] = BE_reviews_df_1["reviews.rating"]
        BE_reviews_df_1["is_bad_review"] = BE_reviews_df_1["reviews.rating"].apply(lambda x: 1 if x < 5 else 0)
        # select only relevant columns
        BE_reviews_df_1 = BE_reviews_df_1[["name", "reviews.text", "review.rating", "is_bad_review"]]
        BE_reviews_df_1.head()
        # Reviews data is sampled in order to speed up computations
        # BE_reviews_df = BE_reviews_df.sample(frac=0.1, replace=False, random_state=42)
        # remove 'No Negative' or 'No Positive' from text
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].apply(
            lambda x: x.replace("No Negative", "").replace("No Positive", ""))
        # drop all nan text
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].astype(str)
        BE_reviews_df_1.drop(BE_reviews_df_1[BE_reviews_df_1["reviews.text"] == "nan"].index, inplace=True)
        # remove 'No Negative' or 'No Positive' from text
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].apply(
            lambda x: x.replace("No Negative", "").replace("No Positive", ""))
        # clean text data
        regexp = RegexpTokenizer(r'\w+')
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].apply(regexp.tokenize)
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].apply(
            lambda x: ' '.join([item for item in x if len(item) > 2]))
        BE_reviews_df_1["review_clean"] = BE_reviews_df_1["reviews.text"].apply(lambda x: clean_text(x))
        # manual categories tagging
        BE_reviews_df_1['category'] = BE_reviews_df_1['review_clean'].apply(lambda x: categories_tagging(x))
        # sentiment analyzer
        sid = SentimentIntensityAnalyzer()
        BE_reviews_df_1["sentiments"] = BE_reviews_df_1["reviews.text"].apply(lambda x: sid.polarity_scores(x))
        BE_reviews_df_1 = pd.concat(
            [BE_reviews_df_1.drop(['sentiments'], axis=1), BE_reviews_df_1['sentiments'].apply(pd.Series)], axis=1)
        # create new variable with sentiment "neutral", "positive" and "negative"
        BE_reviews_df_1['sentiment'] = BE_reviews_df_1['compound'].apply(
            lambda x: 'positive' if x > 0 else 'neutral' if x == 0 else 'negative')
        # sort data by name
        BE_reviews_df_1.sort_values(by=['name'], inplace=True)
        # add emotion for hotel 1
        BE_reviews_df_1['emotion'] = BE_reviews_df_1['review_clean'].apply(lambda x: NRCLex(x).raw_emotion_scores)
        # add number of characters column
        BE_reviews_df_1["nb_chars"] = BE_reviews_df_1["reviews.text"].apply(lambda x: len(x))
        # add number of words column
        BE_reviews_df_1["nb_words"] = BE_reviews_df_1["reviews.text"].apply(lambda x: len(x.split(" ")))
        tfidf = TfidfVectorizer(min_df=10)
        tfidf_result = tfidf.fit_transform(BE_reviews_df_1["review_clean"]).toarray()
        tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
        tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
        tfidf_df.index = BE_reviews_df_1.index
        BE_reviews_df_1 = pd.concat([BE_reviews_df_1, tfidf_df], axis=1)
        BE_reviews_df_1.head()
        # highest positive sentiment reviews (with more than 5 words)
        BE_reviews_df_1[BE_reviews_df_1["nb_words"] >= 5].sort_values("pos", ascending=False)[
            ["reviews.text", "pos"]].head(10)
        return BE_reviews_df_1
    ####################################### Data frame for Hotel 2 ##########################################
    def dataframe2(hotelname2):
        BE_reviews_df_2 = BE_reviews_df[BE_reviews_df['name'] == hotelname2]
        pd.set_option('display.max_columns', 10)
        # create the label
        BE_reviews_df_2["review.rating"] = BE_reviews_df_2["reviews.rating"]
        BE_reviews_df_2["is_bad_review"] = BE_reviews_df_2["reviews.rating"].apply(lambda x: 1 if x < 5 else 0)
        # select only relevant columns
        BE_reviews_df_2 = BE_reviews_df_2[["name", "reviews.text", "review.rating", "is_bad_review"]]
        BE_reviews_df_2.head()
        # Reviews data is sampled in order to speed up computations
        # BE_reviews_df = BE_reviews_df.sample(frac=0.1, replace=False, random_state=42)
        # drop all nan text
        BE_reviews_df_2["reviews.text"] = BE_reviews_df_2["reviews.text"].astype(str)
        BE_reviews_df_2.drop(BE_reviews_df_2[BE_reviews_df_2["reviews.text"] == "nan"].index, inplace=True)
        # remove 'No Negative' or 'No Positive' from text
        BE_reviews_df_2["reviews.text"] = BE_reviews_df_2["reviews.text"].apply(
            lambda x: x.replace("No Negative", "").replace("No Positive", ""))
        # clean text data
        regexp = RegexpTokenizer(r'\w+')
        BE_reviews_df_2["reviews.text"] = BE_reviews_df_2["reviews.text"].apply(regexp.tokenize)
        BE_reviews_df_2["reviews.text"] = BE_reviews_df_2["reviews.text"].apply(
            lambda x: ' '.join([item for item in x if len(item) > 2]))
        BE_reviews_df_2["review_clean"] = BE_reviews_df_2["reviews.text"].apply(lambda x: clean_text(x))
        # manual categories tagging
        BE_reviews_df_2['category'] = BE_reviews_df_2['review_clean'].apply(lambda x: categories_tagging(x))
        # sentiment analyzer
        sid = SentimentIntensityAnalyzer()
        BE_reviews_df_2["sentiments"] = BE_reviews_df_2["reviews.text"].apply(lambda x: sid.polarity_scores(x))
        BE_reviews_df_2 = pd.concat(
            [BE_reviews_df_2.drop(['sentiments'], axis=1), BE_reviews_df_2['sentiments'].apply(pd.Series)], axis=1)
        # create new variable with sentiment "neutral", "positive" and "negative"
        BE_reviews_df_2['sentiment'] = BE_reviews_df_2['compound'].apply(
            lambda x: 'positive' if x > 0 else 'neutral' if x == 0 else 'negative')
        # sort data by name
        BE_reviews_df_2.sort_values(by=['name'], inplace=True)
        # add emotion for hotel 2
        BE_reviews_df_2['emotion'] = BE_reviews_df_2['review_clean'].apply(lambda x: NRCLex(x).raw_emotion_scores)
        # add number of characters column
        BE_reviews_df_2["nb_chars"] = BE_reviews_df_2["reviews.text"].apply(lambda x: len(x))
        # add number of words column
        BE_reviews_df_2["nb_words"] = BE_reviews_df_2["reviews.text"].apply(lambda x: len(x.split(" ")))
        tfidf = TfidfVectorizer(min_df=10)
        tfidf_result = tfidf.fit_transform(BE_reviews_df_2["review_clean"]).toarray()
        tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
        tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
        tfidf_df.index = BE_reviews_df_2.index
        BE_reviews_df_2 = pd.concat([BE_reviews_df_2, tfidf_df], axis=1)
        BE_reviews_df_2.head()
        # highest positive sentiment reviews (with more than 5 words)
        BE_reviews_df_2[BE_reviews_df_2["nb_words"] >= 5].sort_values("pos", ascending=False)[
            ["reviews.text", "pos"]].head(10)
        return BE_reviews_df_2

    # Comparison plot
    plt_compare_number_of_pos_neg_review(self, hotel_name1, dataframe1(hotel_name1), hotel_name2, dataframe2(hotel_name2))
This is the code that I'm confused about. How do I get this data into a Tkinter frame?
def plt_compare_number_of_pos_neg_review(self, hotelname1, dataframe1, hotelname2, dataframe2):
    # create new data frames for all sentiments
    review_neg1 = dataframe1[dataframe1["sentiment"] == "negative"]
    review_neu1 = dataframe1[dataframe1["sentiment"] == "neutral"]
    review_pos1 = dataframe1[dataframe1["sentiment"] == "positive"]
    review_neg2 = dataframe2[dataframe2["sentiment"] == "negative"]
    review_neu2 = dataframe2[dataframe2["sentiment"] == "neutral"]
    review_pos2 = dataframe2[dataframe2["sentiment"] == "positive"]

    # function for calculating the percentage of each sentiment
    def calc_percentage(x, y):
        return x / y * 100

    pos_per1 = calc_percentage(len(review_pos1), len(dataframe1))
    neg_per1 = calc_percentage(len(review_neg1), len(dataframe1))
    neu_per1 = calc_percentage(len(review_neu1), len(dataframe1))
    pos_per2 = calc_percentage(len(review_pos2), len(dataframe2))
    neg_per2 = calc_percentage(len(review_neg2), len(dataframe2))
    neu_per2 = calc_percentage(len(review_neu2), len(dataframe2))

    # Python dictionary
    hotel1Andhotel2 = {hotelname1: [float(format(neg_per1, '.1f')), float(format(neu_per1, '.1f')),
                                    float(format(pos_per1, '.1f'))],
                       hotelname2: [float(format(neg_per2, '.1f')), float(format(neu_per2, '.1f')),
                                    float(format(pos_per2, '.1f'))]}
    index = ['Negative', 'Neutral', 'Positive']

    # Python dictionary into a pandas DataFrame
    dataFrame = pd.DataFrame(data=hotel1Andhotel2)
    dataFrame.index = index

    figure3 = plt.Figure(figsize=(5, 4), dpi=75)
    ax = dataFrame.plot.barh(rot=15, title="Sentiment Analysis of Both Reviews")
    for container in ax.containers:
        ax.bar_label(container)
    canvas = FigureCanvasTkAgg(figure3, self.frame_info)
    canvas.get_tk_widget().grid(row=1, column=0, columnspan=2, padx=10, pady=10)
So far I've gotten this result. The graph I'm trying to replicate looks like this:
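A hedged sketch of one way to get that bar chart into the Tkinter frame: the plot has to be drawn on an Axes that belongs to the figure handed to FigureCanvasTkAgg (here figure3); otherwise pandas creates its own separate figure and the embedded canvas stays blank. This reworks only the last few lines of plt_compare_number_of_pos_neg_review, assuming the same dataFrame, figure3, and self.frame_info as above:

figure3 = plt.Figure(figsize=(5, 4), dpi=75)
ax = figure3.add_subplot(111)  # an Axes inside the figure the canvas will display
dataFrame.plot.barh(ax=ax, rot=15, title="Sentiment Analysis of Both Reviews")
for container in ax.containers:
    ax.bar_label(container)
canvas = FigureCanvasTkAgg(figure3, self.frame_info)
canvas.draw()
canvas.get_tk_widget().grid(row=1, column=0, columnspan=2, padx=10, pady=10)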

TypeError: argument of type 'WordListCorpusReader' is not iterable

I created the following method
import numpy as np
import re
from nltk.corpus import stopwords
def clean_tweet(tweet):
    if type(tweet) == np.float:
        return ""
    temp = tweet.lower()
    temp = re.sub("'", "", temp)  # to avoid removing contractions in english
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]', ' ', temp)
    temp = re.sub("[^a-z0-9]", " ", temp)
    temp = temp.split()
    temp = [w for w in temp if not w in stopwords]
    temp = " ".join(word for word in temp)
    return temp
and I have a pandas dataframe with 1000 tweets to clean
If I try this:
df['cleantweet'] = df.apply(lambda row : clean_tweet(row['Tweet']), axis = 1)
I get this error:
TypeError: argument of type 'WordListCorpusReader' is not iterable
Update:
How did I fill the dataframe
paginator = tweepy.Paginator(
    client.search_recent_tweets,   # The method you want to use
    "#GunControlNow -is:retweet",  # Some argument for this method
    max_results=100                # How many tweets asked per request
)
import pandas as pd
tweets = []
for tweet in paginator.flatten(limit=10000):  # Total number of tweets to retrieve
    tweets.append(tweet.text)
df = pd.DataFrame(tweets, columns=['Tweet'])
df
from azureml.core import Workspace, Dataset
subscription_id = 'x'
resource_group = 'x'
workspace_name = 'x'
workspace = Workspace(subscription_id, resource_group, workspace_name)
from azureml.core import Datastore, Dataset
datastore = Datastore.get(workspace, 'workspaceblobstore')
dataset = Dataset.Tabular.register_pandas_dataframe(df, datastore, "tweets", show_progress=True)
Refer to the following: WordListCorpusReader is not iterable
You just need to define a variable for the stopwords that reads the word list from the stopwords object you import from nltk.corpus:
stopwords = set(stopwords.words("english"))
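To illustrate where that line goes: without it, the stopwords name inside clean_tweet still refers to the corpus reader imported from nltk.corpus, and `w in stopwords` fails because that reader is not iterable. A condensed sketch, using stop_words as a new name (my choice, not from the original code) so the import isn't shadowed:

import re
from nltk.corpus import stopwords

# build the actual word set once, up front
stop_words = set(stopwords.words("english"))

def clean_tweet(tweet):
    # condensed cleaning: keep only lowercase letters, digits, apostrophes and spaces
    temp = str(tweet).lower()
    temp = re.sub(r"[^a-z0-9' ]", " ", temp)
    temp = [w for w in temp.split() if w not in stop_words]
    return " ".join(temp)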

Making Excel histograms using python

This is the output of my python script so far.
Excel Table
The vertical axis of the table holds road names. The horizontal axis holds dates. The values indicate whether a road was under construction at the time and why. I'd like to make a line graph that groups the dates by year (2017, 2018, 2019, etc.) and plots, for each year, the longest time a road was under construction and the average time for the whole year. I'm a complete novice in Excel and don't know how to leverage its features to achieve my goal, though I suspect there may be built-in functions that do what I want without much difficulty. Any suggestions on how I can achieve my desired output would be much appreciated.
EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp
workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})
sheet = workBook.add_worksheet()
def to_raw(string):
    return fr"{string}"
def cvrt(x):
    ans = re.split(r'(\d+)(?!.*\d)', x)
    return int(ans[1])
def indexer(s):
    pattern = re.compile(r'I, [0-9]+, ')
    gm = re.split(pattern, s)
    values = s.rsplit(gm[1])
    gm = gm[1]
    values[1] = gm
    return values
def int2Date(x):
    string = str(x)
    Y = int(string[0:4])
    M = int(string[4:6])
    D = int(string[6:8])
    return DATE(Y, M, D)
def dDelta(x, y):
    string1 = str(x)
    string2 = str(y)
    Y1 = int(string1[0:4])
    M1 = int(string1[4:6])
    D1 = int(string1[6:8])
    Y2 = int(string2[0:4])
    M2 = int(string2[4:6])
    D2 = int(string2[6:8])
    f_date = DATE(Y1, M1, D1)
    l_date = DATE(Y2, M2, D2)
    delta = l_date - f_date
    if isinstance(y, int):
        return float(int((delta.days) / 30.44))
    else:
        return int((delta.days) / 30.44)
def Book(path):
    file = open(path, 'r')
    lines = file.readlines()
    file.close()
    book = IndexedOrderedDict()
    for line in lines:
        if re.match("I", line):
            IDs = indexer(line)[1]
        if re.match(" 0.00,", line):
            rID = line
        # "GM_FINAL_AUTH,0,[1-9]"
        if re.search("GM_FINAL_AUTH,0,[1-9]", line):
            book.update({(rID, line): to_raw(IDs)})
    return sort_book(book)
def dUpdate(dic, key, value):
    return dic.update({(key[0], "GM_FINAL_AUTH,0,0"): value})
def valSplt(s):
    pattern = re.compile(r'(\d+)')
    gm = re.split(pattern, s)
    values = s.rsplit(gm[1])
    gm = gm[1]
    values[1] = gm
    return values
def sort_book(book):
    book = natsorted([value, key] for key, value in book.items())
    book = IndexedOrderedDict((data[1], data[0]) for data in book)
    return book
def alph_order(word1, word2):
    for i in range(min(len(word1), len(word2))):
        if ord(word1[i]) == ord(word2[i]):
            pass
        elif ord(word1[i]) > ord(word2[i]):
            return word2
        else:
            return word1
    return word1
def read(cpdm, date_list):
    sCnt = [0] * len(cpdm)
    lowest_number = 999999999999
    terminationCondition = [True] * len(cpdm)
    saved_results = [0] * len(cpdm)
    current_prefix = None
    cnt = 0
    while any(terminationCondition) is True:
        saved_results = [0] * len(cpdm)
        last_prefix = None
        lowest_number = 999999999999
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                ID = cpdm[dicIdx].values()[dicVal]
                # print(entry)
                current_prefix, road_number = valSplt(ID)
                road_number = int(road_number)
                if last_prefix is None:
                    last_prefix = current_prefix
                higherOrder_prefix = alph_order(last_prefix, current_prefix)
                # print('check:', [higherOrder_prefix, last_prefix, current_prefix])
                if current_prefix == higherOrder_prefix:
                    if current_prefix != last_prefix:
                        lowest_number = road_number
                        last_prefix = current_prefix
                    elif road_number < lowest_number:
                        lowest_number = road_number
                        last_prefix = current_prefix
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                # print(dicIdx, dicVal, len(cpdm[dicIdx]))
                ID = cpdm[dicIdx].values()[dicVal]
                VALUE = cpdm[dicIdx].keys()[dicVal]
                # print(entry)
                road_name, road_number = valSplt(ID)
                road_number = int(road_number)
                if road_name == last_prefix and lowest_number == road_number:
                    saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
                    if dicVal < len(cpdm[dicIdx]):
                        sCnt[dicIdx] += 1
                    else:
                        terminationCondition[dicIdx] = False
                else:
                    terminationCondition[dicIdx] = False
        for rst in range(len(saved_results)):
            if saved_results[rst] == 0:
                pass
            else:
                sheet.write(cnt + 1, 0, str(saved_results[rst][0]))
                sheet.write(cnt + 1, rst + 1, cvrt(saved_results[rst][1]))
                # sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
                # sheet.write(cnt+1, 0, saved_results[rst][3])
        cnt += 1
def main():
    # 2018 MAPS
    path1 = "W:\\Scripting\\2018\\DBData_84577881.txt"
    path2 = "W:\\Scripting\\2018\\DBData_84639568.txt"
    path3 = "W:\\Scripting\\2018\\DBData_84652483.txt"
    path4 = "W:\\Scripting\\2018\\DBData_84670490.txt"
    # 2019 MAPS
    path5 = "W:\\Scripting\\2019\\DBData_84706383.txt"
    path6 = "W:\\Scripting\\2019\\DBData_84715201.txt"
    path7 = "W:\\Scripting\\2019\\DBData_84743195.txt"
    path8 = "W:\\Scripting\\2019\\DBData_84777742.txt"
    path9 = "W:\\Scripting\\2019\\DBData_84815446.txt"
    path10 = "W:\\Scripting\\2019\\DBData_84835743.txt"
    # 2020 MAPS
    path11 = "W:\\Scripting\\2020\\DBData_84882849.txt"
    path12 = "W:\\Scripting\\2020\\DBData_84966202.txt"
    path13 = "W:\\Scripting\\2020\\DBData_84988789.txt"
    p_list = [path1, path2, path3, path4, path5, path6, path7,
              path8, path9, path10, path11, path12, path13]
    pool = mp.Pool(mp.cpu_count())
    CPDM = pool.map(Book, p_list)
    pool.close()
    # pool.join()
    date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
                 20190501, 20190628, 20190815, 20190925, 20200207, 20200501, 20200617]
    # CPDM = [b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13]
    for i in CPDM:
        print(len(i))
    # sheet.write("A1", "Lat Long")
    sheet.write("A1", "ID")
    # for i in range(len(CPDM)):
    cn = 0
    for i in date_list:
        # sheet.write(0, 3*i+1, "ID" + str(i+1))
        sheet.write(0, cn + 1, int2Date(i), format1)
        cn += 1
        # sheet.write(0, 2*i+3, "Date" + str(i+1))
    read(CPDM, date_list)
    workBook.close()
if __name__ == "__main__":
    main()
    executionTime = (time.time() - startTime)
    print('Execution time in minutes: ' + str(executionTime / 60))
Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices on your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those two dates as closed? Or only until New Year's?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px
# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")
# Flip the dataframe (dates should be on the index)
df = df.transpose()
# Fill any empty cells with 0
df = df.fillna(0)
# Combine columns with the same name
df = df.groupby(df.columns, axis=1).agg(lambda column: column.max(axis=1))
# Make sure the dates are sorted
df = df.sort_index()
# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
    # Group by consecutive 1's
    groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
    # Every group denotes a period for which the road was under construction
    for _, group in groups:
        # Get the start and finish for each group
        roads.append({
            "road": road_name,
            "start": group.index[0],
            "finish": group.index[-1] + pandas.Timedelta(1, unit="D"),  # Add one day because groups with the same start and finish would not be visible on the plot
        })
# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)
# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.show()
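To get toward the per-year numbers asked about above (the longest and the average construction time per year), here is a hedged follow-up sketch. It assumes the roads_df built by the script, that the date columns were parsed as datetimes, and that each period is counted in the year it starts:

# length of each construction period in days
roads_df["days"] = (roads_df["finish"] - roads_df["start"]).dt.days
# longest and average period per calendar year (keyed by the period's start date)
per_year = roads_df.groupby(roads_df["start"].dt.year)["days"].agg(["max", "mean"])
print(per_year)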

Grouping Tweets by Half-Hour, Hour, and Day in Pandas Dataframe

I'm working on a Sentiment Analysis project using Twitter Data, and I've encountered a small problem regarding Dates. The code itself runs fine, but I don't know how to build custom time blocks for grouping my final data. Right now, it is defaulting to grouping them by the second, which is not very useful. I want to be able to group them in half-hour, hour, and day segments...
Feel free to skip to the bottom of the code to see where the issue lies!
Here is the code:
import tweepy
API_KEY = "XXXXX"
API_SECRET = "XXXXXX"
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
import sklearn as sk
import pandas as pd
import got3
#"Get Old Tweets" to find older data
tweetCriteria = got3.manager.TweetCriteria()
tweetCriteria.setQuerySearch("Kentucky Derby")
tweetCriteria.setSince("2016-05-07")
tweetCriteria.setUntil("2016-05-08")
tweetCriteria.setMaxTweets(1000)
TweetCriteria = got3.manager.TweetCriteria()
KYDerby_tweets = got3.manager.TweetManager.getTweets(tweetCriteria)
from afinn import Afinn
afinn = Afinn()
#getting afinn library to use for sentiment polarity analysis
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    Id = x.id
    print(Text)
AllText = []
AllRetweets = []
AllFavorites = []
AllDates = []
AllIDs = []
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    AllText.append(Text)
    AllRetweets.append(Retweets)
    AllFavorites.append(Favorites)
    AllDates.append(Date)
    AllIDs.append(Id)
data_set = [[x.id, x.date, x.text, x.retweets, x.favorites]
            for x in KYDerby_tweets]
df = pd.DataFrame(data=data_set, columns=["Id", "Date", "Text", "Favorites", "Retweets"])
#I now have a DataFrame with my basic info in it
pscore = []
for x in KYDerby_tweets:
    afinn.score(x.text)
    pscore.append(afinn.score(x.text))
df['P Score'] = pscore
#I now have the pscores for each Tweet in the DataFrame
nrc = pd.read_csv('C:\\users\\andrew.smith\\downloads\\NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', sep="\t", names=["word", "emotion", "association"], skiprows=45)
#import NRC emotion lexicon
nrc = nrc[nrc["association"]==1]
nrc = nrc[nrc["emotion"].isin(["positive", "negative"]) == False]
#cleaned it up a bit
from nltk import TweetTokenizer
tt = TweetTokenizer()
tokenized = [x.lower() for x in tokenized]
#built my Tweet-specific, NRC-ready tokenizer
emotions = list(set(nrc["emotion"]))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
cv = [0] * len(emotions)
#built indices showing locations of emotions
for token in tokenized:
    sub = nrc[nrc['word'] == token]
    token_emotions = sub['emotion']
    for e in token_emotions:
        position_index = emotion2index[e]
        cv[position_index] += 1
emotions = list(set(nrc['emotion']))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
def makeEmoVector(tweettext):
    cv = [0] * len(emotions)
    tokenized = tt.tokenize(tweettext)
    tokenized = [x.lower() for x in tokenized]
    for token in tokenized:
        sub = nrc[nrc['word'] == token]
        token_emotions = sub['emotion']
        for e in token_emotions:
            position_index = emotion2index[e]
            cv[position_index] += 1
    return cv
tweettext = df.iloc[14,:]['Text']
emotion_vectors = []
for text in df['Text']:
    emotion_vector = makeEmoVector(text)
    emotion_vectors.append(emotion_vector)
ev = pd.DataFrame(emotion_vectors, index=df.index, columns=emotions)
#Now I have a DataFrame with all of the emotion counts for each tweet
Date_Group = df.groupby("Date")
Date_Group[emotions].agg("sum")
#Finally, we arrive at the problem! When I run this, I end up with tweets that are grouped by the second. What I want is to be able to group them: a) by the half-hour, b) by the hour, and c) by the day
The default date format for tweets with the Tweepy API is "2017-04-14 18:41:56". To get tweets grouped by hour, you can do something as simple as this:
# This will get the time part
time = [item.split(" ")[1] for item in df['date'].values]
# This will get the hour part
hour = [item.split(":")[0] for item in time]
df['time'] = hour
grouped_tweets = df[['time', 'number_tweets']].groupby('time')
tweet_growth_hour = grouped_tweets.sum()
tweet_growth_hour['time'] = tweet_growth_hour.index
print(tweet_growth_hour)
To group by date, you can do something similar:
days = [item.split(" ")[0] for item in df['date'].values]
df['days'] = days
grouped_tweets = df[['days', 'number_tweets']].groupby('days')
tweet_growth_days = grouped_tweets.sum()
tweet_growth_days['days'] = tweet_growth_days.index
print(tweet_growth_days)
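Since the answer above only covers hour and day buckets, here is a hedged sketch of the half-hour case (and the other two) using pandas time grouping instead of string splitting. It assumes the Date column holds parseable timestamps and that the per-tweet emotion counts have been joined onto df, as the final groupby in the question implies:

# assumption: attach the per-tweet emotion counts (ev) to the main frame first
df = df.join(ev)
df['Date'] = pd.to_datetime(df['Date'])
# half-hour, hour, and day buckets via pd.Grouper on the Date column
by_half_hour = df.groupby(pd.Grouper(key='Date', freq='30Min'))[emotions].sum()
by_hour = df.groupby(pd.Grouper(key='Date', freq='H'))[emotions].sum()
by_day = df.groupby(pd.Grouper(key='Date', freq='D'))[emotions].sum()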
