I am trying to apply TF-IDF to my data (using the code by Dr. W.J.B. Mattingly: https://github.com/wjbmattingly/topic_modeling_textbook/blob/main/lessons/02_tf_idf_official.py): descriptions of startups from the StartupBlink website.
I cannot figure out how to properly extract the individual words: right now the output is a single string with all the words run together, like the one below (you will also notice lots of empty lists inside):
[['qualitygeotechnicalinvestigationtestinggeotechnicalreportspreconditiondevelopmentideasnewprojectimplementationintensivefieldlaboratorytestingsnecessaryobtaininputdatasoillayerscapacitysettlementcategorizationqualitymaterials']
import string
import numpy
import pandas as pd
import requests
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

s = requests.Session()
df = pd.DataFrame()
for p in tqdm(range(2000)):
    r = s.get(f'https://www.startupblink.com/api/entities?entity=startups&page={p}')
    d = pd.json_normalize(r.json()['page'])
    df = pd.concat([df, d], axis=0, ignore_index=True)
df.to_csv('World_startups.csv')

# selecting only ESG-related startups
esg = df[df['subindustry_name'].isin(['Energy', 'Energy & Environment-Other', 'Smart Cities', 'Smart Home',
                                      'Public Transportation', 'Sustainability',
                                      'Transportation-Other', 'Waste Management'])]
esg = esg[['title', 'description', 'subindustry_name']]
description = esg.description.tolist()
#description = description.remove(np.nan)
def remove_stopwords(text, stops):
    words = text.split()
    final = []
    for word in words:
        if word not in stops:
            final.append(word)
    final = "".join(final)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return final
def clean_docs(docs):
    stops = stopwords.words('english')
    final = []
    for doc in docs:
        clean_doc = remove_stopwords(doc, stops)
        final.append(clean_doc)
    return final

cleaned_docs = clean_docs(description)
vectorizer = TfidfVectorizer(lowercase=True,
                             max_features=100,
                             # max_df=.9,  # percentage
                             # min_df=2,   # number of
                             ngram_range=(1, 3),  # up to trigrams
                             stop_words='english')
vectors = vectorizer.fit_transform(cleaned_docs)
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()

# Printing all unique dense values as a mid-way check
densearray = numpy.array(denselist)
print(numpy.unique(densearray))
all_keywords = []
for d in denselist:
    x = 0
    keywords = []
    for word in d:
        if word > 0:
            keywords.append(feature_names[x])
        x = x + 1
    all_keywords.append(keywords)
all_keywords[7]
print(len(all_keywords))

# the list contains lots of empty lists inside - will remove them
all_keywords = [ele for ele in all_keywords if ele != []]
print('')
print(len(all_keywords))
print(all_keywords[7])
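For reference, a minimal sketch (not from the original post; the function name is just illustrative) of a cleaning function that keeps the words separated: the substantive difference from remove_stopwords above is that the kept words are joined back with a space rather than an empty string, which is what currently glues every description into one long token.

import string
from nltk.corpus import stopwords

def remove_stopwords_spaced(text, stops):
    # keep only non-stopwords, then join them back with spaces
    words = [w for w in text.split() if w not in stops]
    final = " ".join(words)
    # strip punctuation and digits, then collapse repeated spaces
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join(ch for ch in final if not ch.isdigit())
    while "  " in final:
        final = final.replace("  ", " ")
    return final.strip()

# usage: cleaned_docs = [remove_stopwords_spaced(d, stopwords.words('english')) for d in description]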
I'm an undergrad new to Python Tkinter. I'm trying to replicate this graph in a frame called frame_info. What I don't understand is that, unlike other Tkinter examples, the ax here is a subplot.
The code below reads the data from a CSV into a DataFrame and sorts it according to its columns.
def getcompreddata(self, hotel_name1, hotel_name2):
    # Back-end read data
    # BE_reviews_df = pd.read_csv("i removed this file location",
    #                             skip_blank_lines=True, header=0)
    BE_reviews_df = pd.read_csv("C:/UNI/PROJECTS/pythonProject/INF1002_project/Input/updatedcsv_hotelsg.csv",
                                skip_blank_lines=True, header=0)

    ####################################### Data Frame for Hotel 1 ##########################################
    def dataframe1(hotelname1):
        BE_reviews_df_1 = BE_reviews_df[BE_reviews_df['name'] == hotelname1]
        pd.set_option('display.max_columns', 10)
        # create the label
        BE_reviews_df_1["review.rating"] = BE_reviews_df_1["reviews.rating"]
        BE_reviews_df_1["is_bad_review"] = BE_reviews_df_1["reviews.rating"].apply(lambda x: 1 if x < 5 else 0)
        # select only relevant columns
        BE_reviews_df_1 = BE_reviews_df_1[["name", "reviews.text", "review.rating", "is_bad_review"]]
        BE_reviews_df_1.head()
        # Reviews data is sampled in order to speed up computations
        # BE_reviews_df = BE_reviews_df.sample(frac=0.1, replace=False, random_state=42)
        # remove 'No Negative' or 'No Positive' from text
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].apply(
            lambda x: x.replace("No Negative", "").replace("No Positive", ""))
        # Drop all nan text
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].astype(str)
        BE_reviews_df_1.drop(BE_reviews_df_1[BE_reviews_df_1["reviews.text"] == "nan"].index, inplace=True)
        # remove 'No Negative' or 'No Positive' from text
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].apply(
            lambda x: x.replace("No Negative", "").replace("No Positive", ""))
        # clean text data
        regexp = RegexpTokenizer(r'\w+')
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].apply(regexp.tokenize)
        BE_reviews_df_1["reviews.text"] = BE_reviews_df_1["reviews.text"].apply(
            lambda x: ' '.join([item for item in x if len(item) > 2]))
        BE_reviews_df_1["review_clean"] = BE_reviews_df_1["reviews.text"].apply(lambda x: clean_text(x))
        # manual Categories Tagging
        BE_reviews_df_1['category'] = BE_reviews_df_1['review_clean'].apply(lambda x: categories_tagging(x))
        # Sentiment Analyzer
        sid = SentimentIntensityAnalyzer()
        BE_reviews_df_1["sentiments"] = BE_reviews_df_1["reviews.text"].apply(lambda x: sid.polarity_scores(x))
        BE_reviews_df_1 = pd.concat(
            [BE_reviews_df_1.drop(['sentiments'], axis=1), BE_reviews_df_1['sentiments'].apply(pd.Series)], axis=1)
        # Create new variable with sentiment "neutral," "positive" and "negative"
        BE_reviews_df_1['sentiment'] = BE_reviews_df_1['compound'].apply(
            lambda x: 'positive' if x > 0 else 'neutral' if x == 0 else 'negative')
        # sort data by name
        BE_reviews_df_1.sort_values(by=['name'], inplace=True)
        # Add emotion hotel 1
        BE_reviews_df_1['emotion'] = BE_reviews_df_1['review_clean'].apply(lambda x: NRCLex(x).raw_emotion_scores)
        # add number of characters column
        BE_reviews_df_1["nb_chars"] = BE_reviews_df_1["reviews.text"].apply(lambda x: len(x))
        # add number of words column
        BE_reviews_df_1["nb_words"] = BE_reviews_df_1["reviews.text"].apply(lambda x: len(x.split(" ")))
        tfidf = TfidfVectorizer(min_df=10)
        tfidf_result = tfidf.fit_transform(BE_reviews_df_1["review_clean"]).toarray()
        tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
        tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
        tfidf_df.index = BE_reviews_df_1.index
        BE_reviews_df_1 = pd.concat([BE_reviews_df_1, tfidf_df], axis=1)
        BE_reviews_df_1.head()
        # highest positive sentiment reviews (with more than 5 words)
        BE_reviews_df_1[BE_reviews_df_1["nb_words"] >= 5].sort_values("pos", ascending=False)[
            ["reviews.text", "pos"]].head(10)
        return BE_reviews_df_1
    ####################################### Data Frame for Hotel 2 ##########################################
    def dataframe2(hotelname2):
        BE_reviews_df_2 = BE_reviews_df[BE_reviews_df['name'] == hotelname2]
        pd.set_option('display.max_columns', 10)
        # create the label
        BE_reviews_df_2["review.rating"] = BE_reviews_df_2["reviews.rating"]
        BE_reviews_df_2["is_bad_review"] = BE_reviews_df_2["reviews.rating"].apply(lambda x: 1 if x < 5 else 0)
        # select only relevant columns
        BE_reviews_df_2 = BE_reviews_df_2[["name", "reviews.text", "review.rating", "is_bad_review"]]
        BE_reviews_df_2.head()
        # Reviews data is sampled in order to speed up computations
        # BE_reviews_df = BE_reviews_df.sample(frac=0.1, replace=False, random_state=42)
        # Drop all nan text
        BE_reviews_df_2["reviews.text"] = BE_reviews_df_2["reviews.text"].astype(str)
        BE_reviews_df_2.drop(BE_reviews_df_2[BE_reviews_df_2["reviews.text"] == "nan"].index, inplace=True)
        # remove 'No Negative' or 'No Positive' from text
        BE_reviews_df_2["reviews.text"] = BE_reviews_df_2["reviews.text"].apply(
            lambda x: x.replace("No Negative", "").replace("No Positive", ""))
        # clean text data
        regexp = RegexpTokenizer(r'\w+')
        BE_reviews_df_2["reviews.text"] = BE_reviews_df_2["reviews.text"].apply(regexp.tokenize)
        BE_reviews_df_2["reviews.text"] = BE_reviews_df_2["reviews.text"].apply(
            lambda x: ' '.join([item for item in x if len(item) > 2]))
        BE_reviews_df_2["review_clean"] = BE_reviews_df_2["reviews.text"].apply(lambda x: clean_text(x))
        # manual Categories Tagging
        BE_reviews_df_2['category'] = BE_reviews_df_2['review_clean'].apply(lambda x: categories_tagging(x))
        # Sentiment Analyzer
        sid = SentimentIntensityAnalyzer()
        BE_reviews_df_2["sentiments"] = BE_reviews_df_2["reviews.text"].apply(lambda x: sid.polarity_scores(x))
        BE_reviews_df_2 = pd.concat(
            [BE_reviews_df_2.drop(['sentiments'], axis=1), BE_reviews_df_2['sentiments'].apply(pd.Series)], axis=1)
        # Create new variable with sentiment "neutral," "positive" and "negative"
        BE_reviews_df_2['sentiment'] = BE_reviews_df_2['compound'].apply(
            lambda x: 'positive' if x > 0 else 'neutral' if x == 0 else 'negative')
        # sort data by name
        BE_reviews_df_2.sort_values(by=['name'], inplace=True)
        # Add emotion hotel 2
        BE_reviews_df_2['emotion'] = BE_reviews_df_2['review_clean'].apply(lambda x: NRCLex(x).raw_emotion_scores)
        # add number of characters column
        BE_reviews_df_2["nb_chars"] = BE_reviews_df_2["reviews.text"].apply(lambda x: len(x))
        # add number of words column
        BE_reviews_df_2["nb_words"] = BE_reviews_df_2["reviews.text"].apply(lambda x: len(x.split(" ")))
        tfidf = TfidfVectorizer(min_df=10)
        tfidf_result = tfidf.fit_transform(BE_reviews_df_2["review_clean"]).toarray()
        tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
        tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
        tfidf_df.index = BE_reviews_df_2.index
        BE_reviews_df_2 = pd.concat([BE_reviews_df_2, tfidf_df], axis=1)
        BE_reviews_df_2.head()
        # highest positive sentiment reviews (with more than 5 words)
        BE_reviews_df_2[BE_reviews_df_2["nb_words"] >= 5].sort_values("pos", ascending=False)[
            ["reviews.text", "pos"]].head(10)
        return BE_reviews_df_2

    # Comparison plot
    plt_compare_number_of_pos_neg_review(self, hotel_name1, dataframe1(hotel_name1), hotel_name2, dataframe2(hotel_name2))
This is the code I'm confused about. How do I get this data into a Tkinter frame?
def plt_compare_number_of_pos_neg_review(self, hotelname1, dataframe1, hotelname2, dataframe2):
    # create new data frames for all sentiments
    review_neg1 = dataframe1[dataframe1["sentiment"] == "negative"]
    review_neu1 = dataframe1[dataframe1["sentiment"] == "neutral"]
    review_pos1 = dataframe1[dataframe1["sentiment"] == "positive"]
    review_neg2 = dataframe2[dataframe2["sentiment"] == "negative"]
    review_neu2 = dataframe2[dataframe2["sentiment"] == "neutral"]
    review_pos2 = dataframe2[dataframe2["sentiment"] == "positive"]

    # function for calculating the percentage of all the sentiments
    def calc_percentage(x, y):
        return x / y * 100

    pos_per1 = calc_percentage(len(review_pos1), len(dataframe1))
    neg_per1 = calc_percentage(len(review_neg1), len(dataframe1))
    neu_per1 = calc_percentage(len(review_neu1), len(dataframe1))
    pos_per2 = calc_percentage(len(review_pos2), len(dataframe2))
    neg_per2 = calc_percentage(len(review_neg2), len(dataframe2))
    neu_per2 = calc_percentage(len(review_neu2), len(dataframe2))

    # Python dictionary
    hotel1Andhotel2 = {hotelname1: [float(format(neg_per1, '.1f')), float(format(neu_per1, '.1f')),
                                    float(format(pos_per1, '.1f'))],
                       hotelname2: [float(format(neg_per2, '.1f')), float(format(neu_per2, '.1f')),
                                    float(format(pos_per2, '.1f'))]}
    index = ['Negative', 'Neutral', 'Positive']
    # Python dictionary into a pandas DataFrame
    dataFrame = pd.DataFrame(data=hotel1Andhotel2)
    dataFrame.index = index

    figure3 = plt.Figure(figsize=(5, 4), dpi=75)
    ax = dataFrame.plot.barh(rot=15, title="Sentiment Analysis of Both Reviews")
    for container in ax.containers:
        ax.bar_label(container)
    canvas = FigureCanvasTkAgg(figure3, self.frame_info)
    canvas.get_tk_widget().grid(row=1, column=0, columnspan=2, padx=10, pady=10)
So far I've gotten this result.
The graph I'm trying to replicate looks like this.
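A likely fix, sketched below (this is not the original code; it reuses dataFrame and self.frame_info from above): the bar chart has to be drawn on an Axes that belongs to the Figure passed to FigureCanvasTkAgg. As written, dataFrame.plot.barh() creates its own separate figure, so figure3 stays empty inside the Tkinter frame.

import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

figure3 = plt.Figure(figsize=(5, 4), dpi=75)
ax = figure3.add_subplot(111)                      # the "subplot" other examples refer to
dataFrame.plot.barh(ax=ax, rot=15,                 # let pandas draw on this Axes
                    title="Sentiment Analysis of Both Reviews")
for container in ax.containers:
    ax.bar_label(container)

canvas = FigureCanvasTkAgg(figure3, self.frame_info)   # embed that same figure in the frame
canvas.draw()
canvas.get_tk_widget().grid(row=1, column=0, columnspan=2, padx=10, pady=10)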
I have used this code for category detection:
import numpy as np

# Words -> category
categories = {word: key for key, words in data.items() for word in words}

# Load the whole embedding matrix
embeddings_index = {}
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        embed = np.array(values[1:], dtype=np.float32)
        embeddings_index[word] = embed
print('Loaded %s word vectors.' % len(embeddings_index))

# Embeddings for available words
data_embeddings = {key: value for key, value in embeddings_index.items() if key in categories.keys()}
# Processing the query
def process(query):
    query_embed = embeddings_index[query]
    scores = {}
    for word, embed in data_embeddings.items():
        category = categories[word]
        dist = query_embed.dot(embed)
        dist /= len(data[category])
        scores[category] = scores.get(category, 0) + dist
    return scores

# Testing
print(process('pizza'))
OUTPUT
{'service': 6.385544379552205, 'ambiance': 3.5752111077308655, 'Food': 12.912149047851562}
Is there a way to get only the highest-scoring category, like Food?
def process(query):
    query_embed = embeddings_index[query]
    scores = {}
    for word, embed in data_embeddings.items():
        category = categories[word]
        dist = query_embed.dot(embed)
        dist /= len(data[category])
        scores[category] = scores.get(category, 0) + dist
    return max(scores, key=scores.get)
You can use max() for this; it returns the key with the maximum value.
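Applied to the scores printed above, for example:

scores = {'service': 6.385544379552205, 'ambiance': 3.5752111077308655, 'Food': 12.912149047851562}
print(max(scores, key=scores.get))  # -> Food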
I am working on an information retrieval project where I have to process ~1.5 GB of text data and create a dictionary (word, document frequency) and posting list (document id, term frequency). According to the professor, it should take around 10-15 minutes, but my code has been running for more than 8 hours now! I tried a smaller dataset (~35 MB) and it took 5 hours to process.
I am a newbie in Python, and I think it is taking so long because I have created many Python dictionaries and lists in my code. I tried to use a generator, but I am not sure how to work with it.
file = open(filename, 'rt')
text = file.read()
file.close()

p = r'<P ID=\d+>.*?</P>'
tag = RegexpTokenizer(p)
passage = tag.tokenize(text)
doc_re = re.compile(r"<P ID=(\d+)>")

def process_data(docu):
    tokens = RegexpTokenizer(r'\w+')
    lower_tokens = [word.lower() for word in tokens.tokenize(docu)]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in lower_tokens]
    alpha = [word for word in stripped if word.isalpha()]
    stopwordlist = stopwords.words('english')
    stopped = [w for w in alpha if not w in stopwordlist]
    return stopped

data = {}
for doc in passage:
    group_docID = doc_re.match(doc)
    docID = group_docID.group(1)
    tokens = process_data(doc)
    data[docID] = list(set(tokens))
vocab = [item for i in data.values() for item in i]
total_vocab = list(set(vocab))
total_vocab.sort()

print('Document Size = ', len(data))
print('Collection Size = ', len(vocab))
print('Vocabulary Size= ', len(total_vocab))

inv_index = {}
for x in total_vocab:
    for y, z in data.items():
        if x in z:
            wordfreq = z.count(x)
            inv_index.setdefault(x, []).append((int(y), wordfreq))

flattend = [item for tag in inv_index.values() for item in tag]
posting = [item for tag in flattend for item in tag]
doc_freq = []
for k, v in inv_index.items():
    freq1 = len([item for item in v if item])
    doc_freq.append(freq1)

# offset value of each vocabulary word
offset = []
offset1 = 0
for i in range(len(doc_freq)):
    if i > 0:
        offset1 = offset1 + (doc_freq[i - 1] * 2)
    offset.append(offset1)

# create dictionary of words, document frequency and offset
dictionary = {}
for i in range(len(total_vocab)):
    dictionary[total_vocab[i]] = (doc_freq[i], offset[i])

# dictionary of word, inverse document frequency
idf = {}
for i in range(len(dictionary)):
    a = np.log2(len(data) / doc_freq[i])
    idf[total_vocab[i]] = a

with open('dictionary.json', 'w') as f:
    json.dump(dictionary, f)
with open('idf.json', 'w') as f:
    json.dump(idf, f)

binary_file = open('binary_file.txt', 'wb')
for i in range(0, len(posting)):
    binary_int = (posting[i]).to_bytes(4, byteorder='big')
    binary_file.write(binary_int)
binary_file.close()
Could someone please help me rewrite this code so that it becomes more computationally efficient?
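Not a drop-in replacement, just a sketch of the usual single-pass approach (it reuses passage, doc_re and process_data from above): build the postings while iterating over the documents once, counting term frequencies with a Counter, instead of scanning every document for every vocabulary term. Converting the stop-word list inside process_data to a set, built once rather than per document, also removes a large constant factor.

from collections import Counter

inv_index = {}                               # term -> [(docID, term frequency), ...]
for doc in passage:
    docID = int(doc_re.match(doc).group(1))
    counts = Counter(process_data(doc))      # term frequencies for this document
    for term, freq in counts.items():
        inv_index.setdefault(term, []).append((docID, freq))

# document frequency is just the length of each postings list
doc_freq = {term: len(postings) for term, postings in inv_index.items()}
total_vocab = sorted(inv_index)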
I want to create a very basic Q&A chatbot. Given a list of questions and answers that I use as my dataset, I want to train it to return relevant answers, depending on a hard-coded question (different every time). First I tokenize and clean up the text, then use cosine similarity, but it gives me an error, which is (I guess) a pickle issue.
UPDATED
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import pickle
import os.path
import re, math
tokenizer = RegexpTokenizer(r'\w+')
stopwords = stopwords.words('english')
extra_stopwords = stopwords + ['I', 'can']
WORD = re.compile(r'\w+')
def get_clean_data():
    clean_data_set = {
        'questions': {},
        'answers': {}
    }
    reader = csv.reader(open('data.csv', 'r', encoding="utf-8"))
    tags = []
    counter = 0
    for r in reader:
        question = str(r[0].encode('utf-8'))
        answer = str(r[1].encode('utf-8'))
        _, tags_question = get_tags(question)
        _, tags_answer = get_tags(answer)
        clean_data_set['answers'][answer] = tags_answer + tags_question
        clean_data_set['questions'][question] = text_to_vector(question)
        counter += 1
        # hardcode the number :)
        print(counter, ' out of 746')

    # pickle.dump(clean_data_set, open('dump.dict', 'wb'))
    with open('dump.dict', 'wb') as my_dump_file:
        pickle.dump(clean_data_set, my_dump_file)
def get_tags(text, use_set=True):
    tokens = tokenizer.tokenize(text)
    # remove stop words from tokens
    # make it lower case
    filtered_words = [word.lower() for word in tokens if word not in extra_stopwords]
    # return non duplicate values by default
    if use_set == True:
        filterd_words = list(set(filtered_words))
    return Counter(filtered_words), filtered_words

# simple cosine similarity measure
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)
# question_set is the data we had
def get_cosine_value(question, question_set):
    question_vector = text_to_vector(question)
    cosine = get_cosine(question_vector, question_set)
    return cosine

def answer_question(question, top=5):
    with open('dump.dict', 'rb') as my_dump_file:
        data_set = pickle.load(my_dump_file)
    # data_set = pickle.load(open('dump.dict', 'rb'))
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
    _, question_tags = get_tags(question)
    ranking_dict = {}
    similar_questions_rank = {}
    for entry in data_set['answers']:
        tags = data_set['answers'][entry]
        # rank is the intersection between the list of tags from the question
        # and the list of tags associated to answers
        rank = len(set(question_tags).intersection(tags))
        ranking_dict[entry] = rank
    for entry in data_set['questions']:
        cosine_similarity = get_cosine_value(question, data_set['questions'][entry])
        similar_questions_rank[entry] = cosine_similarity
    sorted_similarity_dict = sorted(similar_questions_rank.items(), key=lambda x: x[1], reverse=True)
    sorted_ranking_dict = sorted(ranking_dict.items(), key=lambda x: x[1], reverse=True)
    # sort them by rank
    for item in sorted_ranking_dict[0:top-1]:
        print('Rank: ', item[1])
        print('Answer: ', item[0])
        print('\n\n')
    # sort them by rank
    for item in sorted_similarity_dict[0:top-1]:
        print('Rank: ', item[1])
        print('Question: ', item[0])

#get_clean_data()
question = 'why all these errors?'
answer_question(question)
This is the updated error message:
Traceback (most recent call last):
  File "C:\Users\joasa\Desktop\si\main.py", line 133, in <module>
    answer_question(question)
  File "C:\Users\joasa\Desktop\si\main.py", line 94, in answer_question
    data_set = pickle.load(my_dump_file)
EOFError: Ran out of input
[Finished in 1.4s]
Can someone help, please? I have no idea what to do. Thanks in advance.
I think it comes from this line in your get_clean_data function:
pickle.dump(clean_data_set, open('dump.dict', 'w'))
Here you open the file for writing but never close it, so the pickled data may never be flushed to disk and the file can end up empty or truncated; when you later try to read it, pickle runs out of input. To avoid this kind of problem, use a context manager block:
with open('dump.dict', 'wb') as my_dump_file:
pickle.dump(clean_data_set, my_dump_file)
That way, whichever way you exit the with block, you are guaranteed to close your file.
You should also do the same when loading your pickle dump in answer_question:
with open('dump.dict', 'rb') as my_dump_file:
data_set = pickle.load(my_dump_file)
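One more thing worth checking (an observation about the posted code, not part of the fix above): in the updated script the call to get_clean_data() is commented out, so if dump.dict was never written, or was left empty by an earlier failed run, pickle.load() will still raise EOFError: Ran out of input. A minimal guard before loading, as a sketch:

import os

# rebuild the pickle if it is missing or empty before trying to load it
if not os.path.exists('dump.dict') or os.path.getsize('dump.dict') == 0:
    get_clean_data()

with open('dump.dict', 'rb') as my_dump_file:
    data_set = pickle.load(my_dump_file)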
I'm working on a Sentiment Analysis project using Twitter Data, and I've encountered a small problem regarding Dates. The code itself runs fine, but I don't know how to build custom time blocks for grouping my final data. Right now, it is defaulting to grouping them by the second, which is not very useful. I want to be able to group them in half-hour, hour, and day segments...
Feel free to skip to the bottom of the code to see where the issue lies!
Here is the code:
import tweepy
API_KEY = "XXXXX"
API_SECRET = "XXXXXX"
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)
import sklearn as sk
import pandas as pd
import got3
#"Get Old Tweets" to find older data
tweetCriteria = got3.manager.TweetCriteria()
tweetCriteria.setQuerySearch("Kentucky Derby")
tweetCriteria.setSince("2016-05-07")
tweetCriteria.setUntil("2016-05-08")
tweetCriteria.setMaxTweets(1000)
TweetCriteria = got3.manager.TweetCriteria()
KYDerby_tweets = got3.manager.TweetManager.getTweets(tweetCriteria)
from afinn import Afinn
afinn = Afinn()
#getting afinn library to use for sentiment polarity analysis
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    Id = x.id
    print(Text)
AllText = []
AllRetweets = []
AllFavorites = []
AllDates = []
AllIDs = []
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    AllText.append(Text)
    AllRetweets.append(Retweets)
    AllFavorites.append(Favorites)
    AllDates.append(Date)
    AllIDs.append(Id)
data_set = [[x.id, x.date, x.text, x.retweets, x.favorites]
            for x in KYDerby_tweets]
df = pd.DataFrame(data=data_set, columns=["Id", "Date", "Text", "Favorites", "Retweets"])
#I now have a DataFrame with my basic info in it
pscore = []
for x in KYDerby_tweets:
    afinn.score(x.text)
    pscore.append(afinn.score(x.text))
df['P Score'] = pscore
#I now have the pscores for each Tweet in the DataFrame
nrc = pd.read_csv('C:\\users\\andrew.smith\\downloads\\NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', sep="\t", names=["word", "emotion", "association"], skiprows=45)
#import NRC emotion lexicon
nrc = nrc[nrc["association"]==1]
nrc = nrc[nrc["emotion"].isin(["positive", "negative"]) == False]
#cleaned it up a bit
from nltk import TweetTokenizer
tt = TweetTokenizer()
tokenized = [x.lower() for x in tokenized]
#built my Tweet-specific, NRC-ready tokenizer
emotions = list(set(nrc["emotion"]))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
cv = [0] * len(emotions)
#built indices showing locations of emotions
for token in tokenized:
    sub = nrc[nrc['word'] == token]
    token_emotions = sub['emotion']
    for e in token_emotions:
        position_index = emotion2index[e]
        cv[position_index] += 1
emotions = list(set(nrc['emotion']))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
def makeEmoVector(tweettext):
    cv = [0] * len(emotions)
    tokenized = tt.tokenize(tweettext)
    tokenized = [x.lower() for x in tokenized]
    for token in tokenized:
        sub = nrc[nrc['word'] == token]
        token_emotions = sub['emotion']
        for e in token_emotions:
            position_index = emotion2index[e]
            cv[position_index] += 1
    return cv

tweettext = df.iloc[14, :]['Text']

emotion_vectors = []
for text in df['Text']:
    emotion_vector = makeEmoVector(text)
    emotion_vectors.append(emotion_vector)
ev = pd.DataFrame(emotion_vectors, index=df.index, columns=emotions)
#Now I have a DataFrame with all of the emotion counts for each tweet
Date_Group = df.groupby("Date")
Date_Group[emotions].agg("sum")
#Finally, we arrive at the problem! When I run this, I end up with tweets that are grouped by the second. What I want is to be able to group them: a) by the half-hour, b) by the hour, and c) by the day
The default date format for tweets with the Tweepy API is "2017-04-14 18:41:56", so to get tweets grouped by the hour you can do something as simple as this:
# This will get the time parameter
time = [item.split(" ")[1] for item in df['date'].values]
# This will get the hour parameter
hour = [item.split(":")[0] for item in time]
df['time'] = hour
grouped_tweets = df[['time', 'number_tweets']].groupby('time')
tweet_growth_hour = grouped_tweets.sum()
tweet_growth_hour['time'] = tweet_growth_hour.index
print(tweet_growth_hour)
To group by day, you can do something similar:
days = [item.split(" ")[0] for item in df['date'].values]
df['days'] = days
grouped_tweets = df[['days', 'number_tweets']].groupby('days')
tweet_growth_days = grouped_tweets.sum()
tweet_growth_days['days'] = tweet_growth_days.index
print(tweet_growth_days)
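The string splitting above covers hours and days but not the half-hour case from the question. A sketch (not from the answer above) using pandas' own time grouping, assuming the Date column holds timestamps (pd.to_datetime also handles strings) and reusing df, ev and emotions from the question:

# attach the tweet timestamps to the emotion counts (ev shares df's index)
combined = ev.join(df['Date'])
combined['Date'] = pd.to_datetime(combined['Date'])

# group the emotion counts into 30-minute bins; use freq='H' for hourly or freq='D' for daily
half_hourly = combined.groupby(pd.Grouper(key='Date', freq='30min'))[emotions].sum()
hourly = combined.groupby(pd.Grouper(key='Date', freq='H'))[emotions].sum()
daily = combined.groupby(pd.Grouper(key='Date', freq='D'))[emotions].sum()
print(half_hourly.head())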