I am trying to create a sentiment analysis program. The tweets to be analyzed are read from a CSV file, and after they are analyzed, the results are written to a different CSV file. However, I get the error AttributeError: 'list' object has no attribute 'lower'. The error seems to come from this part of the code. Is this operation not allowed for a sentence read from a CSV file?
def processTweet(tweet):
    # process the tweets
    # Convert to lower case
    tweet = tweet.lower()
    # Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert @username to AT_USER
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    # Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # trim
    tweet = tweet.strip('\'"')
    return tweet
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    # read the stopwords
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end

#start getfeatureVector
def getFeatureVector(tweet, stopWords):
    featureVector = []
    words = tweet.split()
    for w in words:
        # replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        # strip punctuation
        w = w.strip('\'"?,.')
        # check if it consists of only words
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
        # ignore if it is a stopWord
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end
Here is the full code
#import regex
import re
import csv
import pprint
import nltk.classify

#start replaceTwoOrMore
def replaceTwoOrMore(s):
    # look for 2 or more repetitions of character
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

#start process_tweet
def processTweet(tweet):
    # process the tweets
    # Convert to lower case
    tweet = tweet.lower()
    # Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert @username to AT_USER
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    # Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # trim
    tweet = tweet.strip('\'"')
    return tweet
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    # read the stopwords
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end

#start getfeatureVector
def getFeatureVector(tweet, stopWords):
    featureVector = []
    words = tweet.split()
    for w in words:
        # replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        # strip punctuation
        w = w.strip('\'"?,.')
        # check if it consists of only words
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
        # ignore if it is a stopWord
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end

#start extract_features
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
#end

#Read the tweets one by one and process it
inpTweets = csv.reader(open('data/sampleTweets.csv', 'rb'), delimiter=',', quotechar='"')
stopWords = getStopWordList('data/feature_list/stopwords.txt')
count = 0
featureList = []
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet, stopWords)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
#end loop

# Remove featureList duplicates
featureList = list(set(featureList))

# Generate the training set
training_set = nltk.classify.util.apply_features(extract_features, tweets)

# Train the Naive Bayes classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

# Test the classifier
# testTweet = 'RT @Jewelz2611 #mashable #apple, iphones r 2 expensive. Most went w/ htc/galaxy. No customer loyalty w/phone comp..'
with open('data/test_datasets.csv', 'r') as csvinput:
    with open('data/test_datasets_output.csv', 'w') as csvoutput:
        writer = csv.writer(csvoutput, lineterminator='\n')
        reader = csv.reader(csvinput)
        all = []
        row = next(reader)
        for row in reader:
            processedTestTweet = processTweet(row)
            sentiment = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet, stopWords)))
            row.append(sentiment)
            all.append(row)
        writer.writerows(all)
# print "testTweet = %s, sentiment = %s\n" % (testTweet, sentiment)
The traceback and error are as follows:
Traceback (most recent call last):
  File "simpleDemo.py", line 114, in <module>
    processedTestTweet = processTweet(row)
  File "simpleDemo.py", line 19, in processTweet
    tweet = tweet.lower()
AttributeError: 'list' object has no attribute 'lower'
Any help would be really appreciated. Thanks!
You pass row (a whole CSV row, which is a list) to processTweet(), but processTweet() expects a string. You probably want processTweet(row[1]).
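For example, a minimal sketch of the corrected test loop, assuming the test CSV keeps the tweet text in its second column like the training file does:

for row in reader:
    # row is a list of column values; pass only the tweet text (a string)
    processedTestTweet = processTweet(row[1])
    features = extract_features(getFeatureVector(processedTestTweet, stopWords))
    sentiment = NBClassifier.classify(features)
    row.append(sentiment)
    all.append(row)
writer.writerows(all)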
I'm writing a PDF reader that extracts information from energy-bill PDFs in a directory. The program runs and prints exactly the information I want to 'store'.
The next step is:
Export the values from each bill to Excel, exactly as they are printed in the console (see the console image).
I've tried lists and dictionaries, maybe in the wrong way, but none of my attempts was successful.
Any improvements, or other smarter ways to do the same thing, are welcome.
Follow the code:
import glob
from PyPDF2 import PdfFileReader

pdf_dir = "C:/Users/gabri/Desktop/py4e/Contas EDP AAP/Leitor PDF/Faturas"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
path = pdf_files

for file in pdf_files:
    with open(file, 'rb') as f:
        pdf = PdfFileReader(f)
        page = pdf.getPage(1)
        text = page.extractText()
        text = text.split()
        for word in text:
            if word.endswith('Créditos'):
                word = word.replace("Créditos", "")
                word = word.replace("mês", "")
                word = word.replace("kWh", "")
                energia_injetada = word
                print('Energia injetada: ', energia_injetada)
            elif word.endswith('Saldo'):
                word = word.replace("mês", "")
                word = word.replace("Saldo", "")
                word = word.replace("Participação", "")
                word = word.replace("kWh", "")
                energia_injetada = word
                if 'Recebido' not in word:
                    print('Energia injetada: ', energia_injetada)
                elif word.endswith('Saldo'):
                    word = word.replace("Recebido", "")
                    word = word.replace("kWhSaldo", "")
                    recebidos = word
                    print(recebidos)
            elif word.endswith('Participação'):
                word = word.replace("mês", "")
                word = word.replace("Saldo", "")
                word = word.replace("Participação", "")
                word = word.replace("kWh", "")
                energia_injetada = word
                print('Saldo Atualizado: ', energia_injetada)
            elif word.startswith('Verde'):
                print('Bandeira verde')
            elif word.startswith('Vermelha:'):
                print('Bandeira vermelha')
            elif word.startswith('Amarela'):
                print('Bandeira Amarela: ')
            elif word.startswith('('):
                word = word.replace("(", "")
                data_inicial = word
                print('Data inicial: ', data_inicial)
            elif word.endswith(')Nº'):
                word = word.replace(")Nº", "")
                data_final1 = word
                print("Data Final: ", data_final1)
            elif word.endswith(")Agradecemos"):
                word = word.replace(")Agradecemos", "")
                data_final = word
                print('Data final: ', data_final)
            elif word.startswith('Saldo'):
                word = word.replace("Saldo", "")
                participacao_saldo = word
                print('Participação no Saldo: ', participacao_saldo)
                print('\n\n')
            else:
                continue

if __name__ == '__main__':
    print('ok')
You can use print() with a file argument:
with open('out.csv', 'w') as fileout:
    with open(file, 'rb') as f:
        # some code
        print("hi", "data", file=fileout, sep=";")
I am using the HDBSCAN algorithm to create clusters from the documents I have. To build a vector matrix from the words I am currently using tf-idf, but I want to use GloVe or Word2vec (because tf-idf is based on bag-of-words, so it can't capture semantics).
Which method should I use: GloVe, Word2vec, or another method appropriate for text clustering?
And how can I implement it?
Any help will be highly appreciated!
import csv
import re
import string
import sys

import hdbscan
import nltk
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

title = []
synopses = []
filename = "twitter-test-dataset.csv"
num_clusters = 10
pkl_file = "doc_cluster.pkl"
generate_pkl = False

# pre-process data
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)
    # extracting field names through first row
    fields = next(csvreader)
    # extracting each data row one by one
    duplicates = 0
    for row in csvreader:
        # removes the characters specified
        line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes RT
        line = re.sub(r'https?:\/\/.*[\r\n]*', '',
                      line, flags=re.MULTILINE)  # remove link
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        line = (re.sub(
            "(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line, flags=re.MULTILINE))
        line = ''.join(filter(lambda x: x in string.printable,
                              line))  # filter non-ascii characters
        if line not in synopses:
            synopses.append(line)
            title.append(row[2])
        else:
            duplicates += 1

print("Removed " + str(duplicates) + " rows")

stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(
        text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens


totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    # for each item in 'synopses', tokenize/stem
    allwords_stemmed = tokenize_and_stem(i)
    # extend the 'totalvocab_stemmed' list
    totalvocab_stemmed.extend(allwords_stemmed)
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized}, index=totalvocab_stemmed)
# print "there are " + str(vocab_frame.shape[0]) + " items in vocab_frame"

# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.0, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))

#CREATE TFIDF MATRIX
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names()

c = hdbscan.HDBSCAN(min_cluster_size=5)
#PASS TFIDF_MATRIX TO HDBSCAN
c.fit(tfidf_matrix)
print(c.labels_)
sys.exit()
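One way to bring in semantics, as a hedged sketch rather than a drop-in replacement: train (or load) a gensim Word2Vec model on the tokenized synopses, average the word vectors of each document, and pass that dense matrix to HDBSCAN instead of tfidf_matrix. The tokenize_only() helper and synopses list are reused from the code above; the gensim parameters are illustrative assumptions.

import numpy as np
from gensim.models import Word2Vec

# tokenize each document with the tokenize_only() helper defined above
tokenized_docs = [tokenize_only(doc) for doc in synopses]

# vector_size/window/min_count are illustrative; tune them for your corpus
# (older gensim versions call vector_size "size")
w2v = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=2)

def doc_vector(tokens, model):
    # average the vectors of the words the model knows; zero vector if none are known
    known = [model.wv[t] for t in tokens if t in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

doc_matrix = np.vstack([doc_vector(toks, w2v) for toks in tokenized_docs])

clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
clusterer.fit(doc_matrix)  # dense document vectors instead of the sparse tf-idf matrix
print(clusterer.labels_)

A pre-trained GloVe or Word2vec model can be substituted for the locally trained model in the same way; only the lookup of each word's vector changes.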
I want to tokenize and sort reviews by keywords, but there is a problem opening the JSON file, and the program throws an error: JSONDecodeError: Extra data: line 1 column 884 (char 883).
The files test2.json and keywords.txt are here:
https://github.com/SilverYar/TransportDataMiner
Here is my code:
import nltk
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords
import nltk, string, json

st = RussianStemmer()


def tokenize_me(file_text):
    # applying nltk tokenization
    tokens = nltk.word_tokenize(file_text)
    # deleting punctuation symbols
    tokens = [i for i in tokens if (i not in string.punctuation)]
    # deleting stop_words
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokens = [i for i in tokens if (i not in stop_words)]
    # cleaning words
    tokens = [i.replace('«', '').replace('»', '') for i in tokens]
    return tokens


with open('C:\\Creme\\token\\keywords.txt') as fin:
    ww = fin.read().split(', ')
key_words = list(set([st.stem(w) for w in ww]))

with open('C:\\Creme\\token\\test2.json') as fin:
    text = json.load(fin)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        #for d in dd:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        #break
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
Help me find the error and fix the code.
You cannot do this, because it leads to a malformed JSON file:
for dd in text:
    if tt:
        json.dump(dd, fout)  # <<-- cannot do this in the loop
        fout.write('\n')
Basically, the file should be written all at once, with a single dump() or dumps() call. So you have to build the full list first, then write it out to the file:
bad_words_list = []
for dd in text:
    words = tokenize_me(dd['description'])
    split_text = list(set([st.stem(word) for word in words]))
    tt = list(filter(lambda w: w in key_words, split_text))
    if tt:
        bad_words_list.append(dd)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    json.dump(bad_words_list, fout)
I solved it a little differently: read the file in and reformat the string into valid JSON:
with open('C:\\Creme\\token\\test2.json', 'r', encoding='utf8') as fin:
    data = fin.read()
    formated_text = data.replace('}{', '},{')
    text = json.loads(f'[{formated_text}]')

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        #for d in dd:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        #break
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
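A note on the string patching: data.replace('}{', '},{') works only as long as '}{' never occurs inside a string value. A sketch of a more robust way to split concatenated JSON objects, using json.JSONDecoder.raw_decode (the file path is the one from the question):

import json

def load_concatenated_json(path):
    # repeatedly decode one object and continue from where the parser stopped
    decoder = json.JSONDecoder()
    objects = []
    with open(path, 'r', encoding='utf8') as fin:
        data = fin.read()
    pos = 0
    while pos < len(data):
        obj, end = decoder.raw_decode(data, pos)
        objects.append(obj)
        # skip any whitespace between objects
        while end < len(data) and data[end].isspace():
            end += 1
        pos = end
    return objects

text = load_concatenated_json('C:\\Creme\\token\\test2.json')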
I have two text files. The 1st file contains English sentences and the 2nd file contains a number of English words (vocabulary). I want to remove from the sentences in the 1st file those words which are not present in the vocabulary, and then save the processed text back into the 1st file.
I wrote code that finds the sentences containing words that are not in our 2nd file (the vocabulary).
Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
    print("I:", i)
    file1 = open("MyFile.txt", "a+")
    if i in file1:
        print("Sentence already exist")
    else:
        file1.write(i)
However, I am not able to remove those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    f.seek(0)
    f.write(result)
    f.truncate()
#Read the two files
with open('vocab30000.txt') as f:
    vocabulary = set(i.strip() for i in f.readlines())

with open('eng.txt') as f:
    eng = f.readlines()

eng_sentences = [i.split(" ") for i in eng]
cleaned_sentences = []
# loop over the sentences and exclude words that are not in the vocabulary
for sent in eng_sentences:
    cleaned_sentences.append(" ".join([i for i in sent if i.strip() in vocabulary]) + "\n")

#write the cleaned sentences back to the first file
with open('eng.txt', 'w') as f:
    f.writelines(cleaned_sentences)
You can try this code. I avoided explicit loops to keep the runtime down in case you have larger files.
import re

with open('eng.txt', 'r') as f:
    s = f.read()
s_copy = s

# replace punctuation with spaces so the word list is clean
punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
pattern = re.compile("(" + "|".join(re.escape(p) for p in punctuation) + ")", re.I)
s_copy = pattern.sub(" ", s_copy)
s_words = s_copy.split()

with open('vocab30000.txt', 'r') as f:
    check_words = set(f.read().split())

# words that appear in eng.txt but not in the vocabulary
remove_words = [w for w in set(s_words) - check_words if w]
pattern = re.compile("\\b(" + "|".join(re.escape(w) for w in remove_words) + ")\\W", re.I)
s = pattern.sub("", s)

# save the processed text back into the first file
with open('eng.txt', 'w') as f:
    f.write(s)
I have tried performing sentiment analysis on a huge data set of about 10,000 sentences. When I use the NLTK Python code for training and testing with Naive Bayes, I have to retrain the classifier every time I need to classify a set of new sentences, and this takes a lot of time. Is there a way I can save the output of the training part and then reuse it for classification, which would save a lot of time? This is the NLTK code that I have used:
import nltk
import re
import csv

#Read the tweets one by one and process it
def processTweet(tweet):
    # process the tweets
    # convert to lower case
    tweet = tweet.lower()
    # Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert @username to AT_USER
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    # Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # trim
    tweet = tweet.strip('\'"')
    return tweet


def replaceTwoOrMore(s):
    # look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    # read the stopwords file and build a list
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('url')
    stopWords.append('URL')
    stopWords.append('rt')
    fp = open(stopWordListFileName)
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end

#start getfeatureVector
def getFeatureVector(tweet):
    featureVector = []
    # split tweet into words
    words = tweet.split()
    for w in words:
        # replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        # strip punctuation
        w = w.strip('\'"?,.')
        # check if the word starts with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        # ignore if it is a stop word
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end

def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features


inpTweets = csv.reader(open('sheet3.csv', 'rb'), delimiter=',')
stopWords = getStopWordList('stopwords.txt')
featureList = []

# Get tweet words
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
#end loop

# Remove featureList duplicates
featureList = list(set(featureList))

# Extract feature vector for all tweets in one shot
training_set = nltk.classify.util.apply_features(extract_features, tweets)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

ft = open("april2.tsv")
line = ft.readline()
fo = open("dunno.tsv", "w")
fo.seek(0, 0)
while line:
    testTweet = line
    processedTestTweet = processTweet(testTweet)
    fo.write(NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))) + "\n")
    line = ft.readline()
fo.close()
ft.close()
If you want to stick with NLTK, try pickle, e.g. as done in https://spaghetti-tagger.googlecode.com/svn/spaghetti.py (see https://docs.python.org/2/library/pickle.html):
#-*- coding: utf8 -*-
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from cPickle import dump, load

def loadtagger(taggerfilename):
    infile = open(taggerfilename, 'rb')
    tagger = load(infile); infile.close()
    return tagger

def traintag(corpusname, corpus):
    # Function to save tagger.
    def savetagger(tagfilename, tagger):
        outfile = open(tagfilename, 'wb')
        dump(tagger, outfile, -1); outfile.close()
        return
    # Training UnigramTagger.
    uni_tag = ut(corpus)
    savetagger(corpusname + '_unigram.tagger', uni_tag)
    # Training BigramTagger.
    bi_tag = bt(corpus)
    savetagger(corpusname + '_bigram.tagger', bi_tag)
    print "Tagger trained with", corpusname, "using " + \
          "UnigramTagger and BigramTagger."
    return
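Applied to the classifier in the question, a minimal sketch of the same idea (the pickle file name is arbitrary):

import pickle

# after training once:
with open('nb_classifier.pickle', 'wb') as out:
    pickle.dump(NBClassifier, out)

# later, in another run, load it instead of retraining:
with open('nb_classifier.pickle', 'rb') as inp:
    NBClassifier = pickle.load(inp)

print(NBClassifier.classify(extract_features(getFeatureVector(processTweet("some new tweet")))))

Note that extract_features() relies on the featureList built at training time, so that list needs to be pickled (or rebuilt the same way) as well.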
Otherwise, try other machine learning libraries such as sklearn or shogun.
The Naive Bayes classifier module in NLTK is breathtakingly slow because it is a pure Python implementation. For this reason, consider using a different machine learning (ML) library such as scikit-learn.
YS-L's tip about cPickle is good for your purposes at the moment, but if you ever have to retrain the classifier, it would probably be best to switch to a different Naive Bayes implementation.
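For example, NLTK ships a wrapper that lets you keep the same feature dictionaries while training a scikit-learn model underneath; a sketch, assuming the training_set and helper functions from the question:

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB

# same (feature_dict, label) training data as before, but a much faster backend
sk_classifier = SklearnClassifier(MultinomialNB())
sk_classifier.train(training_set)

print(sk_classifier.classify(extract_features(getFeatureVector(processTweet("some new tweet")))))

The trained sk_classifier can be pickled and reloaded in exactly the same way as the NLTK classifier above.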