It seems the format is, for every line, a string like 'word number number ...', so it should be easy to split.
But when I split the lines with the script below:
import numpy as np

def loadGloveModel(gloveFile):
    print "Loading Glove Model"
    f = open(gloveFile, 'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print "Done.", len(model), " words loaded!"
    return model
I load glove.840B.300d.txt, but I get an error. When I print splitLine I get:
['contact', 'name#domain.com', '0.016426', '0.13728', '0.18781', '0.75784', '0.44012', '0.096794' ... ]
or
['.', '.', '.', '.', '0.033459', '-0.085658', '0.27155', ...]
Please note that this script works fine with the glove.6B.* files.
The code works fine for the glove.6B.*d.txt and glove.42B.*d.txt files, but not for glove.840B.300d.txt. This is because glove.840B.300d.txt contains spaces inside some words. For example, it has a word like '. . .', with spaces between those dots. I solved this problem by changing this line:
splitLine = line.split()
into
splitLine = line.split(' ')
So your code should look like this:
import numpy as np

def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile, 'r', encoding='utf8')
    model = {}
    for line in f:
        splitLine = line.split(' ')
        word = splitLine[0]
        embedding = np.asarray(splitLine[1:], dtype='float32')
        model[word] = embedding
    print("Done.", len(model), "words loaded!")
    return model
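As far as I can tell, the reason the single-space split helps is that some tokens in the 840B file contain unusual whitespace (for example a non-breaking space), which split() treats as a delimiter but split(' ') does not. A small hypothetical example (the token below is made up, not taken from the file) shows the difference:

line = u'contact\xa0name#domain.com 0.016426 0.13728'
print(line.split())     # ['contact', 'name#domain.com', '0.016426', '0.13728']  - token broken apart
print(line.split(' '))  # ['contact\xa0name#domain.com', '0.016426', '0.13728']  - token kept intact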
I think the following may help:
import numpy as np

def process_glove_line(line, dim):
    word = None
    embedding = None
    try:
        splitLine = line.split()
        word = " ".join(splitLine[:len(splitLine) - dim])
        embedding = np.array([float(val) for val in splitLine[-dim:]])
    except:
        print(line)
    return word, embedding

def load_glove_model(glove_filepath, dim):
    with open(glove_filepath, encoding="utf8") as f:
        content = f.readlines()
        model = {}
        for line in content:
            word, embedding = process_glove_line(line, dim)
            if embedding is not None:
                model[word] = embedding
        return model

model = load_glove_model("glove.840B.300d.txt", 300)
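As a quick illustration of why anchoring on the last dim fields handles multi-token "words" (the line below is a made-up three-dimensional example, not from the real file):

word, embedding = process_glove_line(". . . 0.033459 -0.085658 0.27155", 3)
print(word)       # . . .
print(embedding)  # [ 0.033459 -0.085658  0.27155 ]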
I have two text files. The 1st file contains English sentences and the 2nd file contains a number of English words (a vocabulary). I want to remove from the sentences in the 1st file those words which are not present in the vocabulary, and then save the processed text back into the 1st file.
I wrote code with which I am able to find the sentences that contain words not present in the 2nd file (the vocabulary).
Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
            print("I:", i)
            file1 = open("MyFile.txt", "a+")
            if i in file1:
                print("Sentence already exist")
            else:
                file1.write(i)
However, I am not able to remove those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    f.seek(0)
    f.write(result)
    f.truncate()
# Read the two files
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())
with open('eng.txt') as f:
    eng = f.readlines()

eng_sentences = [i.split(" ") for i in eng]
cleaned_sentences = []
# loop over the sentences and drop words that are not in the vocabulary
for sent in eng_sentences:
    cleaned_sentences.append(" ".join([i.strip() for i in sent if i.strip() in vocabulary]) + "\n")
# write the result back to the sentence file
with open('eng.txt', 'w') as f:
    f.writelines(cleaned_sentences)
You can try this code. I tried to avoid explicit loops to keep the runtime down if you have larger files.
import re

with open('eng.txt', 'r') as f:
    s = f.read()
s_copy = s
# strip punctuation so it does not end up in the word set
punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
pattern = re.compile("(" + "|".join(re.escape(p) for p in punctuation) + ")", re.I)
s_copy = pattern.sub(" ", s_copy)
s_words = s_copy.split()

with open('vocab30000.txt', 'r') as f:
    check_words = set(f.read().split())

# words that appear in the sentences but not in the vocabulary
remove_words = [w for w in set(s_words) - check_words if w]
pattern = re.compile(r"\b(" + "|".join(re.escape(w) for w in remove_words) + r")\b", re.I)
s = pattern.sub("", s)

with open('eng.txt', 'w') as f:
    f.write(s)
I am new to Python, so please excuse me if I am asking a very simple question.
I am trying to read each line from a text file, predict the sentiment of each line, and write the output to the end of that line in the text file. For that I am trying to append data to the end of the line.
My text file looks like this:
I am awesome.
I am terrible.
I am bad.
What I am trying to achieve is below:
I am awesome. - Positive
I am terrible. - Negative
I am bad. - Negative
When I run the code, the file is being saved as empty. Please help.
My code is below:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names

def word_feats(words):
    return dict([(word, True) for word in words])

positive_vocab = ['awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)']
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']

positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]

train_set = negative_features + positive_features
classifier = NaiveBayesClassifier.train(train_set)

# Predict
neg = 0
pos = 0
f = open("test.txt", "r")
for sentence in f.readlines():
    sentence = sentence.lower()
    words = sentence.split(' ')
    for word in words:
        classResult = classifier.classify(word_feats(word))
        if classResult == 'neg':
            f.write(' negative')
        if classResult == 'pos':
            f.write(' positive')
f.close()
You can't write to a file that is open in 'r' mode - that mode is for reading.
My suggestion is to open the file for reading, and open a second file and write out to that. So something like:
f = open("test.txt", "r")
out_file = open("output.txt", "w")
for sentence in f.readlines():
orig = sentence
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
classResult = classifier.classify(word_feats(word))
if classResult == 'neg':
out_file.write(orig + ' negative')
if classResult == 'pos':
out_file.write(orig + ' positive')
f.close()
out_file.close()
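If the goal is the exact "sentence - Positive/Negative" format from the question, a minimal variation could look like this (just a sketch, reusing classifier and word_feats from the question and taking a simple majority vote over the per-word results):

f = open("test.txt", "r")
out_file = open("output.txt", "w")
for sentence in f.readlines():
    words = sentence.lower().split(' ')
    # classify every word, then take a majority vote for the whole sentence
    votes = [classifier.classify(word_feats(word)) for word in words]
    label = 'Positive' if votes.count('pos') >= votes.count('neg') else 'Negative'
    out_file.write(sentence.rstrip('\n') + ' - ' + label + '\n')
f.close()
out_file.close()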
You are opening the file in read mode. You would need to open the file in a mode that allows writing, for example:
f = open('test.txt', 'w')
Keep in mind that 'w' truncates the existing contents, so read the lines into memory before reopening the file for writing.
I have two files. One creates a numpy array in compressed sparse row format
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from nltk.stem.porter import PorterStemmer
import numpy as np
import string
import csv
import json
import pdb

def stem_document(document):
    translatedict = ""
    stemmer = PorterStemmer()
    for word in string.punctuation:
        translatedict = translatedict + word
    doc_stemmed = []
    for word in document.split():
        lowerstrippedword = ''.join(c for c in word.lower() if c not in translatedict)
        try:
            stemmed_word = stemmer.stem(lowerstrippedword)
            doc_stemmed.append(stemmed_word)
        except:
            print lowerstrippedword + " could not be stemmed."
    return ' '.join(doc_stemmed)

def readFileandStem(filestring):
    with open(filestring, 'r') as file:
        reader = csv.reader(file)
        file_extras = []
        vector_data = []
        error = False
        while (error == False):
            try:
                next = reader.next()
                if len(next) == 3 and next[2] != "":
                    document = next[2]
                    stemmed_document = stem_document(document)
                    vector_data.append(stemmed_document)
                    file_extra = []
                    file_extra.append(next[0])
                    file_extra.append(next[1])
                    file_extras.append(file_extra)
            except:
                error = True
    return [vector_data, file_extras]

filestring = 'Data.csv'
print "Reading File"
data = readFileandStem(filestring)
documents = data[0]
file_extras = data[1]

print "Vectorizing Data"
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(documents)
tf_idf_transform = TfidfTransformer(use_idf=False).fit(matrix)
tf_idf_matrix = tf_idf_transform.transform(matrix)

with open('matrix/matrix.npy', 'w') as matrix_file:
    np.save(matrix_file, tf_idf_matrix)

file_json_map = {}
file_json_map['extras'] = file_extras
with open('matrix/extras.json', 'w') as extras_file:
    extras_file.write(json.dumps(file_json_map))
print "finished"
The next file is supposed to load the same file...
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
import json
import pdb

with open('matrix/matrix.npy', 'r') as matrix_file:
    matrix = np.load(matrix_file)

hcluster = linkage(matrix, "complete")
However, I get the following error:
File "Cluster.py", line 7, in <module>
matrix = np.load(matrix_file)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\npyio.py", line 406, in load
pickle_kwargs=pickle_kwargs)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 620, in read_array
version = read_magic(fp)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 216, in read_magic
raise ValueError(msg % (MAGIC_PREFIX, magic_str[:-2]))
ValueError: the magic string is not correct; expected '\x93NUMPY', got '\x00\x00I\x1c\x00\x00'
I don't know why the magic string would be incorrect because from what I've looked into, all .npy files are supposed to have the same magic string "\x93NUMPY".
Ideas?
I encountered a similar issue before.
Changing
open('matrix/matrix.npy', 'w')
...
open('matrix/matrix.npy', 'r')
to
open('matrix/matrix.npy', 'wb')
...
open('matrix/matrix.npy', 'rb')
solved my problem.
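For reference, here is a minimal round trip; if you pass filenames instead of file objects, np.save and np.load open the files in binary mode for you, which avoids this class of problem on Windows:

import numpy as np

arr = np.arange(6).reshape(2, 3)
np.save('matrix/matrix.npy', arr)       # writes the file in binary mode internally
loaded = np.load('matrix/matrix.npy')   # reads it back with the correct magic string
print(loaded)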
I have tried performing sentiment analysis on a huge data set of about 10000 sentences. Now, when I use the NLTK Python code for training and testing with Naive Bayes, I have to train the classifier each time I need to classify a set of new sentences. This is taking a lot of time. Is there a way I can save the output of the training part and then reuse it for classification, which would save a lot of time? This is the NLTK code that I have used:
import nltk
import re
import csv

#Read the tweets one by one and process them
def processTweet(tweet):
    # process the tweets
    #convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

def replaceTwoOrMore(s):
    #look for 2 or more repetitions of a character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords file and build a list
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('url')
    stopWords.append('URL')
    stopWords.append('rt')
    fp = open(stopWordListFileName)
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end

#start getfeatureVector
def getFeatureVector(tweet):
    featureVector = []
    #split tweet into words
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word starts with a letter
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end

def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

inpTweets = csv.reader(open('sheet3.csv', 'rb'), delimiter=',')
stopWords = getStopWordList('stopwords.txt')
featureList = []

# Get tweet words
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
#end loop

# Remove featureList duplicates
featureList = list(set(featureList))

# Extract feature vector for all tweets in one shot
training_set = nltk.classify.util.apply_features(extract_features, tweets)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

ft = open("april2.tsv")
line = ft.readline()
fo = open("dunno.tsv", "w")
fo.seek(0, 0)
while line:
    testTweet = line
    processedTestTweet = processTweet(testTweet)
    line1 = fo.write(NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))) + "\n")
    line = ft.readline()
fo.close()
ft.close()
If you want to stick with NLTK, try pickle (see https://docs.python.org/2/library/pickle.html), e.g. as done in https://spaghetti-tagger.googlecode.com/svn/spaghetti.py:
#-*- coding: utf8 -*-
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from cPickle import dump, load

def loadtagger(taggerfilename):
    infile = open(taggerfilename, 'rb')
    tagger = load(infile); infile.close()
    return tagger

def traintag(corpusname, corpus):
    # Function to save tagger.
    def savetagger(tagfilename, tagger):
        outfile = open(tagfilename, 'wb')
        dump(tagger, outfile, -1); outfile.close()
        return
    # Training UnigramTagger.
    uni_tag = ut(corpus)
    savetagger(corpusname + '_unigram.tagger', uni_tag)
    # Training BigramTagger.
    bi_tag = bt(corpus)
    savetagger(corpusname + '_bigram.tagger', bi_tag)
    print "Tagger trained with", corpusname, "using " + \
        "UnigramTagger and BigramTagger."
    return
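Applied to the classifier from the question, a minimal sketch might look like this (assuming NBClassifier has already been trained as above; the file name is just an example):

from cPickle import dump, load

# Save the trained classifier once...
outfile = open('nb_classifier.pickle', 'wb')
dump(NBClassifier, outfile, -1); outfile.close()

# ...then, in later runs, load it instead of retraining.
infile = open('nb_classifier.pickle', 'rb')
NBClassifier = load(infile); infile.close()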
Otherwise, try other machine learning libraries such as sklearn or shogun
The Naive Bayes classifier module in NLTK is breathtakingly slow because it's a pure Python implementation. For this reason, consider using a different machine learning (ML) library such as scikit-learn.
The tip above about using cPickle is good for your purposes at the moment, but if you ever have to retrain the classifier, it would probably be best to switch to a different Naive Bayes implementation.
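For instance, here is a rough sketch of the same idea with scikit-learn (the toy data and file name are made up; the point is that the fitted vectorizer and classifier can both be persisted and reloaded without retraining):

import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

texts = ["i love this", "i hate this"]   # toy training data
labels = ["pos", "neg"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
clf = MultinomialNB().fit(X, labels)

# Persist both objects so nothing has to be retrained next time.
with open("sk_model.pickle", "wb") as out:
    pickle.dump((vectorizer, clf), out)

with open("sk_model.pickle", "rb") as inp:
    vectorizer, clf = pickle.load(inp)
print(clf.predict(vectorizer.transform(["i love it"])))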
I am trying to run a sentiment analysis. I have managed to use Naive Bayes through nltk to classify a corpus of negative and positive tweets. However, I do not want to go through the process of training this classifier every time I run this program, so I tried to use pickle to save it and then load the classifier in a different script. However, when I try to run the script it returns the error NameError: name 'classifier' is not defined, although I thought it was defined through def load_classifier():
The code I have at the moment is below:
import nltk, pickle
from nltk.corpus import stopwords

customstopwords = ['']

p = open('xxx', 'r')
postxt = p.readlines()
n = open('xxx', 'r')
negtxt = n.readlines()

neglist = []
poslist = []

for i in range(0, len(negtxt)):
    neglist.append('negative')
for i in range(0, len(postxt)):
    poslist.append('positive')

postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)
taggedtweets = postagged + negtagged

tweets = []
for (word, sentiment) in taggedtweets:
    word_filter = [i.lower() for i in word.split()]
    tweets.append((word_filter, sentiment))

def getwords(tweets):
    allwords = []
    for (words, sentiment) in tweets:
        allwords.extend(words)
    return allwords

def getwordfeatures(listoftweets):
    wordfreq = nltk.FreqDist(listoftweets)
    words = wordfreq.keys()
    return words

wordlist = [i for i in getwordfeatures(getwords(tweets)) if not i in stopwords.words('english')]
wordlist = [i for i in getwordfeatures(getwords(tweets)) if not i in customstopwords]

def feature_extractor(doc):
    docwords = set(doc)
    features = {}
    for i in wordlist:
        features['contains(%s)' % i] = (i in docwords)
    return features

training_set = nltk.classify.apply_features(feature_extractor, tweets)

def load_classifier():
    f = open('my_classifier.pickle', 'rb')
    classifier = pickle.load(f)
    f.close()
    return classifier

while True:
    input = raw_input('I hate this film')
    if input == 'exit':
        break
    elif input == 'informfeatures':
        print classifier.show_most_informative_features(n=30)
        continue
    else:
        input = input.lower()
        input = input.split()
        print '\nSentiment is ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n'

p.close()
n.close()
Any help would be great; the script seems to make it to the print '\nSentiment is ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n' line before returning the error.
Well, you have declared and defined the load_classifier() method but never called it and never assigned its result to a variable. That means that, by the time execution reaches the print '\nSentiment is ...' line, there is no variable named classifier, so execution naturally throws an exception.
Add the line classifier = load_classifier() just before the while loop (without any indentation).
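In context, the relevant part of the script would then look like this (just a sketch, everything else unchanged):

training_set = nltk.classify.apply_features(feature_extractor, tweets)

def load_classifier():
    f = open('my_classifier.pickle', 'rb')
    classifier = pickle.load(f)
    f.close()
    return classifier

classifier = load_classifier()   # assign the loaded classifier before the loop uses it

while True:
    input = raw_input('I hate this film')
    # ... rest of the loop is unchanged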