Term Frequency and Document Frequency of words - python

I have written some code to find the term frequency and document frequency of the words contained in the files stored at location path. Each file is passed through the function cleanDoc() to get the words from the text files, and I want to tabulate the term frequencies so that all words from all documents are considered when counting. Can anybody tell me how I should implement it? I am only using NLTK.
import collections
import os.path
import glob
import nltk

wdict = set()
path = "C://Python27//Corpus Files//*.*"

# this function cleans up a doc (removes stopwords etc)
def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 3 and token.isalpha()]
    final = [stemmer.stem(word) for word in clean]
    return final

for text in glob.glob(path):
    f = open(text)
    data = f.read()
    words = cleanDoc(data)
    wdict.update(words)

You can use the FreqDist object from nltk.probability to count these words. Later, you can navigate it using a dict-like key-value interface and methods (like freq.items() or freq['word']), or you can even plot the results using matplotlib.
import collections
import os.path
import glob
import nltk
from nltk.probability import FreqDist

term_frequency = {}
path = "C://Python27//Corpus Files//*.*"

# this function cleans up a doc (removes stopwords etc)
def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 3 and token.isalpha()]
    final = [stemmer.stem(word) for word in clean]
    return final

for text in glob.glob(path):
    f = open(text)
    data = f.read()
    words = cleanDoc(data)
    numbers_of_words = len(words)
    freq = FreqDist(words)
    # term_frequency is a dict whose structure is like:
    # {
    #     'path_to_file':
    #         {'term': 13.4, 'another_term': 15},
    #     'another_file':
    #         {'term2': 12, 'foo': 15}
    # }
    for term in freq.keys():
        if isinstance(term_frequency.get(text), dict):
            term_frequency[text][term] = freq[term] / float(numbers_of_words)
        else:
            term_frequency[text] = {term: freq[term] / float(numbers_of_words)}
Reference: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.FreqDist-class.html
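The code above fills in only the term frequency; if the document frequency from the question title (the number of files each term appears in) is also needed, a minimal sketch building on the same term_frequency dict could look like this (the name document_frequency is just illustrative):
from collections import defaultdict

# Count, for every stemmed term, how many files it occurs in,
# reusing the per-file term_frequency dict built above.
document_frequency = defaultdict(int)
for file_path, terms in term_frequency.items():
    for term in terms:
        document_frequency[term] += 1

# Example lookup (hypothetical term shown):
# print(document_frequency['word'])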

Related

Base word for same word is different using spacy

I am trying to extract the base word for an entire text; however, it behaves differently for the same word appearing at different positions. Below is the code for reference:
from nltk.stem import PorterStemmer
import spacy
import pandas as pd
ps = PorterStemmer()
# spacy.cli.download("en")
nlp = spacy.load("en_core_web_sm")
words = [ "meeting","eating", "adjustable", "meeting", "eats", "eating", "eat", "ate", "rafting", "better", "good", "best", 'coming', "ability", "steal", "stolen", 'children']
word_joined = " ".join(words)
# print(word_joined)
doc = nlp(word_joined)
words_lemma = []
words_stemm = []
words_stemm = [ps.stem(w) for w in words]
words_lemma = [w.lemma_ for w in doc]
pd.DataFrame(list(zip(words, words_stemm, words_lemma)))
Is this related to the position of the word in the sentence, from a linguistic perspective?
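Worth noting: spaCy's lemmatizer is context-sensitive, because it uses the part-of-speech tag assigned to each token, so the same surface form can receive different lemmas in different positions. A minimal sketch (assuming en_core_web_sm is installed; exact tags can vary by model version) to inspect the tag alongside the lemma:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("meeting eating adjustable meeting eats eating eat ate")

for token in doc:
    # The same surface form ("meeting") can be tagged NOUN in one position
    # and VERB in another, which yields different lemmas ("meeting" vs "meet").
    print(token.text, token.pos_, token.lemma_)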

Error with " blob = TextBlob(tweet[text])" in sentiment analysis using Python and Textblob

I'm working on a project in which I extract tweets from Twitter and run a sentiment analysis on specific keywords to draw conclusions. Unfortunately, I have come to a point where I am stumped. I have a sentiment analysis code:
When I use this: blob = TextBlob(tweet[text]) I get the following error:
Traceback (most recent call last):
  File "C:/Users/Michael/python/Sentiment2.py", line 65, in
    blob = TextBlob(tweet[text])
NameError: name 'text' is not defined
import json
import re
import operator
from textblob import TextBlob
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import os, sys, codecs
import csv
import sys
from nltk import bigrams

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:#[\w_]+)', # #-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']

fname = 'python.json'
with open(fname, 'r') as f:
    lis = []
    neg = 0.0
    n = 0.0
    net = 0.0
    pos = 0.0
    p = 0.0
    count_all = Counter()
    cout = 0
    for line in f:
        try:
            tweet = json.loads(line)
        except:
            continue
        # Create a list with all the terms
        blob = TextBlob(tweet[text])
        cout += 1
        lis.append(blob.sentiment.polarity)
        #print blob.sentiment.subjectivity
        #print (os.listdir(tweet["text"]))
        if blob.sentiment.polarity < 0:
            sentiment = "negative"
            neg += blob.sentiment.polarity
            n += 1
        elif blob.sentiment.polarity == 0:
            sentiment = "neutral"
            net += 1
        else:
            sentiment = "positive"
            pos += blob.sentiment.polarity
            p += 1
    # output sentiment
    print("Total tweets"), len(lis)
    print("Positive"), float(p/cout)*100, "%"
    print("Negative"), float(n/cout)*100, "%"
    print("Neutral"), float(net/len(lis))*100, "%"
    #print lis
    # determine if sentiment is positive, negative, or neutral
    # output sentiment
    #print sentiment
Change this
# Create a list with all the terms
blob = TextBlob(tweet[text])
to
# Create a list with all the terms
blob = TextBlob(tweet['text'])
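As a side note (an assumption, since the contents of python.json are not shown): lines from a Twitter stream dump do not always contain a 'text' key (e.g. delete notices), so a defensive lookup inside the loop avoids a KeyError. A minimal sketch:
# Skip lines that have no 'text' field instead of crashing.
text_value = tweet.get('text')
if text_value is None:
    continue  # not a regular tweet object (e.g. a delete notice)
blob = TextBlob(text_value)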

How to iterate each word through nltk synsets and store misspelled words in separate list?

I am trying to take a text file with messages and run each word through the NLTK WordNet synsets function. I want to do this because I want to create a list of misspelled words. For example, if I do:
wn.synsets('dog')
I get output:
[Synset('dog.n.01'),
Synset('frump.n.01'),
Synset('dog.n.03'),
Synset('cad.n.01'),
Synset('frank.n.02'),
Synset('pawl.n.01'),
Synset('andiron.n.01'),
Synset('chase.v.01')]
Now, if the word is misspelled, like so:
wn.synsets('doeg')
I get output:
[]
If I am returned an empty list, I want to save the misspelled word in another list, like so, while continuing to iterate through the rest of the file:
misspelled_words = ['doeg']
I am at a loss as to how to do this. Here is my code below; I would need to do the iterating after the variable chat_messages_tokenized. The names path contains words I want to drop:
import nltk
import csv
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem.snowball import SnowballStemmer

def text_function():
    #nltk.download('punkt')
    #nltk.download('averaged_perceptron_tagger')

    # Read in chat messages and names files
    chat_path = 'filepath.csv'
    try:
        with open(chat_path) as infile:
            chat_messages = infile.read()
    except Exception as error:
        print(error)
        return

    names_path = 'filepath.txt'
    try:
        with open(names_path) as infile:
            names = infile.read()
    except Exception as error:
        print(error)
        return

    chat_messages = chat_messages.split('Chats:')[1].strip()
    names = names.split('Name:')[1].strip().lower()

    chat_messages_tokenized = nltk.word_tokenize(chat_messages)
    names_tokenized = nltk.word_tokenize(names)

    # adding part of speech (pos) tag and dropping proper nouns
    pos_drop = pos_tag(chat_messages_tokenized)
    chat_messages_tokenized = [SnowballStemmer('english').stem(word.lower()) for word, pos in pos_drop if pos != 'NNP' and word not in names_tokenized]

    for chat_messages_tokenized
        if not wn.synset(chat_messages_tokenized):
            print('empty list')

    # for s in wn.synsets('dog'):
    #     lemmas = s.lemmas()
    #     for l in lemmas:
    #         if l.name() == stemmer:
    #             print (l.synset())

    csv_path = 'OutputFilePath.csv'
    try:
        with open(csv_path, 'w') as outfile:
            writer = csv.writer(outfile)
            for word in chat_messages_tokenized:
                writer.writerow([word])
    except Exception as error:
        print(error)
        return

if __name__ == '__main__':
    text_function()
Thank you in advance.
You already have the pseudocode in your explanation; you can just code it as you have explained, as follows:
misspelled_words = []  # The list to store misspelled words

for word in chat_messages_tokenized:  # loop through each word
    if not wn.synsets(word):  # if there is no synset for this word
        misspelled_words.append(word)  # add it to the misspelled word list

print(misspelled_words)
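One caveat: the question stems tokens with SnowballStemmer before the lookup, and stems such as 'messag' are often not valid WordNet lemmas, so correctly spelled words can still end up in the misspelled list. A minimal sketch that normalizes with wn.morphy instead of a stemmer, purely as an illustration (find_misspelled is a hypothetical helper name):
from nltk.corpus import wordnet as wn

def find_misspelled(tokens):
    # Collect tokens that WordNet does not recognize in any normalized form.
    misspelled = []
    for word in tokens:
        base = wn.morphy(word.lower())  # WordNet's own morphological normalizer
        if not wn.synsets(base if base else word.lower()):
            misspelled.append(word)
    return misspelled

print(find_misspelled(['dog', 'doeg', 'meetings']))  # expected: ['doeg']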

How to find unique words for each text file in a bundle of text files using python?

How can I find only the words that are unique to a text file? If a word is used frequently in other files, then it gets dropped.
Here is a reference http://sahandsaba.com/visualizing-philosophers-and-scientists-by-the-words-they-used-with-d3js-and-python.html
I need a script which loops through all text files in a folder and outputs the results in Json format.
My code so far:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from pprint import pprint as pp
from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os

def get_raw_data():
    texts = []
    for x in range(1, 95):
        file_name = str(x+1) + ".txt"
        with codecs.open(file_name, "rU", "utf-8") as myfile:
            data = myfile.read()
        texts.append(data)
        yield file_name, '\n'.join(texts)

class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words

def process_text(counts, vectorizer, text, file_name, index):
    result = {w: counts[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}
    result = {w: c for w, c in result.iteritems() if c > 4}
    normalizing_factor = max(c for c in result.itervalues())
    result = {w: c / normalizing_factor
              for w, c in result.iteritems()}
    return result

def main():
    data = list(get_raw_data())
    print('Data loaded')
    n = len(data)
    vectorizer = CountVectorizer(stop_words='english', min_df=(n-1) / n, tokenizer=StemTokenizer())
    counts = vectorizer.fit_transform(text for p, text in data).toarray()
    print('Vectorization done.')
    print(counts)
    for x in range(95):
        file_name = str(x+1) + ".txt"
        # print (text)
        for i, (text) in enumerate(data):
            print(file_name)
            # print (text)
            with codecs.open(file_name, "rU", "utf-8") as myfile:
                text = myfile.read()
            result = process_text(counts, vectorizer, text, file_name, i)
            print(result)

if __name__ == '__main__':
    main()
Looks like you've got a bunch of files named 1.txt, 2.txt, ... 95.txt, and you want to find words that occur in one file only. I'd just gather all words, counting how many files each one occurs in; and print out the singletons.
from collections import Counter
import re

fileids = [ str(n+1)+".txt" for n in range(95) ]
filecounts = Counter()
for fname in fileids:
    with open(fname) as fp:  # Add encoding if really needed
        text = fp.read().lower()
    words = re.split(r"\W+", text)  # Keep letters, drop the rest
    filecounts.update(set(words))

singletons = [ word for word in filecounts if filecounts[word] == 1 ]
print(" ".join(singletons))
Done. You don't need scikit, you don't need the nltk, you don't need a pile of IR algorithms. You can use the list of singletons in an IR algorithm, but that's a different story.
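If the per-file JSON output asked about in the question is the goal, a minimal extension of the same idea (assuming the 1.txt ... 95.txt naming; the output filename unique_words.json is just a placeholder) could remember which file each word came from and dump the words unique to each file:
import json
import re
from collections import Counter

fileids = [ str(n+1)+".txt" for n in range(95) ]
file_words = {}          # filename -> set of words in that file
filecounts = Counter()   # word -> number of files containing it

for fname in fileids:
    with open(fname) as fp:
        words = set(re.split(r"\W+", fp.read().lower()))
    words.discard("")    # re.split can yield empty strings
    file_words[fname] = words
    filecounts.update(words)

# Words that appear in exactly one file, grouped by that file
unique_per_file = { fname: sorted(w for w in words if filecounts[w] == 1)
                    for fname, words in file_words.items() }

with open("unique_words.json", "w") as out:
    json.dump(unique_per_file, out, indent=2)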
def parseText():
    # oFile: text file to test
    # myWord: word we are looking for
    # Get all lines into a list
    aLines = oFile.readlines()
    # Loop over the lines to test if the word is found
    for sLine in aLines:
        # Parse the line (split on spaces), returns list
        aLine = sLine.split()
        # Iterate words and test to see if they match our word
        for sWord in aLine:
            # if it matches, append it to our list
            if sWord == myWord: aWords.append( sWord )

# Create empty list to store all instances of the word that we may find
aWords = []
# Open the text file to test
oFile = open( str( raw_input( 'file to search: ' ) ) )
# Prompt user to know what word to search
myWord = str( raw_input( 'what word to search: ' ) )
# Call function
parseText()
# Check if list has at least one element
if len( aWords ) < 1: print 'Word not found in file'
else: print str( len( aWords ) ) + ' instances of our word found in file'

How to tweak the NLTK Python code in such a way that I train the classifier only once

I have tried performing sentiment analysis on a huge data set of about 10,000 sentences. Now, when I use the NLTK Python code for training and testing with Naive Bayes, I have to train the classifier each time I need to classify a set of new sentences. This is taking a lot of time. Is there a way I can take the output of the training part and then use it for classification, which would save a lot of time? This is the NLTK code that I have used.
import nltk
import re
import csv

#Read the tweets one by one and process it
def processTweet(tweet):
    # process the tweets
    #convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert #username to AT_USER
    tweet = re.sub('#[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords file and build a list
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('url')
    stopWords.append('URL')
    stopWords.append('rt')
    fp = open(stopWordListFileName)
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end

#start getFeatureVector
def getFeatureVector(tweet):
    featureVector = []
    #split tweet into words
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word starts with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if(w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end

def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

inpTweets = csv.reader(open('sheet3.csv', 'rb'), delimiter=',')
stopWords = getStopWordList('stopwords.txt')
featureList = []

# Get tweet words
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
#end loop

# Remove featureList duplicates
featureList = list(set(featureList))

# Extract feature vector for all tweets in one shot
training_set = nltk.classify.util.apply_features(extract_features, tweets)

NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

ft = open("april2.tsv")
line = ft.readline()

fo = open("dunno.tsv", "w")
fo.seek(0, 0)

while line:
    testTweet = line
    processedTestTweet = processTweet(testTweet)
    line1 = fo.write(NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))) + "\n")
    line = ft.readline()

fo.close()
ft.close()
If you want to stick with NLTK, try pickle, e.g. https://spaghetti-tagger.googlecode.com/svn/spaghetti.py, see https://docs.python.org/2/library/pickle.html :
#-*- coding: utf8 -*-
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from cPickle import dump, load

def loadtagger(taggerfilename):
    infile = open(taggerfilename, 'rb')
    tagger = load(infile); infile.close()
    return tagger

def traintag(corpusname, corpus):
    # Function to save tagger.
    def savetagger(tagfilename, tagger):
        outfile = open(tagfilename, 'wb')
        dump(tagger, outfile, -1); outfile.close()
        return
    # Training UnigramTagger.
    uni_tag = ut(corpus)
    savetagger(corpusname+'_unigram.tagger', uni_tag)
    # Training BigramTagger.
    bi_tag = bt(corpus)
    savetagger(corpusname+'_bigram.tagger', bi_tag)
    print "Tagger trained with", corpusname, "using " +\
        "UnigramTagger and BigramTagger."
    return
Otherwise, try other machine learning libraries such as sklearn or shogun
The Naive Bayes Classifier module in NLTK is breathtakingly slow because it's a pure Python implementation. For this reason, consider using a different Machine Learning (ML) library like sci-kit learn.
YS-L's tip about using cPickle is good for your purposes at the moment, but if you ever have to retrain the classifier, it'd probably be best to switch to a different Naive Bayes implementation.
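To apply the pickle idea directly to the classifier in the question, a minimal sketch (the filename my_classifier.pickle is just an assumption) would train once, save the model, and reload it in later runs:
import pickle

# After training once, as in the question:
# NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
with open('my_classifier.pickle', 'wb') as out:
    pickle.dump(NBClassifier, out)

# In a later run, skip training and just reload the saved model:
with open('my_classifier.pickle', 'rb') as inp:
    NBClassifier = pickle.load(inp)

# Classification then works as before:
# label = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet)))
Note that extract_features depends on featureList, so that list would need to be saved and reloaded as well for the features to match the trained model.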
