NLP clustering documents - python

I am using the HDBSCAN algorithm to create clusters from my documents. To create a vector matrix from the words I am currently using the tf-idf algorithm, but I want to use GloVe or Word2vec (because tf-idf is based on bag-of-words, so it can't capture semantics).
Which method should I use - GloVe, Word2vec, or any other method appropriate for text clustering?
And how can I implement it?
Any help will be highly appreciated!
import csv
import re
import string
import sys

import hdbscan
import nltk
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')  # needed by sent_tokenize / word_tokenize

title = []
synopses = []
filename = "twitter-test-dataset.csv"
num_clusters = 10
pkl_file = "doc_cluster.pkl"
generate_pkl = False

# pre-process data
with open(filename, 'r') as csvfile:
    # create a csv reader object
    csvreader = csv.reader(csvfile)
    # extract the field names from the first row
    fields = next(csvreader)
    # extract each data row one by one
    duplicates = 0
    for row in csvreader:
        # remove the characters specified
        line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # remove RT
        line = re.sub(r'https?:\/\/.*[\r\n]*', '',
                      line, flags=re.MULTILINE)  # remove links
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        line = re.sub(r"(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                      " ", line, flags=re.MULTILINE)
        # keep printable (ASCII) characters only
        line = ''.join(c for c in line if c in string.printable)
        if line not in synopses:
            synopses.append(line)
            title.append(row[2])
        else:
            duplicates += 1

print("Removed " + str(duplicates) + " rows")

stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, so punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word, so punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens


totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    # for each item in 'synopses', tokenize/stem
    allwords_stemmed = tokenize_and_stem(i)
    # extend the 'totalvocab_stemmed' list
    totalvocab_stemmed.extend(allwords_stemmed)
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized}, index=totalvocab_stemmed)
# print("there are " + str(vocab_frame.shape[0]) + " items in vocab_frame")

# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.0, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))

# CREATE TFIDF MATRIX
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names()

# PASS TFIDF_MATRIX TO HDBSCAN
c = hdbscan.HDBSCAN(min_cluster_size=5)
c.fit(tfidf_matrix)
print(c.labels_)
sys.exit()
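To get document vectors that capture semantics, one option is to average pre-trained GloVe/Word2vec word vectors per document and cluster the resulting dense matrix with HDBSCAN. The following is only a sketch, not part of the original post; it assumes gensim 4.x is installed and reuses synopses and tokenize_only from the code above:

import numpy as np
import gensim.downloader as api
import hdbscan

# download/load pre-trained 50-dimensional GloVe vectors via gensim
word_vectors = api.load("glove-wiki-gigaword-50")

def document_vector(tokens, kv):
    # average the vectors of in-vocabulary tokens; zero vector if none match
    vecs = [kv[t] for t in tokens if t in kv.key_to_index]
    return np.mean(vecs, axis=0) if vecs else np.zeros(kv.vector_size)

# dense document matrix instead of the sparse tf-idf matrix
doc_matrix = np.vstack([document_vector(tokenize_only(doc), word_vectors)
                        for doc in synopses])

clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
clusterer.fit(doc_matrix)
print(clusterer.labels_)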

Related

deleting a specific line from a dataframe python NLP

I am trying to preprocess my data for an NLP model. I wrote this code to remove numbers, symbols and hyperlinks, but now I want to delete every line that contains the word 'system', and I can't figure out how to do that. df is my dataframe and df['Content'] is the column that holds the text I want to delete the lines from.
For example, the text can be:
"system: hi im the line that is meant to be deleted
Leena: this line must not be deleted
system: hi again im the line that is meant to be deleted "
The output should be:
Leena: this line must not be deleted
import re
import string

from nltk.tokenize import word_tokenize


def CleaningTXT(df):
    Allchat = list()
    lines = df['Content'].values.tolist()
    for text in lines:
        text = text.lower()
        # remove links
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        # remove session join/leave
        pattern = re.compile('new party join session')
        text = pattern.sub('', text)
        pattern = re.compile('new party leave session')
        text = pattern.sub('', text)
        # remove symbols
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        # separate into words
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove numbers
        words = [word for word in stripped if word.isalpha()]
        words = ' '.join(words)
        Allchat.append(words)
    return Allchat
I hope I understand your request correctly.
Try the following:
def CleaningTXT(df):
    Allchat = list()
    # Added: drop every row whose Content mentions 'system'
    index_to_drop = df[df['Content'].str.contains('system')].index
    df.drop(index_to_drop, inplace=True)
    lines = df['Content'].values.tolist()
    for text in lines:
        text = text.lower()
        # remove links
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        # remove session join/leave
        pattern = re.compile('new party join session')
        text = pattern.sub('', text)
        pattern = re.compile('new party leave session')
        text = pattern.sub('', text)
        # remove symbols
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        # separate into words
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove numbers
        words = [word for word in stripped if word.isalpha()]
        words = ' '.join(words)
        Allchat.append(words)
    return Allchat
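Note that str.contains('system') drops a whole row whenever the word appears anywhere in it. If, as in the example, a single Content cell holds several chat lines and only the ones beginning with "system:" should go, a per-line filter is an alternative. This is just a sketch, not part of the original answer:

def drop_system_lines(df):
    # keep only the lines inside each cell that do not start with "system:"
    df['Content'] = df['Content'].apply(
        lambda text: '\n'.join(line for line in text.splitlines()
                               if not line.lstrip().lower().startswith('system:')))
    return df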

How to find and match each element of a list in each sentence?

I have a file containing some sentences. I used polyglot for Named Entity Recognition and stored all detected entities in a list. Now I want to check whether any entity, or pair of entities, exists in each sentence, and show that to me.
Here is what I did:
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
test = Text(input_file, hint_language_code='fa')

list_entity = []
for sent in test.sentences:
    #print(sent[:10], "\n")
    for entity in test.entities:
        list_entity.append(entity)

for i in range(len(test)):
    m = test.entities[i]
    n = test.words[m.start: m.end]  # it shows only word not tag
    if str(n).split('.')[-1] in test:  # if each entities exist in each sentence
        print(n)
It gives me an empty list.
Input:
sentence1: Bill Gate is the founder of Microsoft.
sentence2: Trump is the president of USA.
Expected output:
Bill Gate, Microsoft
Trump, USA
Output of list_entity:
I-PER(['Trump']), I-LOC(['USA'])
How can I check whether I-PER(['Trump']), I-LOC(['USA']) is in the first sentence?
For starters, you were adding the whole text file's entities to the list.
Entities should be collected per sentence of the polyglot object.
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='fa')

list_entity = []
for sentence in file.sentences:
    for entity in sentence.entities:
        #print(entity)
        list_entity.append(entity)

print(list_entity)
Now you don't have an empty list.
As for your problem with identifying the entity terms:
I have not found a way to construct an entity by hand, so the following simply checks whether there are entities with the same terms. A Chunk can contain multiple strings, so we go through them iteratively.
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='ar')


def check_sentence(entities_list, sentence):  ## Check if string terms
    for term in entities_list:                ## are in any of the entities
        ## Compare each Chunk in the list to each Chunk
        ## object in the sentence and see if there's any matches.
        if any(any(entityTerm == term for entityTerm in entityObject)
               for entityObject in sentence.entities):
            pass
        else:
            return False
    return True


sentence_number = 1  # Which sentence to check
sentence = file.sentences[sentence_number]
entity_terms = ["Bill",
                "Gates"]

if check_sentence(entity_terms, sentence):
    print("Entity Terms " + str(entity_terms) +
          " are in the sentence. '" + str(sentence) + "'")
else:
    print("Sentence '" + str(sentence) +
          "' doesn't contain terms " + str(entity_terms))
Once you find a way to generate arbitrary entities all you'll have to do is stop popping the term from the sentence checker so you can do type comparison as well.
If you just want to match the list of entities in the file against a specific sentence, then this should do the trick:
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='fa')


def return_match(entities_list, sentence):  ## Check if and which chunks
    matches = []                            ## are in the sentence
    for term in entities_list:
        ## Check each list in each Chunk object
        ## and see if there's any matches.
        for entity in sentence.entities:
            if entity == term:
                for word in entity:
                    matches.append(word)
    return matches


def return_list_of_entities(file):
    list_entity = []
    for sentence in file.sentences:
        for entity in sentence.entities:
            list_entity.append(entity)
    return list_entity


list_entity = return_list_of_entities(file)

sentence_number = 1  # Which sentence to check
sentence = file.sentences[sentence_number]
match = return_match(list_entity, sentence)

if match:
    print("Entity Term " + str(match) +
          " is in the sentence. '" + str(sentence) + "'")
else:
    print("Sentence '" + str(sentence) +
          "' doesn't contain any of the terms " + str(list_entity))

How to tweak the NLTK Python code in such a way that I train the classifier only once

I have been performing sentiment analysis on a huge data set of about 10,000 sentences. When I use the NLTK Python code for training and testing with Naive Bayes, I have to train the classifier every time I need to classify a set of new sentences, which takes a lot of time. Is there a way I can save the output of the training step and then reuse it for classification? That would save a lot of time. This is the NLTK code that I have used:
import nltk
import re
import csv


#Read the tweets one by one and process them
def processTweet(tweet):
    # process the tweets
    #convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))', 'URL', tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet


def replaceTwoOrMore(s):
    #look for 2 or more repetitions of a character and replace with two occurrences
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end


#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords file and build a list
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('url')
    stopWords.append('URL')
    stopWords.append('rt')
    fp = open(stopWordListFileName)
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end


#start getfeatureVector
def getFeatureVector(tweet):
    featureVector = []
    #split tweet into words
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word starts with a letter
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end


def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features


inpTweets = csv.reader(open('sheet3.csv', 'rb'), delimiter=',')
stopWords = getStopWordList('stopwords.txt')
featureList = []

# Get tweet words
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
#end loop

# Remove featureList duplicates
featureList = list(set(featureList))

# Extract feature vectors for all tweets in one shot
training_set = nltk.classify.util.apply_features(extract_features, tweets)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

ft = open("april2.tsv")
line = ft.readline()
fo = open("dunno.tsv", "w")
fo.seek(0, 0)
while line:
    testTweet = line
    processedTestTweet = processTweet(testTweet)
    fo.write(NBClassifier.classify(
        extract_features(getFeatureVector(processedTestTweet))) + "\n")
    line = ft.readline()
fo.close()
ft.close()
If you want to stick with NLTK, try pickle, e.g. https://spaghetti-tagger.googlecode.com/svn/spaghetti.py, see https://docs.python.org/2/library/pickle.html :
# -*- coding: utf8 -*-
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from cPickle import dump, load


def loadtagger(taggerfilename):
    infile = open(taggerfilename, 'rb')
    tagger = load(infile)
    infile.close()
    return tagger


def traintag(corpusname, corpus):
    # Function to save tagger.
    def savetagger(tagfilename, tagger):
        outfile = open(tagfilename, 'wb')
        dump(tagger, outfile, -1)
        outfile.close()
        return
    # Training UnigramTagger.
    uni_tag = ut(corpus)
    savetagger(corpusname + '_unigram.tagger', uni_tag)
    # Training BigramTagger.
    bi_tag = bt(corpus)
    savetagger(corpusname + '_bigram.tagger', bi_tag)
    print "Tagger trained with", corpusname, "using" + \
        " UnigramTagger and BigramTagger."
    return
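Applied to the question's Naive Bayes classifier, the same idea would look roughly like this (a sketch; the file name is illustrative):

import pickle

# save the classifier once, right after training
with open("nb_classifier.pickle", "wb") as out:
    pickle.dump(NBClassifier, out)

# in later runs, skip nltk.NaiveBayesClassifier.train() and just reload it
with open("nb_classifier.pickle", "rb") as inp:
    NBClassifier = pickle.load(inp)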
Otherwise, try other machine learning libraries such as sklearn or shogun.
The Naive Bayes classifier module in NLTK is breathtakingly slow because it's a pure Python implementation. For this reason, consider using a different machine learning (ML) library like scikit-learn.
YS-L's tip to use cPickle is good for your purposes at the moment, but if you ever have to retrain the classifier, it would probably be best to switch to a different Naive Bayes implementation.
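As a rough sketch of that route (not part of the original answers), scikit-learn lets you train once and persist the fitted model with joblib; the training data below is purely illustrative:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import joblib

texts = ["great phone", "terrible battery"]   # illustrative training data
labels = ["positive", "negative"]

# vectorizer + Naive Bayes in one pipeline, trained a single time
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(texts, labels)

joblib.dump(model, "sentiment_model.joblib")   # save the fitted pipeline
model = joblib.load("sentiment_model.joblib")  # reload later without retraining
print(model.predict(["great battery"]))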

Python - AttributeError: 'list' object has no attribute

I am trying to create a sentiment analysis program. The tweets to be analyzed are read from a CSV file and, after being analyzed, written to a different CSV file. However, I get AttributeError: 'list' object has no attribute 'lower'. The error seems to come from this part of the code. Is this operation not allowed for a sentence read from a CSV file?
def processTweet(tweet):
    # process the tweets
    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))', 'URL', tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet
#end


#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end


#start getfeatureVector
def getFeatureVector(tweet, stopWords):
    featureVector = []
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if it consists of only words
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
        #ignore if it is a stopWord
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end
Here is the full code:
#import regex
import re
import csv
import pprint
import nltk.classify


#start replaceTwoOrMore
def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end


#start process_tweet
def processTweet(tweet):
    # process the tweets
    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))', 'URL', tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet
#end


#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end


#start getfeatureVector
def getFeatureVector(tweet, stopWords):
    featureVector = []
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if it consists of only words
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
        #ignore if it is a stopWord
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end


#start extract_features
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
#end


#Read the tweets one by one and process them
inpTweets = csv.reader(open('data/sampleTweets.csv', 'rb'), delimiter=',', quotechar='"')
stopWords = getStopWordList('data/feature_list/stopwords.txt')
count = 0
featureList = []
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet, stopWords)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
#end loop

# Remove featureList duplicates
featureList = list(set(featureList))

# Generate the training set
training_set = nltk.classify.util.apply_features(extract_features, tweets)

# Train the Naive Bayes classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

# Test the classifier
# testTweet = 'RT @Jewelz2611 #mashable #apple, iphones r 2 expensive. Most went w/ htc/galaxy. No customer loyalty w/phone comp..'
with open('data/test_datasets.csv', 'r') as csvinput:
    with open('data/test_datasets_output.csv', 'w') as csvoutput:
        writer = csv.writer(csvoutput, lineterminator='\n')
        reader = csv.reader(csvinput)
        all = []
        row = next(reader)
        for row in reader:
            processedTestTweet = processTweet(row)
            sentiment = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet, stopWords)))
            row.append(sentiment)
            all.append(row)
        writer.writerows(all)
# print "testTweet = %s, sentiment = %s\n" % (testTweet, sentiment)
The traceback and error are as follows:
Traceback (most recent call last):
  File "simpleDemo.py", line 114, in <module>
    processedTestTweet = processTweet(row)
  File "simpleDemo.py", line 19, in processTweet
    tweet = tweet.lower()
AttributeError: 'list' object has no attribute 'lower'
Any help would be really appreciated. Thanks!
You pass the whole row (a list) to processTweet(), but processTweet() expects a string; you should probably call processTweet(row[1]).
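A sketch of that fix applied to the test loop (assuming, as in the training loop, that the tweet text is in the second column):

for row in reader:
    # row is a list of CSV fields; pass only the tweet text to processTweet()
    processedTestTweet = processTweet(row[1])
    sentiment = NBClassifier.classify(
        extract_features(getFeatureVector(processedTestTweet, stopWords)))
    row.append(sentiment)
    all.append(row)
writer.writerows(all)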

Ignoring duplicate words in a python dictionary

I have a Python script that takes in '.html' files, removes stop words, and returns all the other words in a Python dictionary. But if the same word occurs in multiple files, I want it to be returned only once, i.e. the result should contain the non-stop words, each only once.
def run():
    filelist = os.listdir(path)
    regex = re.compile(r'.*<div class="body">(.*?)</div>.*', re.DOTALL | re.IGNORECASE)
    reg1 = re.compile(r'<\/?[ap][^>]*>', re.DOTALL | re.IGNORECASE)
    quotereg = re.compile(r'"', re.DOTALL | re.IGNORECASE)
    puncreg = re.compile(r'[^\w]', re.DOTALL | re.IGNORECASE)
    f = open(stopwordfile, 'r')
    stopwords = f.read().lower().split()
    filewords = {}
    htmlfiles = []
    for file in filelist:
        if file[-5:] == '.html':
            htmlfiles.append(file)
    totalfreq = {}
    for file in htmlfiles:
        f = open(path + file, 'r')
        words = f.read().lower()
        words = regex.findall(words)[0]
        words = quotereg.sub(' ', words)
        words = reg1.sub(' ', words)
        words = puncreg.sub(' ', words)
        words = words.strip().split()
        for w in stopwords:
            while w in words:
                words.remove(w)
        freq = {}
        for w in words:
            words = words
        print words


if __name__ == '__main__':
    run()
Use a set. Simply add every word you find to the set; it ignores duplicates.
Assuming you have an iterator that returns each word in a file (this is for plain text; HTML would be rather more complicated):
def words(filename):
    with open(filename) as wordfile:
        for line in wordfile:
            for word in line.split():
                yield word
Then getting them into a set is simple:
wordlist = set(words("words.txt"))
If you have multiple files, just do like so:
wordlist = set()
wordfiles = ["words1.txt", "words2.txt", "words3.txt"]
for wordfile in wordfiles:
wordlist |= set(words(wordfile))
You can also use a set for your stop words. Then you can simply subtract them from the word list after the fact, which will probably be faster than checking to see if each word is a stop word before adding.
stopwords = set(["a", "an", "the"])
wordlist -= stopwords
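Plugged back into the question's run() function, the combined idea would look roughly like this (only a sketch; it reuses regex, reg1, quotereg, puncreg, path, htmlfiles and stopwordfile as defined there):

unique_words = set()
stopwords = set(open(stopwordfile).read().lower().split())
for file in htmlfiles:
    text = open(path + file, 'r').read().lower()
    body = regex.findall(text)[0]
    body = puncreg.sub(' ', reg1.sub(' ', quotereg.sub(' ', body)))
    # the set union ignores words already seen in earlier files
    unique_words |= set(body.strip().split()) - stopwords
print(unique_words)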
