Clustering using Latent Dirichlet Allocation algo in gensim

Clustering using Latent Dirichlet Allocation algo in gensim - python

Is it possible to do clustering in gensim for a given set of inputs using LDA? How can I go about it?

LDA produces a lower dimensional representation of the documents in a corpus. To this low-d representation you could apply a clustering algorithm, e.g. k-means. Since each axis corresponds to a topic, a simpler approach would be assigning each document to the topic onto which its projection is largest.

Yes you can. Here is a tutorial: http://nlp.fi.muni.cz/projekty/gensim/wiki.html#latent-dirichlet-allocation
First load you corpus, then call:
lda = gensim.models.ldamodel.LdaModel(corpus=mm, num_topics=100)

This is an example.
You need copy matutils.py and utils.py from gensim first, and the directory
should like the pic blow.
utils.py
matutils.py
doc_similar.py
model(dir)
data(dir)
The code blow should be in doc_similar.py.
Then just move your data_file into directory data and change fname in function main.
#coding:utf-8
from gensim import corpora, models, similarities
import cPickle
import logging
import utils
import os
import numpy as np
import scipy
import matutils
from collections import defaultdict
data_dir = os.path.join(os.getcwd(), 'data')
work_dir = os.path.join(os.getcwd(), 'model', os.path.basename(__file__).rstrip('.py'))
if not os.path.exists(work_dir):
os.mkdir(work_dir)
os.chdir(work_dir)
logger = logging.getLogger('text_similar')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# convert to unicode
def to_unicode(text):
if not isinstance(text, unicode):
text = text.decode('utf-8')
return text
class TextSimilar(utils.SaveLoad):
def __init__(self):
self.conf = {}
def _preprocess(self):
docs = [to_unicode(doc.strip()).split()[1:] for doc in file(self.fname)]
cPickle.dump(docs, open(self.conf['fname_docs'], 'wb'))
dictionary = corpora.Dictionary(docs)
dictionary.save(self.conf['fname_dict'])
corpus = [dictionary.doc2bow(doc) for doc in docs]
corpora.MmCorpus.serialize(self.conf['fname_corpus'], corpus)
return docs, dictionary, corpus
def _generate_conf(self):
fname = self.fname[self.fname.rfind('/') + 1:]
self.conf['fname_docs'] = '%s.docs' % fname
self.conf['fname_dict'] = '%s.dict' % fname
self.conf['fname_corpus'] = '%s.mm' % fname
def train(self, fname, is_pre=True, method='lsi', **params):
self.fname = fname
self.method = method
self._generate_conf()
if is_pre:
self.docs, self.dictionary, corpus = self._preprocess()
else:
self.docs = cPickle.load(open(self.conf['fname_docs']))
self.dictionary = corpora.Dictionary.load(self.conf['fname_dict'])
corpus = corpora.MmCorpus(self.conf['fname_corpus'])
if params is None:
params = {}
logger.info("training TF-IDF model")
self.tfidf = models.TfidfModel(corpus, id2word=self.dictionary)
corpus_tfidf = self.tfidf[corpus]
if method == 'lsi':
logger.info("training LSI model")
self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary, **params)
self.similar_index = similarities.MatrixSimilarity(self.lsi[corpus_tfidf])
self.para = self.lsi[corpus_tfidf]
elif method == 'lda_tfidf':
logger.info("training LDA model")
self.lda = models.LdaMulticore(corpus_tfidf, id2word=self.dictionary, workers=8, **params)
self.similar_index = similarities.MatrixSimilarity(self.lda[corpus_tfidf])
self.para = self.lda[corpus_tfidf]
elif method == 'lda':
logger.info("training LDA model")
self.lda = models.LdaMulticore(corpus, id2word=self.dictionary, workers=8, **params)
self.similar_index = similarities.MatrixSimilarity(self.lda[corpus])
self.para = self.lda[corpus]
elif method == 'logentropy':
logger.info("training a log-entropy model")
self.logent = models.LogEntropyModel(corpus, id2word=self.dictionary)
self.similar_index = similarities.MatrixSimilarity(self.logent[corpus])
self.para = self.logent[corpus]
else:
msg = "unknown semantic method %s" % method
logger.error(msg)
raise NotImplementedError(msg)
def doc2vec(self, doc):
bow = self.dictionary.doc2bow(to_unicode(doc).split())
if self.method == 'lsi':
return self.lsi[self.tfidf[bow]]
elif self.method == 'lda':
return self.lda[bow]
elif self.method == 'lda_tfidf':
return self.lda[self.tfidf[bow]]
elif self.method == 'logentropy':
return self.logent[bow]
def find_similar(self, doc, n=10):
vec = self.doc2vec(doc)
sims = self.similar_index[vec]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for elem in sims[:n]:
idx, value = elem
print ' '.join(self.docs[idx]), value
def get_vectors(self):
return self._get_vector(self.para)
def _get_vector(self, corpus):
def get_max_id():
maxid = -1
for document in corpus:
maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) # [-1] to avoid exceptions from max(empty)
return maxid
num_features = 1 + get_max_id()
index = np.empty(shape=(len(corpus), num_features), dtype=np.float32)
for docno, vector in enumerate(corpus):
if docno % 1000 == 0:
print("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
if isinstance(vector, np.ndarray):
pass
elif scipy.sparse.issparse(vector):
vector = vector.toarray().flatten()
else:
vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
index[docno] = vector
return index
def cluster(vectors, ts, k=30):
from sklearn.cluster import k_means
X = np.array(vectors)
cluster_center, result, inertia = k_means(X.astype(np.float), n_clusters=k, init="k-means++")
X_Y_dic = defaultdict(set)
for i, pred_y in enumerate(result):
X_Y_dic[pred_y].add(''.join(ts.docs[i]))
print 'len(X_Y_dic): ', len(X_Y_dic)
with open(data_dir + '/cluser.txt', 'w') as fo:
for Y in X_Y_dic:
fo.write(str(Y) + '\n')
fo.write('{word}\n'.format(word='\n'.join(list(X_Y_dic[Y])[:100])))
def main(is_train=True):
fname = data_dir + '/brand'
num_topics = 100
method = 'lda'
ts = TextSimilar()
if is_train:
ts.train(fname, method=method ,num_topics=num_topics, is_pre=True, iterations=100)
ts.save(method)
else:
ts = TextSimilar().load(method)
index = ts.get_vectors()
cluster(index, ts, k=num_topics)
if __name__ == '__main__':
is_train = True if len(sys.argv) > 1 else False
main(is_train)

The basic thing to understand here is that clustering requires your data to be present in a format and is not concerned with how did you arrive at your data. So, whether you apply clustering on the term-document matrix or on the reduced-dimension (LDA output matrix), clustering will work irrespective of that.
Just do the other things right though, small mistakes in data formats can cost you a lot of time of research.

Related

AttributeError: 'function' object has no attribute 'iterrows'

import pandas as pd
from pandas import DataFrame
import nltk
import numpy as np
from sklearn.metrics import accuracy_score
**this is the library**
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
**the dataframe**
def get_feature(text):
if len(text)==2:
return {'ham':text[-3]}
elif len(text)>=1:
return {'spam':text[-3]}
else:
return {'ham':'', 'spam':''}
**get future**
def get_feature_text(text):
if len(text)==2:
return {'spam':text[-2], 'ham':text[-1]}
else:
return {'spam':DataFrame.rename(text[-2])[0], 'ham':DataFrame.rename(text[-1])[0]}
**get future of text**
def get_data(df, get_feature=get_feature):
featrues = []
for i, row in df.iterrows():
text = row['v1']; type = row['v2']
if isinstance(text, str):
if ' ' in text:
text = text.replace(' ', '')
if '(' not in text:
featrues.append((get_feature(text), type.strip('() ')))
else:
text = text.partition('(')[0]
featrues.append((get_feature(text), type.strip('() ')))
return featrues
**get data all**
def get_train_test(featrues, ratio=0.9):
N = len(featrues)
T = int(N * ratio)
train = featrues[:T]
test = featrues[T:]
return train, test
**train and test**
def text_classifier(df, f=get_feature):
data = get_data(df, f)
train, test = get_train_test(data)
classifier = nltk.NaiveBayesClassifier.train(train)
acc = nltk.classify.accuracy(classifier, test)
return classifier, acc
**text classifier**
def show_type_of_text(text, texts=False, show_acc=False):
f = get_feature_text if texts else get_feature
classifier, acc = text_classifier(df, f)
if show_acc:
print("The accuracy of prediction is: ", acc)
clf = classifier.classify(f(text))
print(f'{text}: {clf}')
classifier.show_most_informative_features(10)
**show type of text**
def give_type(type1='spam', type2='ham'):
data = get_data(df, get_feature)
classifier = nltk.NaiveBayesClassifier.train(data)
following = classifier.prob_classify({'ham':type2, 'spam':type1})
x = following.generate()
print(f'{type2}: {type1}{x}')
**give type**
if __name__ == '__main__':
print('-wait a minute-')
show_type_of_text("spam")
print((text_classifier(accuracy_score)))
**i expecting to print the score
from this script I want to score the accuracy of the code that I made but here the problem is itterows then what should I do? I've read the documentation about accuracy_score but still the same result doesn't want to come out**
**
from this script I want to score the accuracy of the code that I made but here the problem is itterows then what should I do? I've read the documentation about accuracy_score but still the same result doesn't want to come out**

AttributeError: '' object has no attribute '' - in-class declared variable is not recognized

For a given corpus of tokenized texts, I want to perform word weighing with several weighing techniques. To do so, I created the following class:
class Weighing:
def __init__(self, input_file, word_weighing):
self.input_file_ = input_file #List in which each element is a list of tokens
self.word_weighing_ = word_weighing
self.num_documents = len(self.input_file_)
#Set with all unique words from the corpus
self.vocabulary = set()
for text in self.input_file_:
self.vocabulary.update(text)
self.vocabulary_size = len(self.vocabulary)
#Create dictionary that returns index for a token or token for an index of the corpus' vocabulary
self.word_to_index = dict()
self.index_to_word = dict()
for i, word in enumerate(self.vocabulary):
self.word_to_index[word] = i
self.index_to_word[i] = word
#Create sparse Document-Term Matrix (DTM)
self.sparse_dtm = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
for doc_index, document in enumerate(self.input_file_):
document_counter = Counter(document)
for word in set(document):
self.sparse_dtm[doc_index, self.word_to_index[word]] = document_counter[word] # Update element
#Get word count for all documents to calculate sparse_p_ij
self.sum_words = Counter()
for doc in self.input_file_:
self.sum_words.update(Counter(doc))
#Create probability of word i in document j. Format: sparse matrix
def create_sparse_p_ij (self):
sparse_p_ij = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
for j in range(self.num_documents):
row_counts = self.sparse_dtm.getrow(j).toarray()[0]
word_index = row_counts.nonzero()[0]
non_zero_row_counts = row_counts[row_counts != 0]
for i, count in enumerate(non_zero_row_counts):
word = self.index_to_word[word_index[i]]
prob_ij = count/self.sum_words[word]
sparse_p_ij[j,word_index[i]] = prob_ij
return sparse_p_ij
#Create a binary sparse dtm. Format: sparse matrix
def create_sparse_binary_dtm(self):
binary_sparse_dtm = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
for doc_index, document in enumerate(self.input_file_):
document_counter = dict.fromkeys(document, 1)
for word in set(document):
binary_sparse_dtm[doc_index, self.word_to_index[word]] = document_counter[word] # Update element
return binary_sparse_dtm
#2) Calculate Global Term weighting (4 methods: entropy, IDF, Probabilistic IDF, Normal)
def calc_entropy(self):
sparse_p_ij = self.create_sparse_p_ij()
summed_word_probabilities = sparse_p_ij.sum(0).tolist()[0]
return np.array([1+((word_probability * np.log2(word_probability))/np.log2(self.num_documents)) for word_probability in summed_word_probabilities])
def calc_idf(self):
summed_words = self.sparse_dtm.sum(0).tolist()[0]
return np.array([np.log2(self.num_documents/word_count) for word_count in summed_words])
def calc_normal(self):
summed_words = self.sparse_dtm.sum(0).tolist()[0]
return np.array([1/(math.sqrt(word_count**2)) for word_count in summed_words])
def calc_probidf (self):
binary_sparse_dtm = self.create_sparse_binary_dtm()
summed_binary_words_list = binary_sparse_dtm.sum(0).tolist()[0]
return np.array([np.log2((self.num_documents - binary_word_count)/binary_word_count) for binary_word_count in summed_binary_words_list])
if self.word_weighing_ == 1:
gtw = self.calc_entropy()
elif self.word_weighing_ == 2:
gtw = self.calc_idf()
elif self.word_weighing_ == 3:
gtw = self.calc_normal()
elif self.word_weighing_ == 4:
gtw = self.calc_probidf()
Now, when I run:
model = Weighing(input_file = data_list,
word_weighing = 1)
With data_list is a list of lists with tokenized words.
I get the following error:
Traceback (most recent call last):
File "<ipython-input-621-b0a9caec82d4>", line 4, in <module>
word_weighing = 1)
File "<ipython-input-617-6f3fdcecd170>", line 90, in __init__
gtw = self.calc_entropy()
AttributeError: 'Weighing' object has no attribute 'calc_entropy'
I looked at a few other similar SO links[1,2,3,4], but none of these seem applicable here.
What can I do to overcome this error?
EDIT:
I've updated the code to:
class Weighing:
def __init__(self, input_file, word_weighing):
self.input_file_ = input_file #List in which each element is a list of tokens
self.word_weighing_ = word_weighing
self.num_documents = len(self.input_file_)
#Set with all unique words from the corpus
self.vocabulary = set()
for text in self.input_file_:
self.vocabulary.update(text)
self.vocabulary_size = len(self.vocabulary)
#Create dictionary that returns index for a token or token for an index of the corpus' vocabulary
self.word_to_index = dict()
self.index_to_word = dict()
for i, word in enumerate(self.vocabulary):
self.word_to_index[word] = i
self.index_to_word[i] = word
#Create sparse Document-Term Matrix (DTM)
self.sparse_dtm = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
for doc_index, document in enumerate(self.input_file_):
document_counter = Counter(document)
for word in set(document):
self.sparse_dtm[doc_index, self.word_to_index[word]] = document_counter[word] # Update element
if self.word_weighing_ == 1:
self.gtw = self.calc_entropy()
elif self.word_weighing_ == 2:
self.gtw = self.calc_idf()
elif self.word_weighing_ == 3:
self.gtw = self.calc_normal()
elif self.word_weighing_ == 4:
self.gtw = self.calc_probidf()
#Get word count for all documents to calculate sparse_p_ij
self.sum_words = Counter()
for doc in self.input_file_:
self.sum_words.update(Counter(doc))
#Create probability of word i in document j. Format: sparse matrix
def create_sparse_p_ij (self):
sparse_p_ij = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
for j in range(self.num_documents):
row_counts = self.sparse_dtm.getrow(j).toarray()[0]
word_index = row_counts.nonzero()[0]
non_zero_row_counts = row_counts[row_counts != 0]
for i, count in enumerate(non_zero_row_counts):
word = self.index_to_word[word_index[i]]
prob_ij = count/self.sum_words[word]
sparse_p_ij[j,word_index[i]] = prob_ij
return sparse_p_ij
#Create a binary sparse dtm. Format: sparse matrix
def create_sparse_binary_dtm(self):
binary_sparse_dtm = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
for doc_index, document in enumerate(self.input_file_):
document_counter = dict.fromkeys(document, 1)
for word in set(document):
binary_sparse_dtm[doc_index, self.word_to_index[word]] = document_counter[word] # Update element
return binary_sparse_dtm
#2) Calculate Global Term weighting (4 methods: entropy, IDF, Probabilistic IDF, Normal)
def calc_entropy(self):
sparse_p_ij = self.create_sparse_p_ij()
summed_word_probabilities = sparse_p_ij.sum(0).tolist()[0]
return np.array([1+((word_probability * np.log2(word_probability))/np.log2(self.num_documents)) for word_probability in summed_word_probabilities])
def calc_idf(self):
summed_words = self.sparse_dtm.sum(0).tolist()[0]
return np.array([np.log2(self.num_documents/word_count) for word_count in summed_words])
def calc_normal(self):
summed_words = self.sparse_dtm.sum(0).tolist()[0]
return np.array([1/(math.sqrt(word_count**2)) for word_count in summed_words])
def calc_probidf (self):
binary_sparse_dtm = self.create_sparse_binary_dtm()
summed_binary_words_list = binary_sparse_dtm.sum(0).tolist()[0]
return np.array([np.log2((self.num_documents - binary_word_count)/binary_word_count) for binary_word_count in summed_binary_words_list])
However, I still get the error:
AttributeError: 'Weighing' object has no attribute 'calc_entropy'
Now, I call a function before I have initialized it. How can I change my code so that I initialize the def calc_entropy before I initialize the self.gtw?

It seems to be an indentation problem: You define your method functions like calc_entropy() within your __init__() function and not within your class.
It should be:
class Weighing:
def __init__(self):
# your init
def calc_entropy(self):
# your method

How to fix broken data in feature extraction/pre-processing in speech recognition?

i am very new in machine learning. I stumble on this source code on github that has no database, so i decided to use my own database. This code is to recognize speaker with MFCC and GMM-UBM. But when i try to run the code, i got this error "ValueError: Found array with 1 sample(s) (shape=(1, 13)) while a minimum of 2 is required". It seems like when the code is trying to fit the GMM on the 68th dataset, the MFCC shape of the data is broken. I assume there's something wrong on the feature extraction process.
Please help me! thank you very much.
Here's the code
import python_speech_features as psf
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib
from scipy.io import wavfile
from functools import reduce
import numpy as np
from os import listdir
from os.path import isfile, join
import os
import re
DATA_PATH = 'dataCoba'
# Make a list of speakers from the newdata/data folder. The format for the files in the folder is
# name_1,wav for training and name_2.wav for testing
substring = "_2"
onlyfiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]
onlyfiles.sort()
onlyones = []
for filename in onlyfiles:
dups = re.search('[\w]+_2.wav', filename)
#dups = re.search('[\w].wav', filename)
if dups is None:
onlyones.append(''.join(filename.split('_')[0]))
print(onlyones)
SPEAKERS = onlyones
TOTAL_SPEAKERS = len(SPEAKERS)
MODEL_SPEAKERS = len(SPEAKERS)
print(len(SPEAKERS))
class SpeakerRecognition:
# Create a GMM and UBM model for each speaker. The GMM is modelled after the speaker and UBM for each speaker
# is modelled after all the other speakers. Likelihood Ratio test is used to verify speaker
def setGMMUBM(self, no_components):
self.GMM = []
self.UBM = []
for i in range(MODEL_SPEAKERS):
self.GMM.append(GaussianMixture(n_components= no_components, covariance_type= 'diag'))
self.UBM.append(GaussianMixture(n_components= no_components, covariance_type= 'diag'))
# Load in data from .wav files in data/
# Extract mfcc (first 13 coefficients) from each audio sample
def load_data(self):
#training
self.spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav','')) + '_1.wav') for i in SPEAKERS]
self.spk_mfcc = [psf.mfcc(self.spk[i][1], self.spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
#testing
self.p_spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav','')) + '_2.wav') for i in SPEAKERS]
self.p_spk_mfcc = [psf.mfcc(self.p_spk[i][1], self.p_spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
print(self.spk_mfcc)
for i in range(TOTAL_SPEAKERS):
self.spk_train_size.append(len(self.spk_mfcc[i]))
self.spk_start.append(len(self.total_mfcc))
print("Speaker Number(train) = ",i)
print ("self.spk_mfcc[i] = ", len(self.spk_mfcc[i]))
for mfcc in self.spk_mfcc[i]:
self.total_mfcc.append(mfcc)
self.speaker_label.append(i)
self.spk_end.append(len(self.total_mfcc))
print("self.total_mfcc = ", len(self.total_mfcc))
print("\n")
for i in range(TOTAL_SPEAKERS):
#print("self.p_spk_mfcc =", self.p_spk_mfcc)
self.spk_test_size.append(len(self.p_spk_mfcc[i]))
self.spk_start.append(len(self.p_total_mfcc))
print("Speaker Num(test) = ",i)
print("self.p_spk_mfcc = ",len(self.p_spk_mfcc[i]))
print("MFCC Shape = ",self.spk_mfcc[i].shape)
for mfcc in self.p_spk_mfcc[i]:
self.p_total_mfcc.append(mfcc)
self.p_speaker_label.append(i)
self.p_spk_end.append(len(self.p_total_mfcc))
print("self.total_mfcc = ", len(self.p_total_mfcc))
print("\n")
# Gaussian Mixture Model is made of a number of Gaussian distribution components.
# To model data, a suitable number o gaussian components have to be selected.
# There is no method for finding this. It is done by trial and error. This runs
# the program for different values of component and records accuracy for each one
[![This is the error when i run the code][1]][1]
def find_best_params(self):
best_no_components = 1
maxacc = 0
for i in range(100, 256):
self.setGMMUBM(i)
self.fit_model()
_, acc, _ = self.predict()
print("Accuracy for n = {} is {}".format(i, acc))
if acc > maxacc:
maxacc = acc
best_no_components = i
return best_no_components
# Fit the GMM UBM models with training data
# fit = N buah data * dimensi data
def fit_model(self):
for i in range(MODEL_SPEAKERS):
print("Fit start for {}".format(i))
self.GMM[i].fit(self.spk_mfcc[i])
print(self.spk_mfcc[i].shape)
self.UBM[i].fit(self.total_mfcc[:self.spk_start[i]] + self.total_mfcc[self.spk_end[i]:])
print("Fit end for {}".format(i))
joblib.dump(self.UBM[i], 'dumps/new/ubm' + str(i) + '.pkl')
joblib.dump(self.GMM[i], 'dumps/new/gmm' + str(i) + '.pkl')
def model(self, no_components = 244):
self.setGMMUBM(no_components)
self.fit_model()
# Predict the output for each model for each speaker and produce confusion matrix
def load_model(self):
for i in range(0, MODEL_SPEAKERS):
self.GMM.append(joblib.load('dumps/new/gmm' + str(i) + '.pkl'))
self.UBM.append(joblib.load('dumps/new/ubm' + str(i) + '.pkl'))
def predict(self):
avg_accuracy = 0
confusion = [[ 0 for y in range(MODEL_SPEAKERS) ] for x in range(TOTAL_SPEAKERS)]
for i in range(TOTAL_SPEAKERS):
for j in range(MODEL_SPEAKERS):
x = self.GMM[j].score_samples(self.p_spk_mfcc[i]) - self.UBM[j].score_samples(self.p_spk_mfcc[i])
for score in x :
if score > 0:
confusion[i][j] += 1
confusion_diag = [confusion[i][i] for i in range(MODEL_SPEAKERS)]
diag_sum = 0
for item in confusion_diag:
diag_sum += item
remain_sum = 0
for i in range(MODEL_SPEAKERS):
for j in range(MODEL_SPEAKERS):
if i != j:
remain_sum += confusion[i][j]
spk_accuracy = 0
for i in range(MODEL_SPEAKERS):
best_guess, _ = max(enumerate(confusion[i]), key=lambda p: p[1])
print("For Accent {}, best guess is {}".format(SPEAKERS[i], SPEAKERS[best_guess]))
if i == best_guess:
spk_accuracy += 1
#print(MODEL_SPEAKERS)
spk_accuracy /= MODEL_SPEAKERS
avg_accuracy = diag_sum/(remain_sum+diag_sum)
return confusion, avg_accuracy, spk_accuracy
def __init__(self):
self.test_spk = []
self.test_mfcc = []
# Speaker data and corresponding mfcc
self.spk = []
self.spk_mfcc = []
self.p_spk = []
self.p_spk_mfcc = []
# Holds all the training mfccs of all speakers and
# speaker_label is the speaker label for the corresponding mfcc
self.total_mfcc = []
self.speaker_label = []
self.spk_train_size = [] # Index upto which is training data for that speaker.
self.p_total_mfcc = []
self.p_speaker_label = []
#print(self.p_speaker_label)
self.spk_test_size = []
# Since the length of all the audio files are different, spk_start and spk_end hold
self.spk_start = []
self.spk_end = []
self.p_spk_start = []
self.p_spk_end = []
self.GMM = []
self.UBM = []
self.load_data()
self.cepstral_mean_subtraction()
# Cepstral Mean Subtraction (Feature Normalization step)
def cepstral_mean_subtraction(self):
for i, speaker_mfcc in enumerate(self.spk_mfcc):
average = reduce(lambda acc, ele: acc + ele, speaker_mfcc)
average = list(map(lambda x: x/len(speaker_mfcc), average))
for j, feature_vector in enumerate(speaker_mfcc):
for k, feature in enumerate(feature_vector):
self.spk_mfcc[i][j][k] -= average[k]
for i, speaker_mfcc in enumerate(self.p_spk_mfcc):
average = reduce(lambda acc, ele: acc + ele, speaker_mfcc)
average = list(map(lambda x: x / len(speaker_mfcc), average))
for j, feature_vector in enumerate(speaker_mfcc):
for k, feature in enumerate(feature_vector):
self.p_spk_mfcc[i][j][k] -= average[k]
#TBD : Ten fold validation
def ten_fold():
#fold_size = 0.1 * self.n
fold_offset = 0.0
accuracy_per_fold = 0
average_accuracy = 0
for i in range(0, 10):
print("Fold start is {} and fold end is {} ".format( fold_offset, fold_offset + fold_size))
#accuracy = self.execute(int(fold_offset), int(fold_offset + fold_size))
#print("Accuracy is of test {} is : {} ".format(i, accuracy))
#average_accuracy += accuracy
#fold_offset += fold_size
average_accuracy /= 10.0
print("Average accuracy " + str(100 * average_accuracy))
return average_accuracy
# Final result is a confusion matrix which represents the accuracy of the fit of the model
if __name__ == '__main__':
SR = SpeakerRecognition()
#SR.load_model()
SR.setGMMUBM(no_components=13)
#SR.find_best_params()
SR.fit_model()
confusion, mfcc_accuracy, spk_accuracy = SR.predict()
print("Confusion Matrix")
print(np.matrix(confusion))
print("Accuracy in predicting speakers : {}".format(spk_accuracy))
print("Accuracy in testing for MFCC : {}".format(mfcc_accuracy))

unsupported operand type(s) for +=: 'zip' and 'zip'

thanks for the answer before and I have changed it what Alperen suggested, but I have another problem, my code :
import sys
import os
import itertools
import os.path
import random
from PIL import Image
from svmutil import *
DIMENSION = 200
sys.path.append("../train/")
ROOT_DIR = os.path.dirname(os.getcwd()) + "/train"
NEGATIVE = "negative"
POSITIVE = "positive"
CLASSES = [NEGATIVE, POSITIVE]
# libsvm constants
LINEAR = 0
RBF = 2
# Other
USE_LINEAR = False
IS_TUNING = False
def main():
try:
train, tune, test = getData(IS_TUNING)
models = getModels(train)
results = None
if IS_TUNING:
print ("!!! TUNING MODE !!!")
results = classify(models, tune)
else:
results = classify(models, test)
print
totalCount = 0
totalCorrect = 0
for clazz in CLASSES:
count, correct = results[clazz]
totalCount += count
totalCorrect += correct
print ("%s %d %d %f") % (clazz, correct, count, (float(correct) / count))
print ("%s %d %d %f") % ("Overall", totalCorrect, totalCount,(float(totalCorrect) / totalCount))
except Exception as e:
print (e)
return 5
def classify(models, dataSet):
results = {}
for trueClazz in CLASSES:
count = 0
correct = 0
for item in dataSet[trueClazz]:
predClazz, prob = predict(models, item)
print ("%s,%s,%f") % (trueClazz, predClazz, prob)
count += 1
if trueClazz == predClazz: correct += 1
results[trueClazz] = (count, correct)
return results
def predict(models, item):
maxProb = 0.0
bestClass = ""
for clazz, model in models.iteritems():
prob = predictSingle(model, item)
if prob > maxProb:
maxProb = prob
bestClass = clazz
return (bestClass, maxProb)
def predictSingle(model, item):
output = svm_predict([0], [item], model, "-q -b 1")
prob = output[2][0][0]
return prob
def getModels(trainingData):
models = {}
param = getParam(USE_LINEAR)
for c in CLASSES:
labels, data = getTrainingData(trainingData, c)
prob = svm_problem(labels, data)
m = svm_train(prob, param)
models[c] = m
return models
def getTrainingData(trainingData, clazz):
labeledData = getLabeledDataVector(trainingData, clazz, 1)
negClasses = [c for c in CLASSES if not c == clazz]
for c in negClasses:
ld = getLabeledDataVector(trainingData, c, -1)
labeledData += ld
random.shuffle(labeledData)
unzipped = [list(t) for t in zip(*labeledData)]
labels, data = unzipped[0], unzipped[1]
return (labels, data)
def getParam(linear = True):
param = svm_parameter("-q")
param.probability = 1
if(linear):
param.kernel_type = LINEAR
param.C = .01
else:
param.kernel_type = RBF
param.C = .01
param.gamma = .00000001
return param
def getLabeledDataVector(dataset, clazz, label):
data = dataset[clazz]
labels = [label] * len(data)
output = zip(labels, data)
return output
def getData(generateTuningData):
trainingData = {}
tuneData = {}
testData = {}
for clazz in CLASSES:
(train, tune, test) = buildTrainTestVectors(buildImageList(ROOT_DIR + clazz + "/"), generateTuningData)
trainingData[clazz] = train
tuneData[clazz] = tune
testData[clazz] = test
return (trainingData, tuneData, testData)
def buildImageList(dirName):
imgs = [Image.open(dirName + fileName).resize((DIMENSION, DIMENSION)) for fileName in os.listdir(dirName)]
imgs = [list(itertools.chain.from_iterable(img.getdata())) for img in imgs]
return imgs
def buildTrainTestVectors(imgs, generateTuningData):
# 70% for training, 30% for test.
testSplit = int(.7 * len(imgs))
baseTraining = imgs[:testSplit]
test = imgs[testSplit:]
training = None
tuning = None
if generateTuningData:
# 50% of training for true training, 50% for tuning.
tuneSplit = int(.5 * len(baseTraining))
training = baseTraining[:tuneSplit]
tuning = baseTraining[tuneSplit:]
else:
training = baseTraining
return (training, tuning, test)
if __name__ == "__main__":
sys.exit(main())
and I got the new massage
Klik this massage to see new error massage
What should I do? I have searched every answer but never make me get the answer. Now I use this code for my final project at university. I hope anyone can help me for this problem. But thank you for another last answer

EDIT:
This lines causes the error:
labeledData += ld
+= operand doesn't work for zips. You can change zips to list.
def getLabeledDataVector(dataset, clazz, label):
...
return list(output)
Also, unzipped list can be empty, you should fix this line too(Thanks to ShadowRanger for comment):
labels, data = unzipped if unzipped else ([], [])
This changes probably will affect your code's logic. You should fix them on your own.
BEFORE EDIT:
In getData(generateTuningData) function, ROOT_DIR + clazz expression causes the error, because ROOT_DIR is None.
sys.path.append doesn't return anything(returns None).
You need to change your code as:
...
import os.path
...
sys.path.append("../train/")
ROOT_DIR = os.path.dirname(os.getcwd()) + "/train/" # parent directory and "/train/"
...
I assumed ROOT_DIR is your current working directory's parent + "/train/". If it is not, you can fix it.
Also, there may be other problems, but this solves unsupported operand type(s).

error in Naive bayes classifier

i'm beginner in machine learning and i'm trying to implement my first Naive Bayes by myself for better understanding. So, i have dataset from http://archive.ics.uci.edu/ml/datasets/Adult (american census data, classes are '<=50k' and '>50k').
Here is my python code:
#!/usr/bin/python
import sys
import csv
words_stats = {} # {'word': {'class1': cnt, 'class2': cnt'}}
words_cnt = 0
targets_stats = {} # {'class1': 3234, 'class2': 884} how many words in each class
class_stats = {} # {'class1': 7896, 'class2': 3034} how many lines in each class
items_cnt = 0
def train(dataset, targets):
global words_stats, words_cnt, targets_stats, items_cnt, class_stats
num = len(dataset)
for item in xrange(num):
class_stats[targets[item]] = class_stats.get(targets[item], 0) + 1
for i in xrange(len(dataset[item])):
word = dataset[item][i]
if not words_stats.has_key(word):
words_stats[word] = {}
tgt = targets[item]
cnt = words_stats[word].get(tgt, 0)
words_stats[word][tgt] = cnt + 1
targets_stats[tgt] = targets_stats.get(tgt, 0) + 1
words_cnt += 1
items_cnt = num
def classify(doc, tgt_set):
global words_stats, words_cnt, targets_stats, items_cnt
probs = {} #the probability itself P(c|W) = P(W|c) * P(c) / P(W)
pc = {} #probability of the class in document set P(c)
pwc = {} #probability of the word set in particular class. P(W|c)
pw = 1 #probability of the word set in documet set
for word in doc:
if word not in words_stats:
continue #dirty, very dirty
pw = pw * float(sum(words_stats[word].values())) / words_cnt
for tgt in tgt_set:
pc[tgt] = class_stats[tgt] / float(items_cnt)
for word in doc:
if word not in words_stats:
continue #dirty, very dirty
tgt_wrd_cnt = words_stats[word].get(tgt, 0)
pwc[tgt] = pwc.get(tgt, 1) * float(tgt_wrd_cnt) / targets_stats[tgt]
probs[tgt] = (pwc[tgt] * pc[tgt]) / pw
l = sorted(probs.items(), key = lambda i: i[1], reverse=True)
print probs
return l[0][0]
def check_results(dataset, targets):
num = len(dataset)
tgt_set = set(targets)
correct = 0
incorrect = 0
for item in xrange(num):
res = classify(dataset[item], tgt_set)
if res == targets[item]:
correct = correct + 1
else:
incorrect = incorrect + 1
print 'correct:', float(correct) / num, ' incorrect:', float(incorrect) / num
def load_data(fil):
data = []
tgts = []
reader = csv.reader(fil)
for line in reader:
d = [x.strip() for x in line]
if '?' in d:
continue
if not len(d):
continue
data.append(d[:-1])
tgts.append(d[-1:][0])
return data, tgts
if __name__ == '__main__':
if len(sys.argv) < 3:
print './program train_data.txt test_data.txt'
sys.exit(1)
filename = sys.argv[1]
fil = open(filename, 'r')
data, tgt = load_data(fil)
train(data, tgt)
test_file = open(sys.argv[2], 'r')
test_data, test_tgt = load_data(test_file)
check_results(test_data, tgt)
it gives ~61% of correct results. when i print probabilities i get the following:
{'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
but in case of correct classifier i expect to see sum of both probabilities equal to 1.
At first i thought the problem is in float underflow and tried to make all calculations in logarithms, but results were similiar.
i understand that omitting some words is gonna affect accuracy, but the probabilities are sooo wrong.
What do i do wrong or don't understand?
for your convinience i've uploaded dataset and python script here:
https://dl.dropboxusercontent.com/u/36180992/adult.tar.gz
Thank you for your help.

Naive Bayes doesn't compute a probability directly, rather it computes a "raw score" that is relatively compared to the other scores for each label in order to classify an instance. This score can easily be converted to a "probability" in the range of [0, 1]:
total = sum(probs.itervalues())
for label, score in probs.iteritems():
probs[label] = score / total
However, keep in mind this still doesn't represent a true probability, as mentioned in this answer:
naive Bayes tends to predict probabilities that are almost always either very close to zero or very close to one.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Clustering using Latent Dirichlet Allocation algo in gensim - python

Is it possible to do clustering in gensim for a given set of inputs using LDA? How can I go about it?

Yes you can. Here is a tutorial: http://nlp.fi.muni.cz/projekty/gensim/wiki.html#latent-dirichlet-allocation First load you corpus, then call: lda = gensim.models.ldamodel.LdaModel(corpus=mm, num_topics=100)

Related

AttributeError: 'function' object has no attribute 'iterrows'

AttributeError: '' object has no attribute '' - in-class declared variable is not recognized

How to fix broken data in feature extraction/pre-processing in speech recognition?

unsupported operand type(s) for +=: 'zip' and 'zip'

error in Naive bayes classifier

Categories

Resources