import pandas as pd
from pandas import DataFrame
import nltk
import numpy as np
from sklearn.metrics import accuracy_score
**this is the library**
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
**the dataframe**
def get_feature(text):
if len(text)==2:
return {'ham':text[-3]}
elif len(text)>=1:
return {'spam':text[-3]}
else:
return {'ham':'', 'spam':''}
**get future**
def get_feature_text(text):
if len(text)==2:
return {'spam':text[-2], 'ham':text[-1]}
else:
return {'spam':DataFrame.rename(text[-2])[0], 'ham':DataFrame.rename(text[-1])[0]}
**get future of text**
def get_data(df, get_feature=get_feature):
featrues = []
for i, row in df.iterrows():
text = row['v1']; type = row['v2']
if isinstance(text, str):
if ' ' in text:
text = text.replace(' ', '')
if '(' not in text:
featrues.append((get_feature(text), type.strip('() ')))
else:
text = text.partition('(')[0]
featrues.append((get_feature(text), type.strip('() ')))
return featrues
**get data all**
def get_train_test(featrues, ratio=0.9):
N = len(featrues)
T = int(N * ratio)
train = featrues[:T]
test = featrues[T:]
return train, test
**train and test**
def text_classifier(df, f=get_feature):
data = get_data(df, f)
train, test = get_train_test(data)
classifier = nltk.NaiveBayesClassifier.train(train)
acc = nltk.classify.accuracy(classifier, test)
return classifier, acc
**text classifier**
def show_type_of_text(text, texts=False, show_acc=False):
f = get_feature_text if texts else get_feature
classifier, acc = text_classifier(df, f)
if show_acc:
print("The accuracy of prediction is: ", acc)
clf = classifier.classify(f(text))
print(f'{text}: {clf}')
classifier.show_most_informative_features(10)
**show type of text**
def give_type(type1='spam', type2='ham'):
data = get_data(df, get_feature)
classifier = nltk.NaiveBayesClassifier.train(data)
following = classifier.prob_classify({'ham':type2, 'spam':type1})
x = following.generate()
print(f'{type2}: {type1}{x}')
**give type**
if __name__ == '__main__':
print('-wait a minute-')
show_type_of_text("spam")
print((text_classifier(accuracy_score)))
**i expecting to print the score
from this script I want to score the accuracy of the code that I made but here the problem is itterows then what should I do? I've read the documentation about accuracy_score but still the same result doesn't want to come out**
**
from this script I want to score the accuracy of the code that I made but here the problem is itterows then what should I do? I've read the documentation about accuracy_score but still the same result doesn't want to come out**
Related
So, I am working on a small project that detects fake news. This is the model:
from wordcloud import STOPWORDS
from sklearn.metrics import confusion_matrix
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd #data processing
import pickle
import re
import nltk
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train = train.fillna(" ")
test = test.fillna(" ")
test["total"] = test["title"]+' '+test["author"]+test["text"]
train["total"] = train["title"]+' '+train['author']+train["text"]
real_words = ''
fake_words = ''
stopwords = set(STOPWORDS)
for val in train[train['label']==1].total:
tokens = val.split()
for i in range(len(tokens)):
tokens[i] = tokens[i].lower()
real_words += " ".join(tokens)+" "
for val in train[train['label']==0].total:
tokens = val.split()
for i in range(len(tokens)):
tokens[i] = tokens[i].lower()
fake_words += " ".join(tokens)+" "
lemmatizer = WordNetLemmatizer()
for index,row in train.iterrows():
filter_sentence = ''
sentence = row['total']
sentence = re.sub(r'[^\w\s]','', sentence)
words = nltk.word_tokenize(sentence)
words = [w for w in words if w not in stopwords]
for word in words:
filter_sentence += ' ' + str(lemmatizer.lemmatize(word)).lower()
train.loc[index, 'total'] = filter_sentence
train = train[['total', 'label']]
X_train = train['total']
Y_train = train['label']
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X_train)
freq_term_matrix = count_vectorizer.transform(X_train)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = tfidf.transform(test_counts)
X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, Y_train, random_state=0)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)
print('Accuracy on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
cm = confusion_matrix(y_test, pred)
print(cm)
from sklearn.svm import SVC
sv = SVC(kernel='linear').fit(X_train, y_train)
pickle.dump(sv, open('fake.pkl', 'wb'))
This is the Flask file:
from flask import Flask, render_template, request
import numpy as np
import pickle
app = Flask(__name__)
model = pickle.load(open('fake.pkl', 'rb'))
#app.route('/')
def man():
return render_template('home.html')
#app.route('/predict', methods=['POST'])
def home():
data1 = request.form['title']
data2 = request.form['author']
data3 = request.form['news']
arr = np.array([[data1, data2, data3]])
pred = model.predict(arr)
return render_template('result.html', data=pred)
if __name__ == "__main__":
app.run(debug=True)
Whenever I give the input and run it, it shows:
ValueError: could not convert string to float: 'Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times
What can I do to fix this? Can someone please explain? Thanks :)
i am very new in machine learning. I stumble on this source code on github that has no database, so i decided to use my own database. This code is to recognize speaker with MFCC and GMM-UBM. But when i try to run the code, i got this error "ValueError: Found array with 1 sample(s) (shape=(1, 13)) while a minimum of 2 is required". It seems like when the code is trying to fit the GMM on the 68th dataset, the MFCC shape of the data is broken. I assume there's something wrong on the feature extraction process.
Please help me! thank you very much.
Here's the code
import python_speech_features as psf
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib
from scipy.io import wavfile
from functools import reduce
import numpy as np
from os import listdir
from os.path import isfile, join
import os
import re
DATA_PATH = 'dataCoba'
# Make a list of speakers from the newdata/data folder. The format for the files in the folder is
# name_1,wav for training and name_2.wav for testing
substring = "_2"
onlyfiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]
onlyfiles.sort()
onlyones = []
for filename in onlyfiles:
dups = re.search('[\w]+_2.wav', filename)
#dups = re.search('[\w].wav', filename)
if dups is None:
onlyones.append(''.join(filename.split('_')[0]))
print(onlyones)
SPEAKERS = onlyones
TOTAL_SPEAKERS = len(SPEAKERS)
MODEL_SPEAKERS = len(SPEAKERS)
print(len(SPEAKERS))
class SpeakerRecognition:
# Create a GMM and UBM model for each speaker. The GMM is modelled after the speaker and UBM for each speaker
# is modelled after all the other speakers. Likelihood Ratio test is used to verify speaker
def setGMMUBM(self, no_components):
self.GMM = []
self.UBM = []
for i in range(MODEL_SPEAKERS):
self.GMM.append(GaussianMixture(n_components= no_components, covariance_type= 'diag'))
self.UBM.append(GaussianMixture(n_components= no_components, covariance_type= 'diag'))
# Load in data from .wav files in data/
# Extract mfcc (first 13 coefficients) from each audio sample
def load_data(self):
#training
self.spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav','')) + '_1.wav') for i in SPEAKERS]
self.spk_mfcc = [psf.mfcc(self.spk[i][1], self.spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
#testing
self.p_spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav','')) + '_2.wav') for i in SPEAKERS]
self.p_spk_mfcc = [psf.mfcc(self.p_spk[i][1], self.p_spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
print(self.spk_mfcc)
for i in range(TOTAL_SPEAKERS):
self.spk_train_size.append(len(self.spk_mfcc[i]))
self.spk_start.append(len(self.total_mfcc))
print("Speaker Number(train) = ",i)
print ("self.spk_mfcc[i] = ", len(self.spk_mfcc[i]))
for mfcc in self.spk_mfcc[i]:
self.total_mfcc.append(mfcc)
self.speaker_label.append(i)
self.spk_end.append(len(self.total_mfcc))
print("self.total_mfcc = ", len(self.total_mfcc))
print("\n")
for i in range(TOTAL_SPEAKERS):
#print("self.p_spk_mfcc =", self.p_spk_mfcc)
self.spk_test_size.append(len(self.p_spk_mfcc[i]))
self.spk_start.append(len(self.p_total_mfcc))
print("Speaker Num(test) = ",i)
print("self.p_spk_mfcc = ",len(self.p_spk_mfcc[i]))
print("MFCC Shape = ",self.spk_mfcc[i].shape)
for mfcc in self.p_spk_mfcc[i]:
self.p_total_mfcc.append(mfcc)
self.p_speaker_label.append(i)
self.p_spk_end.append(len(self.p_total_mfcc))
print("self.total_mfcc = ", len(self.p_total_mfcc))
print("\n")
# Gaussian Mixture Model is made of a number of Gaussian distribution components.
# To model data, a suitable number o gaussian components have to be selected.
# There is no method for finding this. It is done by trial and error. This runs
# the program for different values of component and records accuracy for each one
[![This is the error when i run the code][1]][1]
def find_best_params(self):
best_no_components = 1
maxacc = 0
for i in range(100, 256):
self.setGMMUBM(i)
self.fit_model()
_, acc, _ = self.predict()
print("Accuracy for n = {} is {}".format(i, acc))
if acc > maxacc:
maxacc = acc
best_no_components = i
return best_no_components
# Fit the GMM UBM models with training data
# fit = N buah data * dimensi data
def fit_model(self):
for i in range(MODEL_SPEAKERS):
print("Fit start for {}".format(i))
self.GMM[i].fit(self.spk_mfcc[i])
print(self.spk_mfcc[i].shape)
self.UBM[i].fit(self.total_mfcc[:self.spk_start[i]] + self.total_mfcc[self.spk_end[i]:])
print("Fit end for {}".format(i))
joblib.dump(self.UBM[i], 'dumps/new/ubm' + str(i) + '.pkl')
joblib.dump(self.GMM[i], 'dumps/new/gmm' + str(i) + '.pkl')
def model(self, no_components = 244):
self.setGMMUBM(no_components)
self.fit_model()
# Predict the output for each model for each speaker and produce confusion matrix
def load_model(self):
for i in range(0, MODEL_SPEAKERS):
self.GMM.append(joblib.load('dumps/new/gmm' + str(i) + '.pkl'))
self.UBM.append(joblib.load('dumps/new/ubm' + str(i) + '.pkl'))
def predict(self):
avg_accuracy = 0
confusion = [[ 0 for y in range(MODEL_SPEAKERS) ] for x in range(TOTAL_SPEAKERS)]
for i in range(TOTAL_SPEAKERS):
for j in range(MODEL_SPEAKERS):
x = self.GMM[j].score_samples(self.p_spk_mfcc[i]) - self.UBM[j].score_samples(self.p_spk_mfcc[i])
for score in x :
if score > 0:
confusion[i][j] += 1
confusion_diag = [confusion[i][i] for i in range(MODEL_SPEAKERS)]
diag_sum = 0
for item in confusion_diag:
diag_sum += item
remain_sum = 0
for i in range(MODEL_SPEAKERS):
for j in range(MODEL_SPEAKERS):
if i != j:
remain_sum += confusion[i][j]
spk_accuracy = 0
for i in range(MODEL_SPEAKERS):
best_guess, _ = max(enumerate(confusion[i]), key=lambda p: p[1])
print("For Accent {}, best guess is {}".format(SPEAKERS[i], SPEAKERS[best_guess]))
if i == best_guess:
spk_accuracy += 1
#print(MODEL_SPEAKERS)
spk_accuracy /= MODEL_SPEAKERS
avg_accuracy = diag_sum/(remain_sum+diag_sum)
return confusion, avg_accuracy, spk_accuracy
def __init__(self):
self.test_spk = []
self.test_mfcc = []
# Speaker data and corresponding mfcc
self.spk = []
self.spk_mfcc = []
self.p_spk = []
self.p_spk_mfcc = []
# Holds all the training mfccs of all speakers and
# speaker_label is the speaker label for the corresponding mfcc
self.total_mfcc = []
self.speaker_label = []
self.spk_train_size = [] # Index upto which is training data for that speaker.
self.p_total_mfcc = []
self.p_speaker_label = []
#print(self.p_speaker_label)
self.spk_test_size = []
# Since the length of all the audio files are different, spk_start and spk_end hold
self.spk_start = []
self.spk_end = []
self.p_spk_start = []
self.p_spk_end = []
self.GMM = []
self.UBM = []
self.load_data()
self.cepstral_mean_subtraction()
# Cepstral Mean Subtraction (Feature Normalization step)
def cepstral_mean_subtraction(self):
for i, speaker_mfcc in enumerate(self.spk_mfcc):
average = reduce(lambda acc, ele: acc + ele, speaker_mfcc)
average = list(map(lambda x: x/len(speaker_mfcc), average))
for j, feature_vector in enumerate(speaker_mfcc):
for k, feature in enumerate(feature_vector):
self.spk_mfcc[i][j][k] -= average[k]
for i, speaker_mfcc in enumerate(self.p_spk_mfcc):
average = reduce(lambda acc, ele: acc + ele, speaker_mfcc)
average = list(map(lambda x: x / len(speaker_mfcc), average))
for j, feature_vector in enumerate(speaker_mfcc):
for k, feature in enumerate(feature_vector):
self.p_spk_mfcc[i][j][k] -= average[k]
#TBD : Ten fold validation
def ten_fold():
#fold_size = 0.1 * self.n
fold_offset = 0.0
accuracy_per_fold = 0
average_accuracy = 0
for i in range(0, 10):
print("Fold start is {} and fold end is {} ".format( fold_offset, fold_offset + fold_size))
#accuracy = self.execute(int(fold_offset), int(fold_offset + fold_size))
#print("Accuracy is of test {} is : {} ".format(i, accuracy))
#average_accuracy += accuracy
#fold_offset += fold_size
average_accuracy /= 10.0
print("Average accuracy " + str(100 * average_accuracy))
return average_accuracy
# Final result is a confusion matrix which represents the accuracy of the fit of the model
if __name__ == '__main__':
SR = SpeakerRecognition()
#SR.load_model()
SR.setGMMUBM(no_components=13)
#SR.find_best_params()
SR.fit_model()
confusion, mfcc_accuracy, spk_accuracy = SR.predict()
print("Confusion Matrix")
print(np.matrix(confusion))
print("Accuracy in predicting speakers : {}".format(spk_accuracy))
print("Accuracy in testing for MFCC : {}".format(mfcc_accuracy))
thanks for the answer before and I have changed it what Alperen suggested, but I have another problem, my code :
import sys
import os
import itertools
import os.path
import random
from PIL import Image
from svmutil import *
DIMENSION = 200
sys.path.append("../train/")
ROOT_DIR = os.path.dirname(os.getcwd()) + "/train"
NEGATIVE = "negative"
POSITIVE = "positive"
CLASSES = [NEGATIVE, POSITIVE]
# libsvm constants
LINEAR = 0
RBF = 2
# Other
USE_LINEAR = False
IS_TUNING = False
def main():
try:
train, tune, test = getData(IS_TUNING)
models = getModels(train)
results = None
if IS_TUNING:
print ("!!! TUNING MODE !!!")
results = classify(models, tune)
else:
results = classify(models, test)
print
totalCount = 0
totalCorrect = 0
for clazz in CLASSES:
count, correct = results[clazz]
totalCount += count
totalCorrect += correct
print ("%s %d %d %f") % (clazz, correct, count, (float(correct) / count))
print ("%s %d %d %f") % ("Overall", totalCorrect, totalCount,(float(totalCorrect) / totalCount))
except Exception as e:
print (e)
return 5
def classify(models, dataSet):
results = {}
for trueClazz in CLASSES:
count = 0
correct = 0
for item in dataSet[trueClazz]:
predClazz, prob = predict(models, item)
print ("%s,%s,%f") % (trueClazz, predClazz, prob)
count += 1
if trueClazz == predClazz: correct += 1
results[trueClazz] = (count, correct)
return results
def predict(models, item):
maxProb = 0.0
bestClass = ""
for clazz, model in models.iteritems():
prob = predictSingle(model, item)
if prob > maxProb:
maxProb = prob
bestClass = clazz
return (bestClass, maxProb)
def predictSingle(model, item):
output = svm_predict([0], [item], model, "-q -b 1")
prob = output[2][0][0]
return prob
def getModels(trainingData):
models = {}
param = getParam(USE_LINEAR)
for c in CLASSES:
labels, data = getTrainingData(trainingData, c)
prob = svm_problem(labels, data)
m = svm_train(prob, param)
models[c] = m
return models
def getTrainingData(trainingData, clazz):
labeledData = getLabeledDataVector(trainingData, clazz, 1)
negClasses = [c for c in CLASSES if not c == clazz]
for c in negClasses:
ld = getLabeledDataVector(trainingData, c, -1)
labeledData += ld
random.shuffle(labeledData)
unzipped = [list(t) for t in zip(*labeledData)]
labels, data = unzipped[0], unzipped[1]
return (labels, data)
def getParam(linear = True):
param = svm_parameter("-q")
param.probability = 1
if(linear):
param.kernel_type = LINEAR
param.C = .01
else:
param.kernel_type = RBF
param.C = .01
param.gamma = .00000001
return param
def getLabeledDataVector(dataset, clazz, label):
data = dataset[clazz]
labels = [label] * len(data)
output = zip(labels, data)
return output
def getData(generateTuningData):
trainingData = {}
tuneData = {}
testData = {}
for clazz in CLASSES:
(train, tune, test) = buildTrainTestVectors(buildImageList(ROOT_DIR + clazz + "/"), generateTuningData)
trainingData[clazz] = train
tuneData[clazz] = tune
testData[clazz] = test
return (trainingData, tuneData, testData)
def buildImageList(dirName):
imgs = [Image.open(dirName + fileName).resize((DIMENSION, DIMENSION)) for fileName in os.listdir(dirName)]
imgs = [list(itertools.chain.from_iterable(img.getdata())) for img in imgs]
return imgs
def buildTrainTestVectors(imgs, generateTuningData):
# 70% for training, 30% for test.
testSplit = int(.7 * len(imgs))
baseTraining = imgs[:testSplit]
test = imgs[testSplit:]
training = None
tuning = None
if generateTuningData:
# 50% of training for true training, 50% for tuning.
tuneSplit = int(.5 * len(baseTraining))
training = baseTraining[:tuneSplit]
tuning = baseTraining[tuneSplit:]
else:
training = baseTraining
return (training, tuning, test)
if __name__ == "__main__":
sys.exit(main())
and I got the new massage
Klik this massage to see new error massage
What should I do? I have searched every answer but never make me get the answer. Now I use this code for my final project at university. I hope anyone can help me for this problem. But thank you for another last answer
EDIT:
This lines causes the error:
labeledData += ld
+= operand doesn't work for zips. You can change zips to list.
def getLabeledDataVector(dataset, clazz, label):
...
return list(output)
Also, unzipped list can be empty, you should fix this line too(Thanks to ShadowRanger for comment):
labels, data = unzipped if unzipped else ([], [])
This changes probably will affect your code's logic. You should fix them on your own.
BEFORE EDIT:
In getData(generateTuningData) function, ROOT_DIR + clazz expression causes the error, because ROOT_DIR is None.
sys.path.append doesn't return anything(returns None).
You need to change your code as:
...
import os.path
...
sys.path.append("../train/")
ROOT_DIR = os.path.dirname(os.getcwd()) + "/train/" # parent directory and "/train/"
...
I assumed ROOT_DIR is your current working directory's parent + "/train/". If it is not, you can fix it.
Also, there may be other problems, but this solves unsupported operand type(s).
Hello Im putting my code for inspection because Ive been playing with my neural network implementation in python for a few weeks and I cant seem to reach a misclasification error below 17% some times 16%. ve been trying different learning rate values, different hidden neurons number and still not a lot of mprovement. Im well aware my implementation is the basic traditional neural network, but I was expecting better results according to other implementations ive seen in internet. I hope this is of interest for you guys, it would be really cool if you could point me new ideas about what could be the problem in my code, or maybe you think this is the best I can do with a traditional implementation and I should add something new, that would be cool too.
In any case, here is my code, I hope is readable enough, I tried to do it as simple as possible, since its my way to understand how neural networks work.
Edit:Perhaps my question is not so clear, basically I would like, if its of interest for you guys, is to help me find details in my current implementation that could improve my misclasification error below 17% because apparently its the best my implementation can do. I would be very thankful for any advise or idea, Im deeply interested in this topic, but Im a beginner and it would be great to have some smart ideas that can help me improve my implementation.
File: mnist_dataset.py - Extract the mnist data
import numpy as np
from struct import unpack
train_input_file = open("dataset/train-images-idx3-ubyte", "rb")
train_output_file = open("/dataset/train-labels-idx1-ubyte", "rb")
test_input_file = open("dataset/t10k-images-idx3-ubyte", "rb")
test_output_file = open("dataset/t10k-labels-idx1-ubyte", "rb")
def readData(f,labels = False,scale = 1):
header = hex(unpack('>L',np.fromfile(f,dtype=np.int32,count=1)[0])[0])
num = int(unpack('>L',np.fromfile(f,dtype=np.int32,count=1)[0])[0])
col = 1
row = 1
if labels == False:
row = int(unpack('>L',np.fromfile(f,dtype=np.int32,count=1)[0])[0])
col = int(unpack('>L',np.fromfile(f,dtype=np.int32,count=1)[0])[0])
data = np.zeros((int(num/scale),col*row))
for i in range(0,int(num/scale),1):
data[i] = np.fromfile(f,dtype=np.ubyte,count=col*row)
return data
def getMNISTData():
def norm(v):
return v/255
train_input = readData(train_input_file, scale=1)/255.0
train_out = readData(train_output_file, True,scale=1)
test_input = readData(test_input_file)/255.0
test_out = readData(test_output_file, True)
print "Train input: " + str(train_input.shape)
print "Train output: " + str(train_out.shape)
print "Test input: " + str(test_input.shape)
print "Test output: " + str(test_out.shape)
train_input_file.close()
train_output_file.close()
test_input_file.close()
test_output_file.close()
return (train_input,train_out,test_input,test_out)
File: NN.py - neural network implementation
import mnist_dataset
import numpy as np
import random
import matplotlib.pyplot as plt
def encode_data_10(v):
e = (0.0) * np.ones((1, 10), dtype=float)
e[:, int(v)] = 1.0
return e.tolist()
def encode_data_1(v):
n = -1.0 + ((0.2)*v)
return n
x_train, y_train, x_test, y_test = mnist_dataset.getMNISTData()
learning_rate = 1.0
iter = 3000
sample_size = 30
num_hidden_neurons = 500
num_output_neurons = 10
if num_output_neurons > 1:
y_train = np.matrix(np.array(map(encode_data_10,y_train)))
y_test = np.matrix(np.array(map(encode_data_10,y_test)))
else:
y_train = np.matrix(map(encode_data_1,y_train))
y_test = np.matrix(map(encode_data_1,y_test))
def getSample(sample_size,x,y):
r = random.sample(xrange(1, len(y), 1), sample_size)
x_r = np.zeros((sample_size,x.shape[1]))
y_r = np.zeros((sample_size,y.shape[1]))
for i,n in enumerate(r):
x_r[i] = x[n]
y_r[i] = y[n]
return (x_r,y_r)
inputVector, targetVector = getSample(sample_size, x_train, y_train)
hiddenWeights = np.mat(np.random.random((num_hidden_neurons, x_train.shape[1])))
print "W0 shape: " + str(hiddenWeights.shape)
outputWeights = np.mat(np.random.random((num_output_neurons,num_hidden_neurons)))
print "W1 shape: " + str(outputWeights.shape)
def act_func_l1(a):
return (1.0/(1 + np.exp(-a)))
def der_act_func_l1(a):
return act_func_l1(a)*(1.0 - act_func_l1(a))
def feedforward(l0):
global hiddenWeights
global outputWeights
Z1 = l0 * hiddenWeights.T
layer1 = np.matrix(act_func_l1(np.asarray(Z1)))
Z2 = layer1 * outputWeights.T
layer2 = act_func_l1(np.asarray(Z2))
return (layer1,layer2)
def miss(x,y):
layer1, layer2 = feedforward(x)
def c(n):
if n > 0.5:
return 1.0
else:
return 0.0
layer2 = map(lambda v: map(c, v), layer2)
def cc(t):
return np.abs(cmp(np.array(y[t[0]]).tolist()[0], np.array(t[1]).tolist()))
return (np.sum(map(cc, enumerate(layer2))))
miss_x = np.zeros((iter, 1))
for j in xrange(iter):
hiddenActualInput = inputVector * hiddenWeights.T
hiddenOutputVector = np.matrix(act_func_l1(np.asarray(hiddenActualInput)))
outputActualInput = hiddenOutputVector * outputWeights.T
outputVector = act_func_l1(np.asarray(outputActualInput))
layer2_error2 = np.square(outputVector - targetVector)
print "Error: " + str(np.mean(np.abs(layer2_error2)))
m = miss(x_test,y_test)
miss_x[j] = m
print str(j) + " - Misses (%): " + str(m)
if m <= 2000:
learning_rate = 0.05
outputDelta = np.mat(der_act_func_l1(np.asarray(outputVector))*np.asarray(outputVector - targetVector))
hiddenDelta = np.mat(der_act_func_l1(np.asarray(hiddenOutputVector)) * np.asarray((outputDelta*outputWeights)))
hiddenWeights = np.mat(hiddenWeights.T - (learning_rate*np.asarray(inputVector.T*hiddenDelta))).T
outputWeights = np.mat(outputWeights.T - (learning_rate*np.asarray(hiddenOutputVector.T*outputDelta))).T
inputVector, targetVector = getSample(sample_size, x_train, y_train)
plt.plot(xrange(iter), miss_x, label = 'Miss rate(%)')
plt.legend(loc='upper right')
plt.show()
Is it possible to do clustering in gensim for a given set of inputs using LDA? How can I go about it?
LDA produces a lower dimensional representation of the documents in a corpus. To this low-d representation you could apply a clustering algorithm, e.g. k-means. Since each axis corresponds to a topic, a simpler approach would be assigning each document to the topic onto which its projection is largest.
Yes you can. Here is a tutorial: http://nlp.fi.muni.cz/projekty/gensim/wiki.html#latent-dirichlet-allocation
First load you corpus, then call:
lda = gensim.models.ldamodel.LdaModel(corpus=mm, num_topics=100)
This is an example.
You need copy matutils.py and utils.py from gensim first, and the directory
should like the pic blow.
utils.py
matutils.py
doc_similar.py
model(dir)
data(dir)
The code blow should be in doc_similar.py.
Then just move your data_file into directory data and change fname in function main.
#coding:utf-8
from gensim import corpora, models, similarities
import cPickle
import logging
import utils
import os
import numpy as np
import scipy
import matutils
from collections import defaultdict
data_dir = os.path.join(os.getcwd(), 'data')
work_dir = os.path.join(os.getcwd(), 'model', os.path.basename(__file__).rstrip('.py'))
if not os.path.exists(work_dir):
os.mkdir(work_dir)
os.chdir(work_dir)
logger = logging.getLogger('text_similar')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# convert to unicode
def to_unicode(text):
if not isinstance(text, unicode):
text = text.decode('utf-8')
return text
class TextSimilar(utils.SaveLoad):
def __init__(self):
self.conf = {}
def _preprocess(self):
docs = [to_unicode(doc.strip()).split()[1:] for doc in file(self.fname)]
cPickle.dump(docs, open(self.conf['fname_docs'], 'wb'))
dictionary = corpora.Dictionary(docs)
dictionary.save(self.conf['fname_dict'])
corpus = [dictionary.doc2bow(doc) for doc in docs]
corpora.MmCorpus.serialize(self.conf['fname_corpus'], corpus)
return docs, dictionary, corpus
def _generate_conf(self):
fname = self.fname[self.fname.rfind('/') + 1:]
self.conf['fname_docs'] = '%s.docs' % fname
self.conf['fname_dict'] = '%s.dict' % fname
self.conf['fname_corpus'] = '%s.mm' % fname
def train(self, fname, is_pre=True, method='lsi', **params):
self.fname = fname
self.method = method
self._generate_conf()
if is_pre:
self.docs, self.dictionary, corpus = self._preprocess()
else:
self.docs = cPickle.load(open(self.conf['fname_docs']))
self.dictionary = corpora.Dictionary.load(self.conf['fname_dict'])
corpus = corpora.MmCorpus(self.conf['fname_corpus'])
if params is None:
params = {}
logger.info("training TF-IDF model")
self.tfidf = models.TfidfModel(corpus, id2word=self.dictionary)
corpus_tfidf = self.tfidf[corpus]
if method == 'lsi':
logger.info("training LSI model")
self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary, **params)
self.similar_index = similarities.MatrixSimilarity(self.lsi[corpus_tfidf])
self.para = self.lsi[corpus_tfidf]
elif method == 'lda_tfidf':
logger.info("training LDA model")
self.lda = models.LdaMulticore(corpus_tfidf, id2word=self.dictionary, workers=8, **params)
self.similar_index = similarities.MatrixSimilarity(self.lda[corpus_tfidf])
self.para = self.lda[corpus_tfidf]
elif method == 'lda':
logger.info("training LDA model")
self.lda = models.LdaMulticore(corpus, id2word=self.dictionary, workers=8, **params)
self.similar_index = similarities.MatrixSimilarity(self.lda[corpus])
self.para = self.lda[corpus]
elif method == 'logentropy':
logger.info("training a log-entropy model")
self.logent = models.LogEntropyModel(corpus, id2word=self.dictionary)
self.similar_index = similarities.MatrixSimilarity(self.logent[corpus])
self.para = self.logent[corpus]
else:
msg = "unknown semantic method %s" % method
logger.error(msg)
raise NotImplementedError(msg)
def doc2vec(self, doc):
bow = self.dictionary.doc2bow(to_unicode(doc).split())
if self.method == 'lsi':
return self.lsi[self.tfidf[bow]]
elif self.method == 'lda':
return self.lda[bow]
elif self.method == 'lda_tfidf':
return self.lda[self.tfidf[bow]]
elif self.method == 'logentropy':
return self.logent[bow]
def find_similar(self, doc, n=10):
vec = self.doc2vec(doc)
sims = self.similar_index[vec]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for elem in sims[:n]:
idx, value = elem
print ' '.join(self.docs[idx]), value
def get_vectors(self):
return self._get_vector(self.para)
def _get_vector(self, corpus):
def get_max_id():
maxid = -1
for document in corpus:
maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) # [-1] to avoid exceptions from max(empty)
return maxid
num_features = 1 + get_max_id()
index = np.empty(shape=(len(corpus), num_features), dtype=np.float32)
for docno, vector in enumerate(corpus):
if docno % 1000 == 0:
print("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
if isinstance(vector, np.ndarray):
pass
elif scipy.sparse.issparse(vector):
vector = vector.toarray().flatten()
else:
vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
index[docno] = vector
return index
def cluster(vectors, ts, k=30):
from sklearn.cluster import k_means
X = np.array(vectors)
cluster_center, result, inertia = k_means(X.astype(np.float), n_clusters=k, init="k-means++")
X_Y_dic = defaultdict(set)
for i, pred_y in enumerate(result):
X_Y_dic[pred_y].add(''.join(ts.docs[i]))
print 'len(X_Y_dic): ', len(X_Y_dic)
with open(data_dir + '/cluser.txt', 'w') as fo:
for Y in X_Y_dic:
fo.write(str(Y) + '\n')
fo.write('{word}\n'.format(word='\n'.join(list(X_Y_dic[Y])[:100])))
def main(is_train=True):
fname = data_dir + '/brand'
num_topics = 100
method = 'lda'
ts = TextSimilar()
if is_train:
ts.train(fname, method=method ,num_topics=num_topics, is_pre=True, iterations=100)
ts.save(method)
else:
ts = TextSimilar().load(method)
index = ts.get_vectors()
cluster(index, ts, k=num_topics)
if __name__ == '__main__':
is_train = True if len(sys.argv) > 1 else False
main(is_train)
The basic thing to understand here is that clustering requires your data to be present in a format and is not concerned with how did you arrive at your data. So, whether you apply clustering on the term-document matrix or on the reduced-dimension (LDA output matrix), clustering will work irrespective of that.
Just do the other things right though, small mistakes in data formats can cost you a lot of time of research.