How to flatten multiple levels of tuples and get tab-separated elements? - python

This is a function I wrote to compute word similarity, using xlwings, a Python-Excel library. I want it to return output like the following, where the items in each row are separated by a tab so that I can easily copy/paste them into an Excel file for summing, for example:
0.9999998807907104 'casual' 1.0 1.0 29.0
0.8386740684509277 'active' 0.3333 1.0 13.0
0.776314377784729 'cardigans' 0.1667 1.0 84.0
But it actually returns a nested structure like this, which I can't copy into an Excel file for further use (such as summing the numbers):
[[0.9999998807907104, ('casual', (1.0, 1.0, 29.0))],
[0.8386740684509277, ('active', (0.3333, 1.0, 13.0))],
[0.776314377784729, ('cardigans', (0.1667, 1.0, 84.0))]]
How can I achieve that? Thank you.
def similarity(phrase, N=10):
    phrase_vec = phrase_model[phrase]
    CosDisList = []
    wb = xw.Book('file01.xlsx')
    sht = wb.sheets['sheet1']
    for a_word in phrase_model.keys():
        a_val = phrase_model[a_word]
        cos_dis = cosine_similarity(phrase_vec, a_val)
        for i in range(1, 18):
            if a_word == sht.cells(i, 1).value:
                DataFromExcel = (sht.cells(i, 2).value, sht.cells(i, 3).value, sht.cells(i, 4).value)
                DataCombined = (a_word, DataFromExcel)
                CosDisBind = [float(str(cos_dis.tolist()).strip('[[]]')), DataCombined]
                CosDisList.append(CosDisBind)
    CosDisListSort = sorted(CosDisList, key=operator.itemgetter(0), reverse=True)
    CosDisListTopN = heapq.nlargest(N, CosDisListSort)
    return CosDisListTopN

You can use the following function. Source: a blog post.
def flatten(l, ltypes=(list, tuple)):
    ltype = type(l)
    l = list(l)
    i = 0
    while i < len(l):
        while isinstance(l[i], ltypes):
            if not l[i]:
                l.pop(i)
                i -= 1
                break
            else:
                l[i:i + 1] = l[i]
        i += 1
    return ltype(l)
Then just use:
import numpy as np

abc = [[0.9999998807907104, ('casual', (1.0, 1.0, 29.0))],
       [0.8386740684509277, ('active', (0.3333, 1.0, 13.0))],
       [0.776314377784729, ('cardigans', (0.1667, 1.0, 84.0))]]

flat_list = flatten(abc)
final_array = np.array(flat_list).reshape((len(flat_list) // 5, 5)).tolist()
# [['0.9999998807907104', 'casual', '1.0', '1.0', '29.0'], ['0.8386740684509277', 'active', '0.3333', '1.0', '13.0'], ['0.776314377784729', 'cardigans', '0.1667', '1.0', '84.0']]
Now you can join individual lists:
most_final = ["\t".join(x) for x in final_array]
print(most_final[0])
Output:
0.9999998807907104 casual 1.0 1.0 29.0
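Since each entry always has the fixed shape [score, (word, (a, b, c))], you can also skip the generic flatten and build the tab-separated rows directly. A minimal sketch, assuming the same abc list as above:
# Each row is [score, (word, (a, b, c))]; unpack it explicitly
rows = ["\t".join(str(x) for x in (score, word) + values)
        for score, (word, values) in abc]
print(rows[0])
# 0.9999998807907104	casual	1.0	1.0	29.0
And if the goal is to get the numbers back into Excel anyway, xlwings can write a 2D list straight to a sheet (for example sht.range('A20').value = final_array, where 'A20' is just an example anchor cell), which avoids the copy/paste step altogether.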

Related

Difficulty getting the correct posterior value in a Naive Bayes implementation

For study purposes, I've tried to implement this "lesson" using Python but without scikit-learn or anything similar.
My attempt is the following code:
import pandas, math

training_data = [
    ['A great game','Sports'],
    ['The election was over','Not sports'],
    ['Very clean match','Sports'],
    ['A clean but forgettable game','Sports'],
    ['It was a close election','Not sports']
]

text_to_predict = 'A very close game'
data_frame = pandas.DataFrame(training_data, columns=['data','label'])
data_frame = data_frame.applymap(lambda s: s.lower() if type(s) == str else s)
text_to_predict = text_to_predict.lower()
labels = data_frame.label.unique()
word_frequency = data_frame.data.str.split(expand=True).stack().value_counts()
unique_words_set = set()
unique_words = data_frame.data.str.split().apply(unique_words_set.update)
total_unique_words = len(unique_words_set)

word_frequency_per_labels = []
for l in labels:
    word_frequency_per_label = data_frame[data_frame.label == l].data.str.split(expand=True).stack().value_counts()
    for w, f in word_frequency_per_label.iteritems():
        word_frequency_per_labels.append([w, f, l])
word_frequency_per_labels_df = pandas.DataFrame(word_frequency_per_labels, columns=['word','frequency','label'])

laplace_smoothing = 1
results = []
for l in labels:
    p = []
    total_words_in_label = word_frequency_per_labels_df[word_frequency_per_labels_df.label == l].frequency.sum()
    for w in text_to_predict.split():
        x = (word_frequency_per_labels_df.query('word == @w and label == @l').frequency.to_list()[:1] or [0])[0]
        p.append((x + laplace_smoothing) / (total_words_in_label + total_unique_words))
    results.append([l, math.prod(p)])

print(results)
result = pandas.DataFrame(results, columns=['labels','posterior']).sort_values('posterior', ascending=False).labels.iloc[0]
print(result)
In the blog lesson their results are different from mine. My results were:
[['sports', 4.607999999999999e-05], ['not sports', 1.4293831139825827e-05]]
So, what did I do wrong in my python implementation? How can I get the same results?
Thanks in advance
You haven't multiplied by the priors p(Sport) = 3/5 and p(Not Sport) = 2/5. So just updating your answers by these ratios will get you to the correct result. Everything else looks good.
So for example you implement p(a|Sports) x p(very|Sports) x p(close|Sports) x p(game|Sports) in your math.prod(p) calculation but this ignores the term p(Sport). So adding this in (and doing the same for the not sport condition) fixes things.
In code this can be achieved by:
prior = (data_frame.label == l).mean()
results.append([l,prior*math.prod(p)])
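Folding those priors in turns the question's numbers into roughly 2.76e-05 for sports and 5.72e-06 for not sports, which matches the unnormalized output of the from-scratch implementation further down. A quick standalone check with the values from the question:
# Multiply the question's likelihood products by the priors 3/5 and 2/5
print(4.607999999999999e-05 * 3 / 5)   # ~2.7648e-05  (sports)
print(1.4293831139825827e-05 * 2 / 5)  # ~5.7175e-06  (not sports)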
The answer by @nick is correct and should be awarded the bounty.
Here is an alternative implementation (from scratch, not using pandas) that also supports normalization of probabilities and words not in the training set:
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, Set

def tokenize(text: str):
    return [word.lower() for word in text.split()]

def normalize(result: Dict[str, float]):
    total = sum([v for v in result.values()])
    for k in result.keys():
        result[k] /= total

@dataclass
class Model:
    labels: Set[str] = field(default_factory=set)
    words: Set[str] = field(default_factory=set)
    prob_labels: Dict[str, float] = field(default_factory=lambda: defaultdict(float))  # P(label)
    prob_words: Dict[str, Dict[str, float]] = field(default_factory=lambda: defaultdict(lambda: defaultdict(float)))  # P(word | label) as prob_words[label][word]

    def predict(self, text: str, norm=True) -> Dict[str, float]:  # P(label | text) as model.predict(text)[label]
        result = {label: self.prob_labels[label] for label in self.labels}
        for word in tokenize(text):
            for label in self.labels:
                if word in self.words:
                    result[label] *= self.prob_words[label][word]
        if norm:
            normalize(result)
        return result

    def train(self, data):
        prob_words_denominator = defaultdict(int)
        for row in data:
            text = row[0]
            label = row[1].lower()
            self.labels.add(label)
            self.prob_labels[label] += 1.0
            for word in tokenize(text):
                self.words.add(word)
                self.prob_words[label][word] += 1.0
                prob_words_denominator[label] += 1.0
        for label in self.labels:
            self.prob_labels[label] /= len(data)
            for word in self.words:
                self.prob_words[label][word] = (self.prob_words[label][word] + 1.0) / (prob_words_denominator[label] + len(self.words))

training_data = [
    ['A great game','Sports'],
    ['The election was over','Not sports'],
    ['Very clean match','Sports'],
    ['A clean but forgettable game','Sports'],
    ['It was a close election','Not sports']
]

text_to_predict = 'A very close game'
model = Model()
model.train(training_data)
print(model.predict(text_to_predict, norm=False))
print(model.predict(text_to_predict))
print(model.predict("none of these words is in training data"))
output:
{'sports': 2.7647999999999997e-05, 'not sports': 5.7175324559303314e-06}
{'sports': 0.8286395560004286, 'not sports': 0.1713604439995714}
{'sports': 0.6, 'not sports': 0.4}
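The third prediction contains no words seen in training, so nothing multiplies the label priors and the normalized result is simply P(sports) = 3/5 and P(not sports) = 2/5. A small check (not part of the original answer), assuming the trained model above:
# With every word unknown, predict() just returns the priors after normalization
print(dict(model.prob_labels))  # expected: {'sports': 0.6, 'not sports': 0.4}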

Removing words when using the Naive Bayes NLTK classifier for survey data

I have a CSV file with survey data and I wish to perform a sentiment analysis on it.
I am using Naive Bayes to show the most informative features, but the output does not give meaningful insight. It outputs irrelevant words such as 'level' or 'of', so I tried to manually create a list of stop words that I want to remove, but I don't think it is working properly because they are still there. Here is my code:
import csv
from collections import Counter
import nltk
from nltk.corpus import stopwords

with open('/Users/Alessandra/Desktop/Dissertation Data/Survey Coding Inst.csv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    alist = []
    iterreader = iter(reader)
    next(iterreader)
    c = Counter()
    for row in iterreader:
        clean_rows = row[0].replace(",", " ").rsplit()
        clean_symbols = row[0].replace("-", "").rsplit()
        remove_words = ['of', 'Level', 'study', 'How', 'many', 'SC', '2.', '1.', '3.', '4.', '5.', '6.', '7.', '8.',
                        '9.', '10.', '11.', '12.', '13.', '14.', '15.', 'Gender', 'inconvenience', 'times', 'Agree',
                        'Experience', 'Interrupted', 'Workflow', 'Unable', 'Yes', 'No', 'Statement', 'Safety',
                        'non-UCL', 'people', 'guards', 'Stronglee', 'Disagree', 'Neutral', 'Somewhat', 'on', 'if',
                        'too', '-', 'i', '1', '2']
        # alist.append(clean_rows)
        # alist.append(clean_symbols)
        c.update(clean_rows)
        c.update(clean_symbols)

    alist.append(c)
    word_count = Counter(c)
    mostWcommon = word_count.most_common()
    for i in alist:
        if i in remove_words:
            mostWcommon.remove(i)
    print(mostWcommon)

all_words = nltk.FreqDist(w.lower() for w in alist[0])
word_features = list(all_words)[:100]
english_stop_words = stopwords.words('english')

def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(' '.join([word for word in review[0].split() if word not in english_stop_words]))
    return removed_stop_words

no_stop_words = remove_stop_words(mostWcommon)

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains {}'.format(word)] = (word in document_words)
    return features

featuresets = [(document_features(d), c) for (d, c) in mostWcommon]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(5)
OUTPUT:
Most Informative Features
contains i = True 3 : 2 = 1.6 : 1.0
contains 1 = True 1 : 3 = 1.5 : 1.0
contains i = False 2 : 3 = 1.3 : 1.0
contains 2 = True 1 : 3 = 1.2 : 1.0
contains - = True 2 : 1 = 1.2 : 1.0
contains 1 = False 2 : 1 = 1.2 : 1.0
contains 2 = False 2 : 1 = 1.1 : 1.0
contains - = False 1 : 3 = 1.0 : 1.0
contains 5. = False 1 : 4 = 1.0 : 1.0
contains disagree = False 1 : 4 = 1.0 : 1.0
Data looks like this:
('Yes', 194), ('No', 173), ('agree', 61), ('Agree', 57), ('to', 48), ('UG', 47), ('Strongly', 38), ('and', 36), ('unlikely', 36), ('Female', 34), ('-', 34),....)
As you can see, even most_common is not picking up the manual removal, hence displaying less meaningful data... Any suggestions would be appreciated.
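For reference, one likely reason the manual removal has no effect is that alist holds the Counter object itself (and mostWcommon holds (word, count) tuples), so the test i in remove_words never matches anything. A minimal sketch of filtering the counts before ranking them, assuming the Counter c and the remove_words list from the code above:
# Drop unwanted words (case-insensitively) before taking the most common entries
remove_set = {w.lower() for w in remove_words}
filtered = Counter({word: count for word, count in c.items()
                    if word.lower() not in remove_set})
mostWcommon = filtered.most_common()
print(mostWcommon[:10])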

Trouble with nested for loops to achieve results similar to a SUMIF

I'm trying to cycle through 2 lists using for loops to calculate the sum for each unique reference. I suppose I'm looking for a pythonic sumif!
# list of data ("user_ID", "contract_Number", "weight", "type")
list1 = [
    ('1','261','6.2','Input'),
    ('1','262','7.2','Input'),
    ('1','263','5.2','Input'),
    ('1','264','8.2','Input'),
    ('1','261','3.2','Input'),
    ('1','262','2.2','Input'),
    ('1','262','7.2','Input'),
    ('1','263','4.2','Input'),
    ('1','264','6.2','Input'),
    ('1','265','6.2','Input'),
    ('1','261','9.2','Input'),
    ('1','261','10.2','Input')
]

contract_list = []
# create a list of contract numbers
for data_row in list1:
    if data_row[0] == "1" and data_row[3] == "Input":
        contract_list.append(data_row[1])

# remove duplication - left with a list of unique contract numbers
contract_list = list(dict.fromkeys(contract_list))
print(contract_list)

# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
tally_list = []
tally = 0
for c in contract_list:
    for l in list1:
        if data_row[0] == '1' and data_row[1] == contract_list[0]:
            tally = tally + float(data_row[2])
    tally_list.append(tally)
print(tally_list)
I'm expecting...
['261', '262', '263', '264', '265']
[28.6, 16.6, 9.4, 14.4, 6.2]
I'm getting...
['261', '262', '263', '264', '265']
[122.40000000000002, 244.7999999999999, 367.19999999999976, 489.5999999999996, 612.0]
# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
tally_list = []
tally = 0
for c in contract_list:
    for l in list1:  #<----------
        if data_row[0] == '1' and data_row[1] == contract_list[0]:
            tally = tally + float(data_row[2])
    tally_list.append(tally)
In the marked row, it looks like you want to use the data_row variable instead of l.
Actually, try this: you also need to reset tally, and use c instead of contract_list[0] in the if statement.
# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
tally_list = []
tally = 0
for c in contract_list:
    for data_row in list1:
        if data_row[0] == '1' and data_row[1] == c:  #<----
            tally = tally + float(data_row[2])
    tally_list.append(tally)
    tally = 0  #<---
print(tally_list)
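For the sample data this prints totals of roughly 28.8, 16.6, 9.4, 14.4 and 6.2; the 28.6 expected in the question for contract 261 looks like a small arithmetic slip, as summing the four weights recorded against it shows:
# Weights recorded against contract '261' in list1
print(sum([6.2, 3.2, 9.2, 10.2]))  # ~28.8, not 28.6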
Just another approach using a defaultdict
from collections import defaultdict

list1 = [
    ('1','261','6.2','Input'),
    ('1','262','7.2','Input'),
    ('1','263','5.2','Input'),
    ('1','264','8.2','Input'),
    ('1','261','3.2','Input'),
    ('1','262','2.2','Input'),
    ('1','262','7.2','Input'),
    ('1','263','4.2','Input'),
    ('1','264','6.2','Input'),
    ('1','265','6.2','Input'),
    ('1','261','9.2','Input'),
    ('1','261','10.2','Input')
]

d = defaultdict(int)
for tup in list1:
    if tup[0] == '1' and tup[3] == 'Input':
        d[tup[1]] += float(tup[2])

contract_list = list(d)
print(contract_list)
tally_list = [format(v, '.1f') for v in d.values()]
print(tally_list)
Output:
['261', '262', '263', '264', '265']
['28.8', '16.6', '9.4', '14.4', '6.2']
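If the totals are needed as numbers rather than strings (for example, for further arithmetic), a small variation on the last two lines, assuming the d built above:
# Keep the totals numeric; round only to tidy up floating-point artifacts
tally_list = [round(v, 1) for v in d.values()]
print(tally_list)  # e.g. [28.8, 16.6, 9.4, 14.4, 6.2]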

Retrieving results from dictionary of a range

I have an ENTIREMAP which maps names to toy prices. What I want to do is create a function getResults like the one below:
#################
def getResults(name, price):
    # where name could be 'Laura' and price is 0.02
    # then from the ENTIREMAP find the key 'Laura' and if 0.02 is in the range
    # of the prices in the map i.e. since 0.02 is between (0.0, 0.05) then return
    # ('PEN', 'BLUE')
    prices = [d[name] for d in ENTIRELIST if name in d]
    if prices:
        print prices[0]
###################
GIRLTOYPRICES = {(0.0,0.05): ('PEN', 'BLUE'),
                 (0.05,0.08): ('GLASSES', 'DESIGNER'),
                 (0.08,0.12): ('TOP', 'STRIPY'),
                 }
BOYTOYPRICES = {(0.0,0.10): ('BOOK', 'HARRY POTTER'),
                (0.10,0.15): ('BLANKET', 'SOFT'),
                (0.15,0.40): ('GBA', 'GAMES'),
                }
GIRLS = ['Laura', 'Samantha']
BOYS = ['Mike','Fred']
GIRLLIST = [{girl: GIRLTOYPRICES} for girl in GIRLS]
BOYLIST = [{boy: BOYTOYPRICES} for boy in BOYS]
ENTIRELIST = GIRLLIST + BOYLIST
print ENTIRELIST
[{'Laura': {(0.0, 0.05): ('PEN', 'BLUE'), (0.08, 0.12): ('TOP', 'STRIPY'), (0.05, 0.08): ('GLASSES', 'DESIGNER')}}, {'Samantha': {(0.0, 0.05): ('PEN', 'BLUE'), (0.08, 0.12): ('TOP', 'STRIPY'), (0.05, 0.08): ('GLASSES', 'DESIGNER')}}, {'Mike': {(0.0, 0.1): ('BOOK', 'HARRY POTTER'), (0.15, 0.4): ('GBA', 'GAMES'), (0.1, 0.15): ('BLANKET', 'SOFT')}}, {'Fred': {(0.0, 0.1): ('BOOK', 'HARRY POTTER'), (0.15, 0.4): ('GBA', 'GAMES'), (0.1, 0.15): ('BLANKET', 'SOFT')}}]
Any help would be appreciated.
Kind of a weird data structure, but:
for person in ENTIRELIST:
    person_name, toys = person.items()[0]
    if person_name != name:  # inverted to reduce nesting
        continue
    for (price_min, price_max), toy in toys.items():
        if price_min <= price < price_max:
            return toy
This is simpler (and more effective):
GIRLMAP = {girl: GIRLTOYPRICES for girl in GIRLS}
BOYMAP = {boy: BOYTOYPRICES for boy in BOYS}
ENTIREMAP = dict(GIRLMAP, **BOYMAP)

for (price_min, price_max), toy in ENTIREMAP[name].items():
    if price_min <= price < price_max:
        return toy
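Putting that together, a minimal getResults wrapper around the same loop might look like this (a sketch, assuming the GIRLMAP, BOYMAP and ENTIREMAP definitions just above); the example call reflects the behaviour the question asks for:
def getResults(name, price):
    # scan this person's (low, high) price ranges and return the matching toy
    for (price_min, price_max), toy in ENTIREMAP[name].items():
        if price_min <= price < price_max:
            return toy
    return None  # no range matched

print(getResults('Laura', 0.02))  # expected: ('PEN', 'BLUE')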

load parameters from a file in Python

I am writing a Python class to model a process and I want to initialize its parameters from a file, say 'input.dat'. The format of the input file looks like this.
'input.dat' file:
Z0: 0 0
k: 0.1
g: 1
Delta: 20
t_end: 300
The code I wrote is the following. It works, but it seems redundant and inflexible. Is there a better way to do the job, such as a loop that calls readline() and then matches the keyword?
def load(self, filename="input.dat"):
    FILE = open(filename)
    s = FILE.readline().split()
    if len(s) is 3:
        self.z0 = [float(s[1]), float(s[2])]  # initial state
    s = FILE.readline().split()
    if len(s) is 2:
        self.k = float(s[1])  # kappa
    s = FILE.readline().split()
    if len(s) is 2:
        self.g = float(s[1])
    s = FILE.readline().split()
    if len(s) is 2:
        self.D = float(s[1])  # Delta
    s = FILE.readline().split()
    if len(s) is 2:
        self.T = float(s[1])  # end time
Assuming the params are coming from a safe place (made by you or users, not the internet), just make the parameters file a Python file, params.py:
Z0 = (0, 0)
k = 0.1
g = 1
Delta = 20
t_end = 300
Then in your code all you need is:
import params
fancy_calculation(10, k=params.k, delta=params.Delta)
The beauty of this is two-fold: 1) simplicity, and 2) you can use the power of Python in your parameter descriptions -- particularly useful here, for example:
k = 0.1
Delta = 20
g = 3 * k + Delta
Alternatively, you could use Python's built-in JSON or ConfigParser .INI parser modules.
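For instance, with the standard-library configparser module (a minimal sketch; it assumes the parameters are rewritten into an INI-style file, here called params.ini, with a [params] section; the module is named ConfigParser in Python 2):
# params.ini would contain, for example:
#   [params]
#   Z0 = 0 0
#   k = 0.1
#   g = 1
#   Delta = 20
#   t_end = 300
import configparser

config = configparser.ConfigParser()
config.read('params.ini')

k = config.getfloat('params', 'k')
Delta = config.getfloat('params', 'Delta')
Z0 = [float(x) for x in config.get('params', 'Z0').split()]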
If you are open to another kind of file for your parameters, I would suggest using a YAML file.
The Python library for this is PyYAML, and it is easy to use from Python.
For a better introduction, look at this Wikipedia article: http://en.wikipedia.org/wiki/YAML.
The benefit is that you can read the parameter values as lists or maps.
You would love it!
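A minimal sketch with PyYAML (assumptions: the parameters are moved into a YAML file, here called params.yaml, and PyYAML is installed with pip install pyyaml):
# params.yaml would contain, for example:
#   Z0: [0, 0]
#   k: 0.1
#   g: 1
#   Delta: 20
#   t_end: 300
import yaml

with open('params.yaml') as f:
    params = yaml.safe_load(f)

print(params['Z0'])     # [0, 0]
print(params['Delta'])  # 20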
Try the following:
def load(self, filename="input.dat"):
    d = {"Z0": "z0", "k": "k", "g": "g", "Delta": "D", "t_end": "T"}
    FILE = open(filename)
    for line in FILE:
        name, value = line.split(":")
        value = value.strip()
        if " " in value:
            value = map(float, value.split())
        else:
            value = float(value)
        setattr(self, d[name], value)
Proof that it works:
>>> class A(object): pass
...
>>> a = A()
>>> load(a)
>>> a.__dict__
{'k': 0.10000000000000001, 'z0': [0.0, 0.0], 'D': 20.0, 'g': 1.0, 'T': 300.0}
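One small caveat if this is run under Python 3 (the session above is Python 2): map() returns a lazy iterator there, so the multi-value branch is better written as a list, e.g.:
value = [float(x) for x in value.split()]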
As others have mentioned, in Python you can create object attributes dynamically "on the fly". That means you could do something like the following to create Params objects as they're read in. I've tried to make the code as data-driven as possible, so it stays relatively flexible.
# maps label to attribute name and types
label_attr_map = {
    "Z0:": ["z0", float, float],
    "k:": ["k", float],
    "g:": ["g", float],
    "Delta:": ["D", float],
    "t_end:": ["T", float]
}

class Params(object):
    def __init__(self, input_file_name):
        with open(input_file_name, 'r') as input_file:
            for line in input_file:
                row = line.split()
                label = row[0]
                data = row[1:]  # rest of row is data list
                attr = label_attr_map[label][0]
                datatypes = label_attr_map[label][1:]
                values = [(datatypes[i](data[i])) for i in range(len(data))]
                self.__dict__[attr] = values if len(values) > 1 else values[0]
params = Params('input.dat')
print 'params.z0:', params.z0
print 'params.k:', params.k
print 'params.g:', params.g
print 'params.D:', params.D
print 'params.T:', params.T
Output:
params.z0: [0.0, 0.0]
params.k: 0.1
params.g: 1.0
params.D: 20.0
params.T: 300.0
Perhaps this might give you what you need:
def load(self,filename='input.dat'):
with open(filename) as fh:
for line in fh:
s = line.split()
if len(s) == 2:
setattr(self,s[1],s[2])
elif len(s) == 3:
setattr(self,s[1],s[2:])
I also didn't include any error checking, but setattr is very handy.
Something like this:
def load(self, filename="input.dat"):
    # maps names to number of fields they need
    # only necessary for variables with more than 1 field
    argmap = dict(Z0=2)
    # maps config file names to their attribute names on the object
    # if name is the same both places, no need
    namemap = dict(Z0="z0", Delta="D", t_end="T")
    with open(filename) as FILE:
        for line in FILE:
            s = line.split()
            var = s[0].rstrip(":")
            try:
                val = [float(x) for x in s[1:]]
            except ValueError:
                continue
            if len(val) == argmap.get(var, 1):
                if len(val) == 1:
                    val = val[0]
                setattr(self, namemap.get(var, var), val)
Python objects have a built-in __dict__ member. You can modify it, and then refer to properties as obj.key.
class Data(object):
    def __init__(self, path='infile.dat'):
        with open(path, 'r') as fo:
            for line in fo.readlines():
                if len(line) < 2: continue
                parts = [s.strip(' :\n') for s in line.split(' ', 1)]
                numbers = [float(s) for s in parts[1].split()]
                # This is optional... do you want single values to be stored in lists?
                if len(numbers) == 1: numbers = numbers[0]
                self.__dict__[parts[0]] = numbers
                # print parts -- debug

obj = Data('infile.dat')
print obj.g
print obj.Delta
print obj.Z0
At the end of this, we print out a few of the keys. Here's the output of those.
1.0
20.0
[0.0, 0.0]
For consistency, you can remove the line marked "optional" in my code, and have all objects in lists -- regardless of how many elements they have. That will make using them quite a bit easier, because you never have to worry about obj.g[0] returning an error.
Here's another one
def splitstrip(s):
    return s.split(':')[1].strip()

# assumes `a` is the object whose attributes are being set
with open('input.dat','r') as f:
    a.z0 = [float(x) for x in splitstrip(f.readline()).split(' ')]
    a.k, a.g, a.D, a.T = tuple([float(splitstrip(x)) for x in f.read().rstrip().split('\n')])
;)
