Python 3 equivalent to code involving tuples passed to a function - python

I have code that calculates movie similarities from a dataset consisting of ratings.dat and movies.dat. The code, however, is written in Python 2.7.
I tried converting the code to Python 3 but was unable to get the desired results. I need some expert help to review whether there are any mistakes in the code.
Below is the part of the code which I need to convert to Python 3:
def makePairs((user, ratings)):
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return ((movie1, movie2), (rating1, rating2))

def filterDuplicates( (userID, ratings) ):
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return movie1 < movie2
and also this:
# Filter for movies with this sim that are "good" as defined by
# our quality thresholds above
filteredResults = moviePairSimilarities.filter(lambda((pair,sim)): \
    (pair[0] == movieID or pair[1] == movieID) \
    and sim[0] > scoreThreshold and sim[1] > coOccurenceThreshold)

# Sort by quality score.
results = filteredResults.map(lambda((pair,sim)): (sim, pair)).sortByKey(ascending = False).take(10)
The complete code follows. It is run with:
spark-submit mycodefile.py 50
Here is the code in Python 2.7:
import sys
from pyspark import SparkConf, SparkContext
from math import sqrt
def loadMovieNames():
    movieNames = {}
    with open("movies.dat") as f:
        for line in f:
            fields = line.split("::")
            movieNames[int(fields[0])] = fields[1].decode('ascii', 'ignore')
    return movieNames

def makePairs((user, ratings)):
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return ((movie1, movie2), (rating1, rating2))

def filterDuplicates( (userID, ratings) ):
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return movie1 < movie2

def computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))
    return (score, numPairs)
conf = SparkConf()
sc = SparkContext(conf = conf)
print("\nLoading movie names...")
nameDict = loadMovieNames()
data = sc.textFile("ratings.dat")
# Map ratings to key / value pairs: user ID => movie ID, rating
ratings = data.map(lambda l: l.split("::")).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))
# Emit every movie rated together by the same user.
# Self-join to find every combination.
ratingsPartitioned = ratings.partitionBy(100)
joinedRatings = ratingsPartitioned.join(ratingsPartitioned)
# At this point our RDD consists of userID => ((movieID, rating), (movieID, rating))
# Filter out duplicate pairs
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)
# Now key by (movie1, movie2) pairs.
moviePairs = uniqueJoinedRatings.map(makePairs).partitionBy(100)
# We now have (movie1, movie2) => (rating1, rating2)
# Now collect all ratings for each movie pair and compute similarity
moviePairRatings = moviePairs.groupByKey()
# We now have (movie1, movie2) = > (rating1, rating2), (rating1, rating2) ...
# Can now compute similarities.
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).persist()
# Save the results if desired
moviePairSimilarities.sortByKey()
moviePairSimilarities.saveAsTextFile("movie-sims")
# Extract similarities for the movie we care about that are "good".
if (len(sys.argv) > 1):
    scoreThreshold = 0.97
    coOccurenceThreshold = 1000
    movieID = int(sys.argv[1])

    # Filter for movies with this sim that are "good" as defined by
    # our quality thresholds above
    filteredResults = moviePairSimilarities.filter(lambda((pair,sim)): \
        (pair[0] == movieID or pair[1] == movieID) \
        and sim[0] > scoreThreshold and sim[1] > coOccurenceThreshold)

    # Sort by quality score.
    results = filteredResults.map(lambda((pair,sim)): (sim, pair)).sortByKey(ascending = False).take(10)

    print("Top 10 similar movies for " + nameDict[movieID])
    for result in results:
        (sim, pair) = result
        # Display the similarity result that isn't the movie we're looking at
        similarMovieID = pair[0]
        if (similarMovieID == movieID):
            similarMovieID = pair[1]
        print(nameDict[similarMovieID] + "\tscore: " + str(sim[0]) + "\tstrength: " + str(sim[1]))
Any help is much appreciated.
Regards.
What I have already done is convert this code to a Python 3 equivalent as follows, but I am unable to get the desired results.
import sys
from pyspark import SparkConf, SparkContext
from math import sqrt
def loadMovieNames():
    movieNames = {}
    with open("movies.dat") as f:
        for line in f:
            fields = line.split("::")
            movieNames[int(fields[0])] = fields[1] #.decode('ascii', 'ignore')
    return movieNames

def makePairs(*ratings):
    for t in ratings:
        (movie1, rating1) = t[1][0]
        (movie2, rating2) = t[1][1]
    return ((movie1, movie2), (rating1, rating2))

def filterDuplicates(*ratings):
    for t in ratings:
        (movie1, rating1) = t[1][0]
        (movie2, rating2) = t[1][1]
    return movie1 < movie2

def computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))
    return (score, numPairs)
conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")
sc = SparkContext(conf = conf)
print("\nLoading movie names...")
nameDict = loadMovieNames()
print("\nLoading movie ratings...")
data = sc.textFile("ratings100.dat")
print("\nDone..")
# Map ratings to key / value pairs: user ID => movie ID, rating
ratings = data.map(lambda l: l.split("::")).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))
# Emit every movie rated together by the same user.
# Self-join to find every combination.
ratingsPartitioned = ratings.partitionBy(100)
joinedRatings = ratingsPartitioned.join(ratingsPartitioned)
#joinedRatings = ratings.join(ratings)
# At this point our RDD consists of userID => ((movieID, rating), (movieID, rating))
# Filter out duplicate pairs
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)
# Now key by (movie1, movie2) pairs.
moviePairs = uniqueJoinedRatings.map(makePairs).partitionBy(100)
# We now have (movie1, movie2) => (rating1, rating2)
# Now collect all ratings for each movie pair and compute similarity
moviePairRatings = moviePairs.groupByKey()
# We now have (movie1, movie2) = > (rating1, rating2), (rating1, rating2) ...
# Can now compute similarities.
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).persist()
# Save the results if desired
moviePairSimilarities.sortByKey()
moviePairSimilarities.saveAsTextFile("movie-sims")
# Extract similarities for the movie we care about that are "good".
if (len(sys.argv) > 1):
    scoreThreshold = 0.9
    coOccurenceThreshold = 1000
    movieID = int(sys.argv[1])

    # Filter for movies with this sim that are "good" as defined by
    # our quality thresholds above
    filteredResults = moviePairSimilarities.filter(lambda pairSim: (pairSim[0][0] == movieID or pairSim[0][1] == movieID) and pairSim[1][0] > scoreThreshold and pairSim[1][1] > coOccurenceThreshold)

    # Sort by quality score.
    results = filteredResults.map(lambda pairSim: (pairSim[1], pairSim[0])).sortByKey(ascending = False).take(10)

    print("Top 10 similar movies for " + str(nameDict[movieID]))
    for result in results:
        (sim, pair) = result
        # Display the similarity result that isn't the movie we're looking at
        similarMovieID = pair[0]
        if (similarMovieID == movieID):
            similarMovieID = pair[1]
        print(nameDict[similarMovieID] + "\tscore: " + str(sim[0]) + "\tstrength: " + str(sim[1]))
Below is the expected result, which should show the top 10 similar movies.
Top 10 similar movies for Wizard of Oz, The (1939)
Toy Story (1995) score: 661 strength: 1545
Some Other Movie score: 594 strength: 720
Another Movie score: 2018 strength: 2804

def f(*tuplex) is not the same as def f((x, y)); it is (more or less) the same as def f(x, y). That is, the first (Python 3) function receives a tuple of non-keyword arguments, while the second (Python 2) receives a single tuple argument. Since you're passing a single element (which happens to be a tuple), tuplex will be a tuple of one element (and the resulting for t in tuplex will only iterate once). You should make it def f(xy), where xy will be your (x, y) tuple.
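A minimal standalone sketch of that difference; the record shape and values below are made up to mirror the (userID, ((movieID, rating), (movieID, rating))) elements the join produces here:

record = (1, ((10, 4.0), (20, 3.5)))  # hypothetical joined-ratings element

def starred(*args):
    # args is a one-element tuple containing the whole record
    return len(args), args[0][0]

def unpacked(user_ratings):
    # explicit unpacking replaces the Python 2 tuple parameter
    user, ratings = user_ratings
    return user, ratings[0], ratings[1]

print(starred(record))   # (1, 1)
print(unpacked(record))  # (1, (10, 4.0), (20, 3.5))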
Your Python 2 code:
def makePairs((user, ratings)):
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return ((movie1, movie2), (rating1, rating2))
The actual compatible Python 3 code:
def makePairs(user_ratings):
    _, ratings = user_ratings
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return ((movie1, movie2), (rating1, rating2))
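For instance, called on a single hypothetical joined record it gives the same result as the Python 2 version:
>>> makePairs((1, ((10, 4.0), (20, 3.5))))
((10, 20), (4.0, 3.5))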
As also mentioned somewhere in the comments, you can replace this whole function by a simple zip call, for example:
>>> a = (('movie1', 'rating1'), ('movie2', 'rating2'))
>>> list(zip(*a))
[('movie1', 'movie2'), ('rating1', 'rating2')]
(You don't need list(...) if you just need to return an iterator, but then the actual contents aren't shown on the command line. So leave out the call to list(...) in your actual code unless you get an error about a "zip object".)
The unfortunate parts here are that
- the map method where you use makePairs passes only a function, so you can't specify the asterisk.
- you need to get rid of the first argument, user.
You could probably use the following:
moviePairs = uniqueJoinedRatings.map(lambda x: zip(*x[1])).partitionBy(100)
(untested)
That gets rid of the complete makePairs function, at the cost of some clarity.
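As a quick sanity check outside Spark (hypothetical record again, wrapped in tuple(...) only so the contents are visible, per the note above):
>>> f = lambda x: zip(*x[1])
>>> tuple(f((1, ((10, 4.0), (20, 3.5)))))
((10, 20), (4.0, 3.5))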
One last tidbit: make_pairs follows the style guide; makePairs is not Python style. The same goes for all the other names in your code. Since you mentioned the word review at the top of your question: that is probably more a matter for Code Review.

Related

How to read data in a class to solve a tabu search problem

I'm trying to learn search algorithms to prepare for my master's thesis. I have a TSP problem in which I want to find the best (minimal) route to visit all the states. I'm using a .txt file named cities__coordinates.txt that contains the coordinates of every state. To read the data I found this source code, which has a class for reading the data; you'll find it here:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import copy
import re
import math
class Data():
'''
the format of solomon dataset
'''
def __init__(self):
self.customerNum = 0 # the number of customers
self.nodeNum = 0 # the sum of customers and depots
self.vehicleNum = 0
self.capacity = 0
self.cor_X = []
self.cor_Y = []
self.demand = []
self.readyTime = []
self.dueTime = []
self.serviceTime = []
self.disMatrix = {}
def read_data(self, path, customerNum, depotNum):
'''
function to read solomom data from .txt files, notice that it must be solomon dataset
INPUT
# data : class Data
# path : Data path
# customerNum : the number of customer
OutPut : none
'''
self.customerNum = customerNum
self.nodeNum = customerNum + depotNum
f = open('cities__coordinates.txt', 'r')
lines = f.readlines()
count = 0
for line in lines:
count = count + 1
if(count == 5):
line = line[:-1].strip()
str = re.split(r" +", line)
self.vehicleNum = float(str[0])
self.capacity = float(str[1])
elif(count >= 10 and count <= 10 + customerNum):
line = line[:-1]
str = re.split(r" +", line)
self.cor_X.append(float(str[2]))
self.cor_Y.append(float(str[3]))
self.demand.append(float(str[4]))
self.readyTime.append(float(str[5]))
self.dueTime.append(float(str[6]))
self.serviceTime.append(float(str[7]))
# compute the distance matrix
self.disMatrix = {}
for i in range(0, self.nodeNum):
dis_temp={}
for j in range(0, self.nodeNum):
dis_temp[j] = int(math.hypot(self.cor_X[i] - self.cor_X[j],self.cor_Y[i] - self.cor_Y[j]))
self.disMatrix[i] = dis_temp
def plot_nodes(self):
'''
Description: function to plot
'''
Graph = nx.DiGraph()
nodes_name = [str(x) for x in list(range(self.nodeNum))]
Graph.add_nodes_from(nodes_name)
cor_xy = np.array([self.cor_X,self.cor_Y]).T.astype(int)
pos_location = {nodes_name[i]:x for i,x in enumerate(cor_xy)}
nodes_color_dict = ['r'] + ['gray'] * (self.nodeNum-1)
nx.draw_networkx(Graph,pos_location,node_size=200,node_color=nodes_color_dict,labels=None)
plt.show(Graph)
def plot_route(self,route,color='k'):
Graph = nx.DiGraph()
nodes_name = [0]
cor_xy=[[self.cor_X[0] , self.cor_Y[0]]]
edge = []
edges = [[0,route[0]]]
for i in route :
nodes_name.append(i)
cor_xy.append([self.cor_X[i] , self.cor_Y[i]])
edge.append(i)
if len(edge) == 2 :
edges.append(copy.deepcopy(edge))
edge.pop(0)
edges.append([route[-1],0])
Graph.add_nodes_from(nodes_name)
Graph.add_edges_from(edges)
pos_location = {nodes_name[i]:x for i,x in enumerate(cor_xy)}
nodes_color_dict = ['r'] + ['gray'] * (len(route))
nx.draw_networkx(Graph,pos_location,node_size=200,node_color=nodes_color_dict,edge_color=color, labels=None)
plt.show(Graph)
So in the read_data function I've changed the path to my .txt file. As for the code that calculates all the distances, runs the tabu search, and everything else, here it is:
from itertools import combinations
import os,sys,copy
import numpy as np
import time
from Datareader import Data
import matplotlib.pyplot as plt
class Tabu():
def __init__(self,disMatrix,max_iters=200,maxTabuSize=20):
"""parameters definition"""
self.disMatrix = disMatrix
self.maxTabuSize = maxTabuSize
self.max_iters = max_iters
self.tabu_list=[]
def get_route_distance(self,route):
'''
Description: function to calculate total distance of a route. evaluate function.
parameters: route : list
return : total distance : folat
'''
routes = [0] + route + [0] # add the start and end point
total_distance = 0
for i,n in enumerate(routes):
if i != 0 :
total_distance = total_distance + self.disMatrix[last_pos][n]
last_pos = n
return total_distance
def exchange(self,s1,s2,arr):
"""
function to Swap positions of two elements in an arr
Args: int,int,list
s1 : target 1
s2 : target 2
arr : target array
Ouput: list
current_list : target array
"""
current_list = copy.deepcopy(arr)
index1 , index2 = current_list.index(s1) , current_list.index(s2) # get index
current_list[index1], current_list[index2]= arr[index2] , arr[index1]
return current_list
def generate_initial_solution(self,num=10,mode='greedy'):
"""
function to get the initial solution,there two different way to generate route_init.
Args:
num : int
the number of points
mode : string
"greedy" : advance step by choosing optimal one
"random" : randomly generate a series number
Ouput: list
s_init : initial solution route_init
"""
if mode == 'greedy':
route_init=[0]
for i in range(num):
best_distance = 10000000
for j in range(num+1):
if self.disMatrix[i][j] < best_distance and j not in route_init:
best_distance = self.disMatrix[i][j]
best_candidate = j
route_init.append(best_candidate)
route_init.remove(0)
if mode == 'random':
route_init = np.arange(1,num+1) #init solution from 1 to num
np.random.shuffle(route_init) #shuffle the list randomly
return list(route_init)
def tabu_search(self,s_init):
"""tabu search"""
s_best = s_init
bestCandidate = copy.deepcopy(s_best)
routes , temp_tabu = [] , [] # init
routes.append(s_best)
while(self.max_iters):
self.max_iters -= 1 # Number of iterations
neighbors = copy.deepcopy(s_best)
for s in combinations(neighbors, 2):
sCandidate = self.exchange(s[0],s[1],neighbors) # exchange number to generate candidates
if s not in self.tabu_list and self.get_route_distance(sCandidate) < self.get_route_distance(bestCandidate):
bestCandidate = sCandidate
temp_tabu = s
if self.get_route_distance(bestCandidate) < self.get_route_distance(s_best): # record the best solution
s_best = bestCandidate
if temp_tabu not in self.tabu_list:
self.tabu_list.append(temp_tabu)
if len(self.tabu_list) > self.maxTabuSize :
self.tabu_list.pop(0)
routes.append(bestCandidate)
return s_best, routes
if __name__ == "__main__":
data = Data()
data.read_data(path='cities__coordinates.txt',customerNum=100,depotNum=1) # change the path
""" Tabu :
disMatrix : the distance matrix from 0 to X , 0 represernt starting and stopping point。
for example: disMatrix = [[0,3,4,...
1,0,5,...
3,5,0,...]]
that means the distance from 0 to 0 is 0, from 0 to 1 is 3,... from 1 to 3 is 5....
max_iters : maximum iterations
maxTabuSize : maximum iterations
"""
tsp = Tabu(disMatrix=data.disMatrix,max_iters=10,maxTabuSize=10)
# two different way to generate initial solution
# num : the number of points
s_init = tsp.generate_initial_solution(num=10,mode='greedy') # mode = "greedy" or "random"
print('init route : ' , s_init)
print('init distance : ' , tsp.get_route_distance(s_init))
start = time.time()
best_route , routes = tsp.tabu_search(s_init) # tabu search
end = time.time()
print('best route : ' , best_route)
print('best best_distance : ' , tsp.get_route_distance(best_route))
print('the time cost : ',end - start )
# plot the result changes with iterations
results=[]
for i in routes:
results.append(tsp.get_route_distance(i))
plt.plot(np.arange(len(results)) , results)
plt.show()
# plot the route
data.plot_route(best_route)
When I execute it, it takes a little time and then shows me this error:
Traceback (most recent call last):
File "C:/Users/malle/OneDrive/Desktop/TS.py", line 100, in <module>
data.read_data(path='cities__coordinates.txt',customerNum=100,depotNum=1) # change the path
File "C:/Users/malle/OneDrive/Desktop\Datareader.py", line 49, in read_data
self.cor_X.append(float(str[2]))
IndexError: list index out of range
Can anyone help resolve this problem, please?

Making Excel histograms using python

This is the output of my python script so far.
Excel Table
The vertical axis of the table holds road names. The horizontal axis holds dates. The values indicate whether a road was under construction at the time and why. I'd like to make a line graph that groups the dates by year (2017, 2018, 2019, etc.) and plots the longest amount of time within each group that a road was under construction, as well as the average amount for the whole year. I'm a complete novice in Excel and don't know how to leverage its features to achieve my goal, though I suspect there may be built-in functions that do what I want without much difficulty. Any suggestions on how I can achieve my desired output would be much appreciated. EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp
workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})
sheet = workBook.add_worksheet()
def to_raw(string):
return fr"{string}"
def cvrt(x):
ans = re.split(r'(\d+)(?!.*\d)', x)
return int(ans[1])
def indexer(s):
pattern = re.compile(r'I, [0-9]+, ')
gm = re.split(pattern, s);
values = s.rsplit(gm[1])
gm = gm[1]
values[1] = gm
return values
def int2Date(x):
string = str(x)
Y = int(string[0:4])
M = int(string[4:6])
D = int(string[6:8])
return DATE(Y,M,D)
def dDelta(x, y):
string1 = str(x)
string2 = str(y)
Y1 = int(string1[0:4])
M1 = int(string1[4:6])
D1 = int(string1[6:8])
Y2 = int(string2[0:4])
M2 = int(string2[4:6])
D2 = int(string2[6:8])
f_date = DATE(Y1,M1,D1)
l_date = DATE(Y2,M2,D2)
delta = l_date - f_date
if isinstance(y, int):
return float(int((delta.days)/30.44))
else:
return int((delta.days)/30.44)
def Book(path):
file = open(path,'r')
lines = file.readlines()
file.close()
book = IndexedOrderedDict()
for line in lines:
if re.match("I", line):
IDs = indexer(line)[1]
if re.match(" 0.00,", line):
rID = line
#"GM_FINAL_AUTH,0,[1-9]"
if re.search("GM_FINAL_AUTH,0,[1-9]", line):
book.update({(rID, line): to_raw(IDs)})
return sort_book(book)
def dUpdate(dic, key, value):
return dic.update({(key[0], "GM_FINAL_AUTH,0,0"): value})
def valSplt(s):
pattern = re.compile(r'(\d+)')
gm = re.split(pattern, s)
values = s.rsplit(gm[1])
gm = gm[1]
values[1] = gm
return values
def sort_book(book):
book = natsorted([value, key] for key, value in book.items())
book = IndexedOrderedDict((data[1], data[0]) for data in book)
return book
def alph_order(word1, word2):
for i in range(min(len(word1), len(word2))):
if ord(word1[i]) == ord(word2[i]):
pass
elif ord(word1[i]) > ord(word2[i]):
return word2
else:
return word1
return word1
def read(cpdm, date_list):
sCnt = [0] * len(cpdm)
lowest_number = 999999999999
terminationCondition = [True] * len(cpdm)
saved_results = [0] * len(cpdm)
current_prefix = None
cnt = 0
while any(terminationCondition) is True:
saved_results = [0] * len(cpdm)
last_prefix = None
lowest_number = 999999999999
for dicIdx, dicVal in enumerate(sCnt):
if dicVal < len(cpdm[dicIdx]):
ID = cpdm[dicIdx].values()[dicVal]
# print(entry)
current_prefix, road_number = valSplt(ID)
road_number = int(road_number)
if last_prefix is None:
last_prefix = current_prefix
higherOrder_prefix = alph_order(last_prefix, current_prefix)
# print('check:',[higherOrder_prefix, last_prefix, current_prefix])
if current_prefix == higherOrder_prefix:
if current_prefix != last_prefix:
lowest_number = road_number
last_prefix = current_prefix
elif road_number < lowest_number:
lowest_number = road_number
last_prefix = current_prefix
for dicIdx, dicVal in enumerate(sCnt):
if dicVal < len(cpdm[dicIdx]):
# print(dicIdx, dicVal, len(cpdm[dicIdx]))
ID = cpdm[dicIdx].values()[dicVal]
VALUE = cpdm[dicIdx].keys()[dicVal]
# print(entry)
road_name, road_number = valSplt(ID)
road_number = int(road_number)
if road_name == last_prefix and lowest_number == road_number:
saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
if dicVal < len(cpdm[dicIdx]):
sCnt[dicIdx] += 1
else:
terminationCondition[dicIdx] = False
else:
terminationCondition[dicIdx] = False
for rst in range(len(saved_results)):
if saved_results[rst] == 0:
pass
else:
sheet.write(cnt+1, 0, str(saved_results[rst][0]))
sheet.write(cnt+1, rst+1, cvrt(saved_results[rst][1]))
#sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
#sheet.write(cnt+1, 0, saved_results[rst][3])
cnt += 1
def main():
# 2018 MAPS
path1 = "W:\\Scripting\\2018\\DBData_84577881.txt"
path2 = "W:\\Scripting\\2018\\DBData_84639568.txt"
path3 = "W:\\Scripting\\2018\\DBData_84652483.txt"
path4 = "W:\\Scripting\\2018\\DBData_84670490.txt"
# 2019 MAPS
path5 = "W:\\Scripting\\2019\\DBData_84706383.txt"
path6 = "W:\\Scripting\\2019\\DBData_84715201.txt"
path7 = "W:\\Scripting\\2019\\DBData_84743195.txt"
path8 = "W:\\Scripting\\2019\\DBData_84777742.txt"
path9 = "W:\\Scripting\\2019\\DBData_84815446.txt"
path10 = "W:\\Scripting\\2019\\DBData_84835743.txt"
# 2020 MAPS
path11 = "W:\\Scripting\\2020\\DBData_84882849.txt"
path12 = "W:\\Scripting\\2020\\DBData_84966202.txt"
path13 = "W:\\Scripting\\2020\\DBData_84988789.txt"
p_list = [path1, path2, path3, path4, path5, path6, path7,
path8, path9, path10, path11, path12, path13]
pool = mp.Pool(mp.cpu_count())
CPDM = pool.map(Book, p_list)
pool.close()
#pool.join()
date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
20190501, 20190628, 20190815, 20190925, 20200207, 20200501, 20200617]
#CPDM = [b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13]
for i in CPDM:
print(len(i))
#sheet.write("A1", "Lat Long")
sheet.write("A1", "ID")
#for i in range(len(CPDM)):
cn = 0
for i in date_list:
#sheet.write(0, 3*i+1, "ID" + str(i+1))
sheet.write(0, cn+1, int2Date(i), format1)
cn += 1
#sheet.write(0, 2*i+3, "Date" + str(i+1))
read(CPDM, date_list)
workBook.close()
if __name__ == "__main__":
main()
executionTime = (time.time() - startTime)
print('Execution time in minutes: ' + str(executionTime/60))
Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices from your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those 2 dates as closed? Or only until new years?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px
# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")
# Flip the dataframe (dates should be on the index)
df = df.transpose()
# Fill any empty cells with 0
df = df.fillna(0)
# Combine columns with the same name
df = df.groupby(df.columns, axis=1).agg(lambda column: column.max(axis=1))
# Make sure the dates are sorted
df = df.sort_index()
# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
    # Group by consecutive 1's
    groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
    # Every group denotes a period for which the road was under construction
    for _, group in groups:
        # Get the start and finish for each group
        roads.append({
            "road": road_name,
            "start": group.index[0],
            "finish": group.index[-1] + pandas.Timedelta(1, unit="D"),  # Add one day because groups with same start and finish will not be visible on the plot
        })
# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)
# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.show()

How do I improve my code to make it run faster?

I am conducting a data science project to analyse large volumes of cancer genome data. My computer is relatively underpowered, with a slow CPU and little RAM, so running through all the samples takes far too long.
I have tried reducing excess code, replacing for loops with list comprehensions, and using multiprocessing to split up my tasks so they run faster.
import re
import xlrd
import os
import time
from multiprocessing import Pool
import collections
import pandas as pd
if os.path.exists("C:\\Users\\js769\\genomemutations\\Input\\ChromosomesVersion") == True:
print("chromosomes in folder")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\Input\\ChromosomesVersion")
print(
"Chromosome Folder Created, Please transfer current version of chromosome number base data to new file."
)
if os.path.exists("C:\\Users\\js769\\genomemutations\\Input\\MutationSamples") == True:
print("Add sample data to run.")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\Input\\MutationSamples")
print("Mutation Sample Folder Created, please add mutation sample data to folder.")
if os.path.exists("C:\\Users\\js769\\genomemutations\\output") == True:
print("3")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\output")
# Require editing of this so it works both on a mac or windows system. Currently this version suited to mac because of higher processing power.
# Require ability to check to see if error occurs
def Main(Yeram):
import os
import glob
import errno
import shutil
import xlrd
import pandas as pd
import time
import re
import numpy as np
FragmentSize = 10000000 # This is fragment size which is adjustable.
# Code not needed
Position1 = Yeram.vectx
Position2 = Yeram.vecty
samplelist = Yeram.samplelist
dictA = Yeram.dictA
FragmentSize = Yeram.FragmentSize
chromosomesizes = Yeram.chromosomesizes
def chromosomex_mutation_data(
chromosomenumber, mutationlist
): # It selects the correct chromosome mutation point data, then it selects the data before the -. Mutation data in form(12-20)
chromosomexlist = ["0-1"]
for mutationposition in mutationlist:
if mutationposition[0:2] == str(chromosomenumber):
chromosomexlist.append(mutationposition[3:])
elif mutationposition[0:2] == (str(chromosomenumber) + ":"):
chromosomexlist.append(mutationposition[2:])
else:
continue
Puremutationdatapoints = [int(mutationposition.split("-")[0]) for mutationposition in chromosomexlist]
return Puremutationdatapoints
def Dictionary_Of_Fragment_mutation(FragmentSize, MutationData, ChromosomeNumber): #
chromosomes = {} # Dictionary
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
# Opening up specific chromosome data and calculating amount of bases present in chromosome
Number_of_fragments = int(chromosomesize / FragmentSize)
for mutation in MutationData:
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
if mutation in range(i * FragmentSize + 1, i * FragmentSize + FragmentSize + 1):
if chromosomes.get(a) == None:
chromosomes.update({a: 1})
else:
b = (chromosomes.get(a)) + 1
chromosomes.update({a: b})
else:
if chromosomes.get(a) == None:
chromosomes.update({a: 0})
else:
continue
return chromosomes # adds
# This adds mutations or no mutation to each fragment for chromosome,makes dicitonaries
def DictionaryRead(FragmentSize, Dict, ChromosomeNumber):
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
Number_of_fragments = int(chromosomesize / FragmentSize)
chromosomefragmentlist = []
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
chromosomefragmentlist.append(str(Dict.get((a))))
return chromosomefragmentlist
# This uses dictionary to create list
def forwardpackage2(FragmentSize, PureMutationData):
C = [] # list of data in numerical order 0 = no mutation
for i in range(1, 23, 1):
A = chromosomex_mutation_data(i, PureMutationData) # Purifies Data
B = Dictionary_Of_Fragment_mutation(FragmentSize, A, i) # Constructs Dictionary
C += DictionaryRead(
FragmentSize, B, i
) # Uses constructed Dictionary amd generates list of numbers, each number being a fragment in numerical order.
return C
def Mutationpointdata(Position1, Position2, dictA, FragmentSize): # Require dictA
vectx = Position1
vecty = Position2
Samplesandmutationpoints = []
for i in range(vectx, vecty):
print(samplelist[i])
new = [k for k, v in dictA.items() if int(v) == samplelist[i]]
mutationlist = [excelsheet.cell_value(i, 23) for i in new]
mutationlist.sort()
Samplesandmutationpoints.append(forwardpackage2(FragmentSize, mutationlist))
return Samplesandmutationpoints
# Opening sample data from excel table
return Mutationpointdata(Position1, Position2, dictA, FragmentSize) # yeram to james samples
def ChromosomeSequenceData(ChromosomeNumber): # Formats the chromosome file into readable information
with open(
r"C:\Users\js769\genomemutations\Input\ChromosomesVersion\chr" + str(ChromosomeNumber) + ".fa"
) as text_file:
text_data = text_file.read()
listA = re.sub("\n", "", text_data)
# list2=[z for z in text_data if z!= "\n"]
if ChromosomeNumber < 10:
ChromosomeSequenceData = listA[5:]
else:
ChromosomeSequenceData = listA[6:]
return ChromosomeSequenceData
def basepercentage_single(
i, FragmentSize, ChromosomeSequenceData
): # Creates a list of base percentage known for certain type of chromosome.
sentence = ChromosomeSequenceData[(i * FragmentSize + 1) : (i * FragmentSize + FragmentSize)]
a = sentence.count("N") + sentence.count("n")
c = str(((FragmentSize - a) / FragmentSize) * 100) + "%"
return c
def basepercentage_multiple(
FragmentSize, ChromosomeSequenceData
): # Creates a a list of base percentages known which correspond with the dna fragments for every chromosome.
fragmentamount = int(len(ChromosomeSequenceData) / FragmentSize)
list = [
basepercentage_single(i, FragmentSize, ChromosomeSequenceData) for i in range(0, (fragmentamount), 1)
]
return list
def FragmentEncodedPercentage(
FragmentSize
): # Packages a list of base percentages known which correspond with the dna fragments for every chromosome.
Initial_list = [basepercentage_multiple(FragmentSize, ChromosomeSequenceData(i)) for i in range(1, 23, 1)]
List_of_fragment_encoded_percentages = [item for sublist in Initial_list for item in sublist]
return List_of_fragment_encoded_percentages
def chromosomefragmentlist(
FragmentSize, ChromosomeNumber
): # Creares a list of fragment sizes for a specific chromosome.
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
Number_of_fragments = int(chromosomesize / FragmentSize)
chromosomefragmentlist = []
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
chromosomefragmentlist.append(str(((a))))
return chromosomefragmentlist
def GenomeFragmentGenerator(
FragmentSize
): # Creates the genome fragments for all chromosomes and adds them all to a list.
list = [chromosomefragmentlist(FragmentSize, i) for i in range(1, 23, 1)]
A = [item for sublist in list for item in sublist]
return A
def excelcreation(
mutationdata, samplelist, alpha, bravo, FragmentSize, A, B
): # Program runs sample alpha to bravo and then constructs excel table
data = {"GenomeFragments": A, "Encoded Base Percentage": B}
for i in range(alpha, bravo):
data.update({str(samplelist[i]): mutationdata[i]})
df = pd.DataFrame(data, index=A)
export_csv = df.to_csv(
r"C:/Users/js769/genomemutations/output/chromosomeAll.csv", index=None, header=True
)
start_time = time.time()
# Code determine base fragment size
FragmentSize = 1000000
chromosomesizes = [] # This calculates the base pair sizes for each chromosome.
for i in range(1, 23):
with open(r"C:\Users\js769\genomemutations\Input\ChromosomesVersion\chr" + str(i) + ".fa") as text_file:
text_data = text_file.read()
list = re.sub("\n", "", text_data)
if i < 10:
chromosomesizes.append(len(list[5:]))
else:
chromosomesizes.append(len(list[6:]))
wb = xlrd.open_workbook("C:/Users/js769/genomemutations/input/MutationSamples/Complete Sample For lungs.xlsx")
excelsheet = wb.sheet_by_index(0)
excelsheet.cell_value(0, 0)
sampleswithduplicates = [excelsheet.cell_value(i, 5) for i in range(1, excelsheet.nrows)]
samplelist = []
for sample in sampleswithduplicates:
if sample not in samplelist:
samplelist.append(int(sample)) # Constructs list of sample , each sample only comes up once
dictA = {}
counter = 1 # Creates a dictionary where it counts the
for sample in sampleswithduplicates:
dictA.update({counter: int(sample)})
counter = counter + 1
A = GenomeFragmentGenerator(FragmentSize)
B = FragmentEncodedPercentage(FragmentSize)
value = collections.namedtuple(
"value", ["vectx", "vecty", "samplelist", "dictA", "FragmentSize", "chromosomesizes"]
)
SampleValues = (
value(
vectx=0,
vecty=2,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=2,
vecty=4,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=4,
vecty=6,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=6,
vecty=8,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=8,
vecty=10,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=10,
vecty=12,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=12,
vecty=14,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=14,
vecty=16,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
)
print("starting multiprocessing")
if __name__ == "__main__":
with Pool(4) as p:
result = p.map(Main, SampleValues)
Allmutationdata = []
for i in result:
for b in i:
Allmutationdata.append(b)
excelcreation(Allmutationdata, samplelist, 0, 16, FragmentSize, A, B)
print("My program took " + str(time.time() - start_time) + " to run")
So the program runs; that isn't the issue. The issue is how long it takes to run. Can anyone spot where my code may be at fault?
This article, "How to make your pandas loop run 72,000x faster", has really resonated with me and I think it will help you.
It provides clear instructions on how to vectorize your for loops to drastically speed them up.
Methods to speed up a For Loop:
Utilize pandas iterrows()
~321 times faster
Example
for index, row in dataframe.iterrows():
    print(index, row)
Pandas Vectorization
~9280 times faster
Example
df.loc[((col1 == val1) & (col2 == val2)), column_name] = conditional_result
Numpy Vectorization
~72,000 times faster
Example
df.loc[((col1.values == val1) & (col2.values == val2)), column_name] = conditional_result
By adding .values we receive a numpy array.
Credit for the timing results goes to this article
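To make the comparison concrete, here is a small self-contained sketch on made-up data (the column names and condition are placeholders, not taken from your script); the vectorized assignment at the end produces the same flag column as the loop, just far faster:

import time
import numpy as np
import pandas as pd

# Hypothetical frame: 100,000 rows with two integer keys and a result column
df = pd.DataFrame({
    "col1": np.random.randint(0, 10, 100_000),
    "col2": np.random.randint(0, 10, 100_000),
})
df["flag"] = 0

# Slow: row-by-row Python loop
start = time.time()
for i, row in df.iterrows():
    if row["col1"] == 3 and row["col2"] == 7:
        df.at[i, "flag"] = 1
print("iterrows:", time.time() - start)

# Fast: one vectorized assignment over the underlying numpy arrays
start = time.time()
df.loc[(df["col1"].values == 3) & (df["col2"].values == 7), "flag"] = 1
print("vectorized:", time.time() - start)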

'Int' object has no attribute

I am trying to create a list of objects that holds data about professional golfers. The different data points are golfer name and putting percentages from different distances. I want to sort this list of objects by name once all the data has been entered for every player object. The list of these objects is called PlayerNumber. When I try to sort PlayerNumber by the attribute 'name', I get an error stating that 'int' has no attribute 'name', and I am not sure why PlayerNumber is being referred to as an integer and not a list.
Any help would be appreciated. Here is the code:
import operator
import numpy as np
import statistics
import matplotlib.pyplot as plt
from colour import Color
from bs4 import BeautifulSoup
import urllib3
############### ACCESS WEBPAGES ####################
def makeSoup(url):
http = urllib3.PoolManager()
response = http.request('GET', url)
soupdata = BeautifulSoup(response.data)
return soupdata
siteURL = []
for i in range(7):
siteURL.append(i)
siteURL[0] = ''
siteURL[1] = 'http://www.pgatour.com/stats/stat.408.html' #>25
siteURL[2] = 'http://www.pgatour.com/stats/stat.407.html' #20-25
siteURL[3] = 'http://www.pgatour.com/stats/stat.406.html' #15-20
siteURL[4] = 'http://www.pgatour.com/stats/stat.405.html' #10-15
siteURL[5] = 'http://www.pgatour.com/stats/stat.404.html' #5-10
siteURL[6] = 'http://www.pgatour.com/stats/stat.02427.html' #3-5
############### ACCESS TABLE DATA ###################
def row_number(soupdata):
for row in table.findAll('tr'):
tot_row = row
return tot_row
def parse_table(soupdata):
currRank = []
prevRank = []
playerName = []
rounds = []
pctMake = []
attempts = []
puttsMade = []
table = soupdata.find('tbody')
tot_row = 0
for row in table.findAll('tr'):
#for col in row.findAll('td'):
col = row.find_all('td')
#column_1 = col[0]
#currRank.append(column_1)
#column_2 = col[1]
#prevRank.append(column_2)
column_3 = col[2].text
column_3.strip()
playerName.append(column_3)
#column_4 = col[3]
#rounds.append(column_4)
column_5 = col[4].text
pctMake.append(column_5)
#column_6 = col[5]
#attempts.append(column_6)
#column_7 = col[6]
#puttsMade.append(column_7)
tot_row += 1
#return currRank, prevRank, playerName, rounds, pctMake, attempts, puttsMade
return playerName, pctMake, tot_row
"""
>25 ft: distance1
20-25 ft: distance2
15-20 ft: distance3
10-15 ft: distance4
5-10 ft: distance5
3-5 ft: distance6
"""
############### CLASS DEFINITION ###################
class Player:
id_list={}
def __init__(self,name, id, dis1=0.0, dis2=0.0, dis3=0.0, dis4=0.0, dis5=0.0, dis6=0.0):
self.name = name
self.dis1 = dis1
self.dis2 = dis2
self.dis3 = dis3
self.dis4 = dis4
self.dis5 = dis5
self.dis6 = dis6
self.id = id
Player.id_list[self.name] = self # save the id as key and self as he value
def addDis1(self,distance1):
self.dis1 = float(distance1)
def addDis2(self,distance2):
self.dis2 = float(distance2)
def addDis3(self,distance3):
self.dis3 = float(distance3)
def addDis4(self,distance4):
self.dis4 = float(distance4)
def addDis5(self,distance5):
self.dis5 = float(distance5)
def addDis6(self,distance6):
self.dis6 = float(distance6)
def displayPlayer(self):
print("Player: ", self.name, '\n'
">25 Ft %: ", self.dis1, '\n'
"20-25 Ft %: ", self.dis2, '\n'
"15-20 Ft %: ", self.dis3, '\n'
"10-15 Ft %: ", self.dis4, '\n'
"5-10 Ft %: ", self.dis5, '\n'
"3-5 Ft %: ", self.dis6, '\n')
#classmethod
def lookup_player_name_by_id(cls, name):
try:
return cls.id_list[name] # return the instance with the id
except KeyError: # error check for if id does not exist
raise KeyError("No user with id %s" % str(id))
############### DATA POPULATION ###################
PlayerNumber=[]
for i in range(0,195):
PlayerNumber.append(i)
for i in range(1,7):
soupdata = makeSoup(siteURL[i])
playerName, pctMake, tot_row = parse_table(soupdata)
for x in range(0,tot_row):
#PlayerNumber.append(x)
name = playerName[x]
name = name.replace("\xa0", " ")
name = name.replace("\n", "")
if i == 1:
PlayerNumber[x] = Player(name, x)
Player.addDis1(PlayerNumber[x],pctMake[x])
if i == 2:
val = Player.lookup_player_name_by_id(name)
Player.addDis2(PlayerNumber[val.id],pctMake[x])
if i == 3:
val = Player.lookup_player_name_by_id(name)
Player.addDis3(PlayerNumber[val.id],pctMake[x])
if i == 4:
val = Player.lookup_player_name_by_id(name)
Player.addDis4(PlayerNumber[val.id],pctMake[x])
if i == 5:
val = Player.lookup_player_name_by_id(name)
Player.addDis5(PlayerNumber[val.id],pctMake[x])
if i == 6:
val = Player.lookup_player_name_by_id(name)
Player.addDis6(PlayerNumber[val.id],pctMake[x])
PlayerNumber.sort(key = operator.attrgetter("name"))
#PlayerNumber[2].displayPlayer()
I'm using Python 3.4 with the Spyder IDE. As an FYI, I'm relatively new to Python.
Thanks!
It isn't that PlayerNumber is being referred to as an integer, but rather that PlayerNumber is a list of integers, and every element of that list (an integer) doesn't have an attribute "name", which sort() is trying to access (in order to sort them).
Edit:
To elaborate, the second to last line in your sample:
PlayerNumber.sort(key = operator.attrgetter("name"))
is trying to sort PlayerNumber using the key function operator.attrgetter("name"), which means it must call that function on each element of PlayerNumber to determine its place in the sorted list. That is why it ends up trying to grab a .name attribute from the integers in PlayerNumber.
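A minimal sketch of the usual fix, with made-up names: build the list from Player instances only (rather than pre-filling it with integers), and the attrgetter sort then works:

import operator

class Player:
    def __init__(self, name, id):
        self.name = name
        self.id = id

players = []                      # start empty instead of pre-filling with integers
players.append(Player("Jordan Spieth", 0))
players.append(Player("Adam Scott", 1))

players.sort(key=operator.attrgetter("name"))
print([p.name for p in players])  # ['Adam Scott', 'Jordan Spieth']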

error in Naive bayes classifier

I'm a beginner in machine learning and I'm trying to implement my first Naive Bayes classifier by myself for better understanding. I have a dataset from http://archive.ics.uci.edu/ml/datasets/Adult (American census data; the classes are '<=50K' and '>50K').
Here is my Python code:
#!/usr/bin/python
import sys
import csv
words_stats = {} # {'word': {'class1': cnt, 'class2': cnt'}}
words_cnt = 0
targets_stats = {} # {'class1': 3234, 'class2': 884} how many words in each class
class_stats = {} # {'class1': 7896, 'class2': 3034} how many lines in each class
items_cnt = 0
def train(dataset, targets):
global words_stats, words_cnt, targets_stats, items_cnt, class_stats
num = len(dataset)
for item in xrange(num):
class_stats[targets[item]] = class_stats.get(targets[item], 0) + 1
for i in xrange(len(dataset[item])):
word = dataset[item][i]
if not words_stats.has_key(word):
words_stats[word] = {}
tgt = targets[item]
cnt = words_stats[word].get(tgt, 0)
words_stats[word][tgt] = cnt + 1
targets_stats[tgt] = targets_stats.get(tgt, 0) + 1
words_cnt += 1
items_cnt = num
def classify(doc, tgt_set):
global words_stats, words_cnt, targets_stats, items_cnt
probs = {} #the probability itself P(c|W) = P(W|c) * P(c) / P(W)
pc = {} #probability of the class in document set P(c)
pwc = {} #probability of the word set in particular class. P(W|c)
pw = 1 #probability of the word set in documet set
for word in doc:
if word not in words_stats:
continue #dirty, very dirty
pw = pw * float(sum(words_stats[word].values())) / words_cnt
for tgt in tgt_set:
pc[tgt] = class_stats[tgt] / float(items_cnt)
for word in doc:
if word not in words_stats:
continue #dirty, very dirty
tgt_wrd_cnt = words_stats[word].get(tgt, 0)
pwc[tgt] = pwc.get(tgt, 1) * float(tgt_wrd_cnt) / targets_stats[tgt]
probs[tgt] = (pwc[tgt] * pc[tgt]) / pw
l = sorted(probs.items(), key = lambda i: i[1], reverse=True)
print probs
return l[0][0]
def check_results(dataset, targets):
num = len(dataset)
tgt_set = set(targets)
correct = 0
incorrect = 0
for item in xrange(num):
res = classify(dataset[item], tgt_set)
if res == targets[item]:
correct = correct + 1
else:
incorrect = incorrect + 1
print 'correct:', float(correct) / num, ' incorrect:', float(incorrect) / num
def load_data(fil):
data = []
tgts = []
reader = csv.reader(fil)
for line in reader:
d = [x.strip() for x in line]
if '?' in d:
continue
if not len(d):
continue
data.append(d[:-1])
tgts.append(d[-1:][0])
return data, tgts
if __name__ == '__main__':
if len(sys.argv) < 3:
print './program train_data.txt test_data.txt'
sys.exit(1)
filename = sys.argv[1]
fil = open(filename, 'r')
data, tgt = load_data(fil)
train(data, tgt)
test_file = open(sys.argv[2], 'r')
test_data, test_tgt = load_data(test_file)
check_results(test_data, tgt)
It gives ~61% correct results. When I print the probabilities I get the following:
{'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
but for a correct classifier I would expect the sum of both probabilities to equal 1.
At first I thought the problem was float underflow and tried to do all the calculations in logarithms, but the results were similar.
I understand that omitting some words will affect accuracy, but the probabilities are so far off.
What am I doing wrong, or what don't I understand?
For your convenience, I've uploaded the dataset and Python script here:
https://dl.dropboxusercontent.com/u/36180992/adult.tar.gz
Thank you for your help.
Naive Bayes doesn't compute a probability directly; rather, it computes a "raw score" that is compared to the other labels' scores in order to classify an instance. This score can easily be converted to a "probability" in the range [0, 1]:
total = sum(probs.itervalues())
for label, score in probs.iteritems():
    probs[label] = score / total
However, keep in mind this still doesn't represent a true probability, as mentioned in this answer:
naive Bayes tends to predict probabilities that are almost always either very close to zero or very close to one.
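Since you mentioned trying logarithms, here is a small sketch (with made-up log scores) of the same normalization done in log space via the log-sum-exp trick, which avoids the underflow you were worried about:

import math

log_scores = {'<=50K': -310.2, '>50K': -305.7}  # hypothetical log raw scores

m = max(log_scores.values())
log_total = m + math.log(sum(math.exp(s - m) for s in log_scores.values()))
probs = {label: math.exp(s - log_total) for label, s in log_scores.items()}

print(probs)                # approximately {'<=50K': 0.011, '>50K': 0.989}
print(sum(probs.values()))  # 1.0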
