Bad performance of sorting algorithms implemented - python

I have implemented some sorting algorithms including Insertion, Selection, Shell, two kinds of Merge. I found that the performance of my implements didn't accord with the description of Algorithms(4th).
For example, here are two kinds of Merge sorting. When sorting a list contains 100,000 elements, Merge1 takes about 0.6s, and Merge2 takes about 50+s. But Merge2 is almost the same as the one in Algorithms(4th) except I use python. I can't figure it out why Merge2 is so slow and how to improve it. Can somebody help me? Thanks!
class Merge1:
def merge(self, a, b):
i = 0; j = 0
res = []
while i < len(a) and j < len(b):
if a[i] < b[j]:
res.append(a[i])
i = i + 1
else:
res.append(b[j])
j = j + 1
res = res + a[i:] + b[j:]
return res
def sort(self, source):
if len(source) <= 1:
return source
half = len(source) // 2
left = self.sort(source[:half])
right = self.sort(source[half:])
retval = self.merge(left, right)
return retval
def is_sort(self, source):
length = len(source)
for i in range(0, length-1):
if source[i] > source[i+1]:
return False
return True
class Merge2:
def merge(self, source, lo, mid ,hi):
i = lo
j = mid + 1
aux = source[:]
k = lo
while k <= hi:
if i > mid:
source[k] = aux[j]
j = j + 1
elif j > hi:
source[k] = aux[i]
i = i + 1
elif aux[i] < aux[j]:
source[k] = aux[i]
i = i + 1
else:
source[k] = aux[j]
j = j + 1
k = k+1
def sort(self, source):
sz = 1
N = len(source)
while sz < N:
for lo in range(0, N-sz, sz+sz):
# pdb.set_trace()
self.merge(source, lo, lo+sz-1, min(lo+sz+sz-1, N-1))
sz = sz + sz
def is_sort(self, source):
length = len(source)
for i in range(0, length-1):
if source[i] > source[i+1]:
return False
return True
Here is the implement in Algorithms:
Here is the test code:
merge1 = Merge1()
source = np.random.randint(100000, size=100000).tolist()
start = time.time()
merge1.sort(source)
end = time.time()
print("Merge1 takes: {}s".format(end-start))
merge2 = Merge2()
source = np.random.randint(100000, size=100000).tolist()
start = time.time()
merge2.sort(source)
end = time.time()
print("Merge2 takes: {}s".format(end-start))
result:
E:>python sort.py
Merge1 takes: 0.6376256942749023s
Merge2 takes: 57.99568271636963s

Consider this modification. According to my quick tests, it improved the performance considerably (from nearly one minute down to less than 1 second). The main performance gain comes from avoiding to create that many copies of the whole list. The other alterations only increase performance marginally.
According to a simple comparison of the sum it should not mess up the list, but you should do some more tests if you like to use it.
class Merge4:
def merge(self, source, aux, lo, mid ,hi):
i = lo
j = mid + 1
a_j= aux[j]
a_i= aux[i]
k = lo
while k <= hi:
if i > mid:
source[k] = a_j
j += 1
a_j= aux[j]
elif j > hi:
source[k] = a_i
i += 1
a_i= aux[i]
elif a_i < a_j:
source[k] = a_i
i += 1
a_i= aux[i]
else:
source[k] = a_j
j += 1
a_j= aux[j]
k += 1
# update the aux array for the next call
aux[lo:hi+1]= source[lo:hi+1]
def sort(self, source):
sz = 1
N = len(source)
while sz < N:
sz_2= sz * 2
# create the aux array, that will be maintained continuously
# and add one extra None, so the "prefetching" works also
# during the last iteration (refering to a_i and a_j)
aux= source[:]
aux.append(None)
for lo in range(0, N-sz, sz_2):
# pdb.set_trace()
self.merge(source, aux, lo, lo+sz-1, min(lo+sz_2-1, N-1))
sz = sz_2
def is_sort(self, source):
length = len(source)
for i in range(0, length-1):
if source[i] > source[i+1]:
return False
return True

Related

sorting orders with mergesort incorrect output

I have to design an algorithm to sort a list of orders by selection time (t selection, finding the good in the warehouse and bringing it to the surface) plus shipping time (t shipping, constant). The customer orders can be retrieved (in the same order as placed) from a server database. You should expect between 100-10K elements.
The program takes as input a data-set of orders where the id, t selection, and t shipping are of type unsigned int, n is the number of orders and a space character.
id1, t selection1, t shipping1; ...; idn, t selectionn, t shippingn \n
The expected output is a space-separated list of the ids, sorted by t selection + t shipping and terminated by a new line \n.
Input: 1, 500, 100; 2, 700, 100; 3, 100, 100\n
Output: 3 1 2\n
I am trying to do it with merge sort, however my program returns
1 2 3/n instead of 3 1 2/n
I have provided my code below, could anyone help me out?
#!/usr/bin/env python3
import sys
class Order:
def __init__(self, id: int, selection_time: int, shipping_time: int):
self.id: int = id
self.selection_time: int = selection_time
self.shipping_time: int = shipping_time
def merge(left, right):
if not len(left) or not len(right):
return left or right
result = []
i, j = 0, 0
while len(result) < len(left) + len(right):
if left[i].shipping_time + left[i].selection_time < right[j].shipping_time + right[j].selection_time:
result.append(left[i])
i += 1
else:
result.append(right[j])
j += 1
if i == len(left) or j == len(right):
result.extend(left[i:] or right[j:])
break
return result
def sort(list):
if len(list) < 2:
return list
middle = int(len(list) / 2)
left = sort(list[:middle])
right = sort(list[middle:])
return merge(left, right)
if __name__ == '__main__':
'''
Retrieves and splits the input
'''
data = input()
data = data.split('; ')
order_list = []
for d in data:
id, selection_t, shipping_t = d.split(', ', 2)
order: Order = Order(int(id), int(selection_t), int(shipping_t))
order_list.append(order)
sort(order_list)
for order in order_list:
sys.stdout.write(str(order.id))
sys.stdout.write(" ")
The simplest (and probably least efficient) sorting algorithm is the Bubble sort. But the question says nothing about performance so it can be simplified like this:
class Order:
def __init__(self, ident, selection_time, shipping_time):
self._ident = ident
self._selection_time = selection_time
self._shipping_time = shipping_time
#property
def selection_time(self):
return self._selection_time
#property
def shipping_time(self):
return self._shipping_time
#property
def ident(self):
return self._ident
def merge(lst):
def comboval(order):
return order.selection_time + order.shipping_time
if len(lst) > 1:
mid = len(lst) // 2
left = lst[:mid]
right = lst[mid:]
merge(left)
merge(right)
i = j = k = 0
while i < len(left) and j < len(right):
if comboval(left[i]) < comboval(right[j]):
lst[k] = left[i]
i += 1
else:
lst[k] = right[j]
j += 1
k += 1
for _i in range(i, len(left)):
lst[k] = left[_i]
k += 1
for _j in range(j, len(right)):
lst[k] = right[_j]
k += 1
return lst
inval = '1, 500, 100; 2, 700, 100; 3, 100, 100'
orderlist = []
for order in inval.split(';'):
orderlist.append(Order(*map(int, order.split(','))))
print(*[order.ident for order in merge(orderlist)])
Output:
3 1 2
Note:
This is an in-place sort

Python decorator to time recursive functions properly

I am working in a piece of code for studying purposes, and i want to compare the time required to sort a list using different algorithms. I tried using a decorator but since the mergeSort function is recursive, it gives me the result for each recursion. I want to find a way to summarize the result, if possible. Since i'm very new to decorators i'm not sure what could be done in that case. Is there a way to achieve that goal using a decorator?
import random
import functools
import time
def timeIt(func):
#functools.wraps(func)
def newfunc(*args, **kwargs):
startTime = time.time()
func(*args, **kwargs)
elapsedTime = time.time() - startTime
print('function [{}] finished in {} ms'.format(
func.__name__, int(elapsedTime * 1000)))
return newfunc
#timeIt
def mergeSort(L):
if len(L) > 1:
mid = len(L) // 2
left = L[:mid]
right = L[mid:]
mergeSort(left)
mergeSort(right)
i = j = k = 0
while i < len(left) and j < len(right):
if left[i] < right[j]:
L[k] = left[i]
i += 1
else:
L[k] = right[j]
j += 1
k += 1
while i < len(left):
L[k] = left[i]
i += 1
k += 1
while j < len(right):
L[k] = right[j]
j += 1
k += 1
#timeIt
def selectionSort(L):
for fillslot in range(len(L) - 1, 0, -1):
maxpos = 0
for location in range(1, fillslot + 1):
if L[location] > L[maxpos]:
maxpos = location
temp = L[fillslot]
L[fillslot] = L[maxpos]
L[maxpos] = temp
randomList = random.sample(range(10000), 10000)
mergeSort(randomList.copy())
selectionSort(randomList.copy())
Output:
[...] truncated
function [mergeSort] finished in 7 ms
function [mergeSort] finished in 15 ms
function [mergeSort] finished in 33 ms
function [mergeSort] finished in 68 ms
function [selectionSort] finished in 2049 ms
You can set an attribute (_entered in the example) on the wrapper function as a flag so that it can tell that it is inside a recursive call if the attribute is set:
def timeIt(func):
#functools.wraps(func)
def newfunc(*args, **kwargs):
if not hasattr(newfunc, '_entered'): # enter only if _entered is not set
newfunc._entered = True # set _entered
startTime = time.time()
func(*args, **kwargs)
elapsedTime = time.time() - startTime
print('function [{}] finished in {} ms'.format(
func.__name__, int(elapsedTime * 1000)))
del newfunc._entered # remove _entered
return newfunc
You could just wrap it with another function...
import random
import functools
import time
def timeIt(func):
#functools.wraps(func)
def newfunc(*args, **kwargs):
startTime = time.time()
func(*args, **kwargs)
elapsedTime = time.time() - startTime
print('function [{}] finished in {} ms'.format(
func.__name__, int(elapsedTime * 1000)))
return newfunc
def mergeSort(L):
if len(L) > 1:
mid = len(L) // 2
left = L[:mid]
right = L[mid:]
mergeSort(left)
mergeSort(right)
i = j = k = 0
while i < len(left) and j < len(right):
if left[i] < right[j]:
L[k] = left[i]
i += 1
else:
L[k] = right[j]
j += 1
k += 1
while i < len(left):
L[k] = left[i]
i += 1
k += 1
while j < len(right):
L[k] = right[j]
j += 1
k += 1
def selectionSort(L):
for fillslot in range(len(L) - 1, 0, -1):
maxpos = 0
for location in range(1, fillslot + 1):
if L[location] > L[maxpos]:
maxpos = location
temp = L[fillslot]
L[fillslot] = L[maxpos]
L[maxpos] = temp
#timeIt
def timedSelectionSort(L):
selectionSort(L)
#timeIt
def timedMergeSort(L):
mergeSort(L)
randomList = random.sample(range(10000), 10000)
timedSelectionSort(randomList.copy())
timedMergeSort(randomList.copy())

Merge Sort in Python - IndexError: list assignment index out of range

I'm trying to implement merge sort in Python, but I keep getting the same mistake:
IndexError: list assignment index out of range. I based my code on the Java code by Sedgewick and Wayne found here: https://algs4.cs.princeton.edu/22mergesort/Merge.java.html
Am I doing something wrong with the auxiliar array while trying to pass the items from vetorNum to vetorAux? What is it then?
vetorAux = [] * len(vetorNum)
nTrocas = 0
mergeSort(vetorNum, vetorAux, 0, len(vetorNum)-1)
def mergeSort(vetorNum, vetorAux, i, f):
if(f > i):
m = (i+f)//2
mergeSort(vetorNum, vetorAux, i, m)
mergeSort(vetorNum, vetorAux, m+1, f)
intercala(vetorNum, vetorAux, i, m, f)
def intercala(vetorNum, vetorAux, i, m, f):
global nTrocas
if(i == f):
return
k = i
while(k <= f):
vetorAux[k] = vetorNum[k]
k += 1
i_aux = i
m_aux = m+1
k = i
while(k <= f):
if(i_aux > m):
vetorNum[k] = vetorAux[m_aux]
m_aux += 1
elif(m_aux > f):
vetorNum[k] = vetorAux[i_aux]
i_aux += 1
elif(vetorAux[i_aux] > vetorAux[m_aux]):
vetorNum[k] = vetorAux[m_aux]
m_aux += 1
nTrocas += 1
else:
vetorNum[k] = vetorAux[i_aux]
i_aux += 1
k += 1

Recursive algorithm works without return statement? (Runs really fast)

I tried to implement merge-sort in Python. Somehow this piece of code runs correctly (and pretty fast), but I don't know why: There is no return-statement in mergeSort()
from sys import stdin
def mergeSort(A):
if len(A) > 1:
m = int(len(A)/2)
L = A[:m]
R = A[m:]
mergeSort(L)
mergeSort(R)
l = 0
r = 0
a = 0
while l < len(L) and r < len(R):
if L[l] < R[r]:
A[a] = L[l]
l += 1
else:
A[a] = R[r]
r += 1
a += 1
while l < len(L):
A[a] = L[l]
l += 1
a += 1
while r < len(R):
A[a] = R[r]
r += 1
a += 1
def main():
A = []
for line in stdin:
A.append(int(line))
mergeSort(A)
print(A)
if __name__ == "__main__":
main()

Why do I get a TypeError when I call cProfile.run()?

import math
import random
import cProfile
import pstats
from goody import irange
def partition(alist, left, right):
def swap(i,j): alist[i],alist[j] = alist[j],alist[i]
pivot = alist[right]
i = left
for j in range(left,right):
if alist[j] <= pivot:
swap(i,j)
i += 1
swap(i,right)
return I
def select(alist, n):
left,right = 0, len(alist)-1
while True:
if left == right:
return alist[left]
pivot_index = partition(alist, left, right)
if n == pivot_index:
return alist[n]
elif n < pivot_index:
right = pivot_index - 1
else:
left = pivot_index + 1
def closest_2d(alist):
def dist(p1,p2): return math.sqrt( (p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)
def min_none(*args): return min([x for x in args if x != None])
if len(alist) < 2:
return None # +infinity
if len(alist) == 2:
return dist(alist[0],alist[1])
m = select([x for (x,_) in alist],len(alist)//2)
s1,s2,s3 = [],[],[]
for v in alist:
if v[0] == m:
s3.append(v)
else:
(s1 if v[0] < m else s2).append(v)
if s1 == []:
s1.append(s3[0])
s2.extend(s3[1:])
else:
s2.append(s3[0])
s1.extend(s3[1:])
d1 = closest_2d(s1)
d2 = closest_2d(s2)
d = min_none(d1,d2)
s1.sort(key = lambda p : p[1])
s2.sort(key = lambda p : p[1])
i,j = 0,0
d3 = None # +infinity
while True:
while i != len(s1) and j != len(s2) and abs(s1[i][1]-s2[j][1]) > d:
if s1[i][1] < s2[j][1]:
i += 1
else:
j += 1
if i == len(s1) or j ==len(s2):
break;
j1 = j
while j1 < len(s2) and abs(s1[i][1]-s2[j1][1]) < d:
if d3 == None or dist(s1[i],s2[j1]) < d3:
d3 = dist(s1[i],s2[j1])
j1 += 1
i += 1
return min_none(d1,d2,d3)
# My code
a = []
for i in range(128000):
a.append((random.random,random.random))
cProfile.run('closest_2d(a)')
I am trying to write a script that uses the cProfile module to profile all the functions called when the closest_2d function is run first on a random list with 128,000 coordinate. Generate the random list, and then call cProfile.run so that it runs closest_2d on that list; also specify a second argument, which is the file to put the results in (and the file on which to call pstats.Stats) to print the results.
I got the following error:
Traceback (most recent call last):
cProfile.run('closest_2d(a)')
return _pyprofile._Utils(Profile).run(statement, filename, sort)
prof.run(statement)
return self.runctx(cmd, dict, dict)
exec(cmd, globals, locals)
m = select([x for (x,_) in alist],len(alist)//2)
pivot_index = partition(alist, left, right)
if alist[j] <= pivot:
TypeError: unorderable types: builtin_function_or_method() <= builtin_function_or_method()
How can I fix it?
Your question title is misleading, cProfile is being called since it's right at the top of your stack trace.
The problem is, you're getting a TypeError because you're trying to compare a two functions instead of their return values on this line:
if alist[j] <= pivot:
This is because your didn't call random.random() when you were populating your list, instead you put random.random. This places a reference to the random function rather than a random value.
a.append((random.random,random.random))
Should be:
a.append((random.random(), random.random()))

Categories

Resources