Parallelize sorting algorithm using pyspark - python

Good morning, I have developed a simple merge-sort algorithm which I want to compare its performance when it is parallelized and not parallelized.
First, I am generating a list of numbers to sort and check how long it takes for the merge-sort to sort the list.
The next thing I want to do is passing the list of numbers into sc.parallelize() and converting the list to RDD followed by passing the merge-sort function into mapPartitions() and then collect().
import random
import time
from pyspark import SparkContext
def execute_merge_sort(generated_list):
start_time = time.time()
sorted_list = merge_sort(generated_list)
elapsed = time.time() - start_time
print('Simple merge sort: %f sec' % elapsed)
return sorted_list
def generate_list(length):
N = length
generated_list = [random.random() for num in range(N)]
return generated_list
def merging(left_side, right_side):
result = []
i = j = 0
while i < len(left_side) and j < len(right_side):
if left_side[i] <= right_side[j]:
result.append(left_side[i])
i += 1
else:
result.append(right_side[j])
j += 1
if i == len(left_side):
result.extend(right_side[j:])
else:
result.extend(left_side[i:])
return result
def merge_sort(generated_list):
if len(generated_list) <= 1:
return generated_list
middle_value = len(generated_list) // 2
sorted_list = merging(merge_sort(generated_list[:middle_value]), merge_sort(generated_list[middle_value:]))
return sorted_list
def is_sorted(num_array):
for i in range(1, len(num_array)):
if num_array[i] < num_array[i - 1]:
return False
return True
generate_list = generate_list(500000)
sorted_list = execute_merge_sort(generate_list)
sc = SparkContext()
rdd = sc.parallelize(generate_list).mapPartitions(execute_merge_sort).collect()
When I'm executing this sc.parallelize(generate_list).mapPartitions(execute_merge_sort).collect() I'm getting the following error:
File "<ipython-input-15-1b7974b4fa56>", line 7, in execute_merge_sort
File "<ipython-input-15-1b7974b4fa56>", line 36, in merge_sort
TypeError: object of type 'itertools.chain' has no len()
Any help would be appreciated. Thanks in advance.

I figured out how to solve the problem of TypeError: 'float' object is not iterable.
This can be solved by flattening the data using flatMap(lambda x: x) and calling glom() in order to wrap the list and make it executable by the function execute_merge_sort.
By executing the following line, the returned result is a list containing sorted lists.
sc.parallelize(random_list_of_lists).flatMap(lambda x: x).glom().mapPartitions(execute_merge_sort_rdd).collect()

Related

Process time, timeit()

I have several functions to create a list within a range. I'm using a time function I wrote, but it isn't timing the list functions I input. My list functions return the created list, currently. And the error is telling me, when I use time_it() that result cannot pass through.
# one of my list functions
def for_list(x):
x = range(x)
list_1 = []
for i in x:
i = str(i)
list_1 += i
return list_1
# timing function
def time_limit(tx):
start_time = process_time()
tx()
end_time = process_time()
time = (end_time - start_time)
print(f'{tx.__name__}, {time:.15f}')
SIZE = 10000
time_limit(for_list(SIZE))
Am I suppose to return something differently or is my time_limit() incorrect?
Inside the function time_limit() you are calling the for list twice.
It is called once when passed through and called again on the tx() line.
When removing that line it should look like this:
# one of my list functions
def for_list(x):
x = range(x)
list_1 = []
for i in x:
i = str(i)
list_1 += i
return list_1
# timing function
def time_limit(tx):
start_time = process_time()
end_time = process_time()
time = (end_time - start_time)
print(f'{tx.__name__}, {time:.15f}')
SIZE = 10000
time_limit(for_list(SIZE))

Why can't I sum these lists? It only returns the first answer, but if I print I can get it to print all answers instead? Thanks :)

This is the code that I have so far. I don't understand why it would work with print but not as a return function?
# Setup
import numpy as np
data_string = input("Enter elements of a list separated by space")
data = data_string.split()
# Function
def sumrescubed(data):
for i in range(len(data)):
data[i] = float(data[i])
data_sum = sum(data)
mean = sum(data) / len(data)
for i in range(1, len(data)):
answer_sum = sum([(data[i] - mean) ** 3])
return answer_sum
sumrescubed(data)
What you probably want to do is make answer_sum a list, and append each cube to it so that you can return the list of individual items (which are what you're seeing when you print(answer_sum) within the loop in your current code):
answer_sum = []
for i in data:
answer_sum.append((i - mean)**3)
return answer_sum
I'd suggest simplifying the whole thing by using comprehensions instead of iterating over the lists by index:
def sumrescubed(data):
nums = [float(i) for i in data]
mean = sum(nums) / len(nums)
return [(i - mean)**3 for i in nums]

Convert manager.dict() to list of tuples of form [[a,b,c],[q,w,e],[e,r,t].......]

I am using multiprocessing to increase the computation speed of my program for which I used
manager=Manager()
parallel_array_sites=manager.dict()
find_sites()
removal()
find_sites function is running properly
my removal function is
global array_sites
for i in parallel_array_sites:
array_sites.append(i)
#----not very relevant from here on-----
count = 0
remove_sites = {} # dictionary which contains index to remove sites
for i in range(len(array_sites)):
remove_sites[i] = 0
for i in range(len(array_sites)):
if remove_sites[i]:
continue
for j in range(len(array_sites)):
if(j > i and remove_sites[j] == 0):
x = array_sites[i][0] - array_sites[j][0]
y = array_sites[i][1] - array_sites[j][1]
z = array_sites[i][2] - array_sites[j][2]
r = math.sqrt(x*x + y*y + z*z)
if(r < (rmin/1.1)):
count = count + 1
remove_sites[j] = 1
print "after removel",len(array_sites)
#print remove_sites
count = 0
for key,val in remove_sites.iteritems():
if(val == 1):
del array_sites[key-count]
count = count + 1
The removal function requires me to use the tuples stored in
parallel_array_sites
as tuples in the list
array_sites
All the objects in parallel_array_list are tuples of 3 elements each
The number of entries can be fairly large which is why i don't want to specify the size while declaring a multiprocessing.list() instead.
The loop
for i in parallel_array_sites:
array_sites.append(i)
does not work and gives the following error:
File "/usr/lib/python2.7/multiprocessing/managers.py", line 774, in _callmethod
raise convert_to_error(kind, result)
KeyError: 1081
Require help with any kind of changes I can make
Used
for i in range(len(parallel_array_sites)):
array_sites.append(parallel_array_sites[i])
instead because
for i in parallel_array_sites:
does not work for a dictionary

Function for Simple Moving Average (SMA)

I have the following function for calculating SMA in python:
import numpy as np
def calcSma(data, smaPeriod):
sma = []
count = 0
for i in xrange(data.size):
if data[i] is None:
sma.append(None)
else:
count += 1
if count < smaPeriod:
sma.append(None)
else:
sma.append(np.mean(data[i-smaPeriod+1:i+1]))
return np.array(sma)
This function works, but I find it very little pythonic. I don't like the indexing and counting I'm doing, nor the way I have to append to the list and then turn it into a numpy array before I return it.
The reason I have to deal with all these None, is because I want to return an array at the same size as the input array. This makes it easier to plot and deal with on a general level later. I can easily do stuff such as this:
sma = calcSma(data=data, smaPeriod=20)
sma2 = calcSma(data=sma, smaPeriod=10)
plt.plot(data)
plt.plot(sma)
plt.plot(sma2)
plt.show()
So, any ideas on how this can be done prettier and more pythonic?
Pythonic enough I hope
import numpy as np
def calcSma(data, smaPeriod):
j = next(i for i, x in enumerate(data) if x is not None)
our_range = range(len(data))[j + smaPeriod - 1:]
empty_list = [None] * (j + smaPeriod - 1)
sub_result = [np.mean(data[i - smaPeriod + 1: i + 1]) for i in our_range]
return np.array(empty_list + sub_result)
Here is another implementation of moving average just using standard Python library:
from collections import deque
import itertools
def moving_average(iterable, n=3):
# http://en.wikipedia.org/wiki/Moving_average
it = iter(iterable)
# create an iterable object from input argument
d = deque(itertools.islice(it, n-1))
# create deque object by slicing iterable
d.appendleft(0)
s = sum(d)
for elem in it:
s += elem - d.popleft()
d.append(elem)
yield s / n
# example on how to use it
for i in moving_average([40, 30, 50, 46, 39, 44]):
print(i)
# 40.0
# 42.0
# 45.0
# 43.0

How to write this program into a for loop?

I'm trying to learn how to change this program into a for loop for the sake of knowing both ways
def Diff(a_list):
num = enumerate(max(x) - min(x) for x in a_list)
return max(x[::-1] for x in num)
I want it to be something like
def Diff(x):
for a in x
if it helps the program is intended to return the row that has the smallest sum of the elements inside it so like [[1,2,3,4],[-500],[10,20]] would be 1.
I do not understand why you use this name for your function, it does something else (as far as I understand). It searches for the inner-list inside a list for which the difference between min and max, the span, are maximal and the n returns a tuple (span, idx), idx being the index within the outer loop.
When you want to have the same as a loop, try:
def minRow_loop(a_list):
rv = (0,0)
for idx, row in enumerate(a_list):
span = max(row) - min(row)
span_and_idx = (span, idx)
if span_and_idx > rv:
rv = span_and_idx
return rv
But your code doesn't do what it'S intended to do, so I created two correct versions, once with and once without a loop.
import random
random.seed(12346)
def minRow(a_list):
num = enumerate(max(x) - min(x) for x in a_list)
return max(x[::-1] for x in num)
def minRow_loop(a_list):
rv = (0,0)
for idx, row in enumerate(a_list):
span = max(row) - min(row)
span_and_idx = (span, idx)
if span_and_idx > rv:
rv = span_and_idx
return rv
def minRow_correct(a_list):
return min(enumerate([sum(l) for l in a_list]),
key=lambda (idx, val): val)[0]
def minRow_correct_loop(a_list):
min_idx = 0
min_sum = 10e50
for idx, list_ in enumerate(a_list):
sum_ = sum(list_)
if sum_<min_sum:
min_idx = idx
min_sum = sum
return min_idx
li = [[random.random() for i in range(2)] for j in range(3)]
from pprint import pprint
print "Input:"
pprint(li)
print "\nWrong versions"
print minRow(li)
print minRow_loop(li)
which prints:
Input:
[[0.46318380478657073, 0.7396007585882016],
[0.38778699106140135, 0.7078233515518557],
[0.7453097328344933, 0.23853757442660117]]
Wrong versions
(0.5067721584078921, 2)
(0.5067721584078921, 2)
Corrected versions
2
2
What you want can actually be done in two lines of code:
# Let's take the list from your example
lst = [[1,2,3,4],[-500],[10,20]]
# Create a new list holding the sums of each sublist using a list comprehension
sums = [sum(sublst) for sublst in lst]
# Get the index of the smallest element
sums.index(min(sums)) # Returns: 1
if you're looking for minimum sum, just go through every row and keep track of the smallest:
def minRow(theList):
foundIndex = 0 # assume first element is the answer for now.
minimumSum = sum(theList[0])
for index, row in enumerate(theList):
if sum(row) < minimumSum:
foundIndex = index
minimumSum = sum(row) # you don't have to sum() twice, but it looks cleaner
return foundIndex
If your looking for greatest range (like the first Diff() function), it'd be similar. You'd keep track of the greatest range and return its index.
Thorsten's answer is very complete. But since I finished this anyway, I'm submitting my "dumbed down" version in case it helps you understand.

Categories

Resources