Related
I have a numpy array with these values:
[10620.5, 11899., 11879.5, 13017., 11610.5]
import Numpy as np
array = np.array([10620.5, 11899, 11879.5, 13017, 11610.5])
I would like to get values that are "close" (in this instance, 11899 and 11879) and average them, then replace them with a single instance of the new number resulting in this:
[10620.5, 11889, 13017, 11610.5]
the term "close" would be configurable. let's say a difference of 50
the purpose of this is to create Spans on a Bokah graph, and some lines are just too close
I am super new to python in general (a couple weeks of intense dev)
I would think that I could arrange the values in order, and somehow grab the one to the left, and right, and do some math on them, replacing a match with the average value. but at the moment, I just dont have any idea yet.
Try something like this, I added a few extra steps, just to show the flow:
the idea is to group the data into adjacent groups, and decide if you want to group them or not based on how spread they are.
So as you describe you can combine you data in sets of 3 nummbers and if the difference between the max and min numbers are less than 50 you average them, otherwise you leave them as is.
import pandas as pd
import numpy as np
arr = np.ravel([1,24,5.3, 12, 8, 45, 14, 18, 33, 15, 19, 22])
arr.sort()
def reshape_arr(a, n): # n is number of consecutive adjacent items you want to compare for averaging
hold = len(a)%n
if hold != 0:
container = a[-hold:] #numbers that do not fit on the array will be excluded for averaging
a = a[:-hold].reshape(-1,n)
else:
a = a.reshape(-1,n)
container = None
return a, container
def get_mean(a, close): # close = how close adjacent numbers need to be, in order to be averaged together
my_list=[]
for i in range(len(a)):
if a[i].max()-a[i].min() > close:
for j in range(len(a[i])):
my_list.append(a[i][j])
else:
my_list.append(a[i].mean())
return my_list
def final_list(a, c): # add any elemts held in the container to the final list
if c is not None:
c = c.tolist()
for i in range(len(c)):
a.append(c[i])
return a
arr, container = reshape_arr(arr,3)
arr = get_mean(arr, 5)
final_list(arr, container)
You could use fuzzywuzzy here to gauge the ratio of cloesness between 2 data sets.
See details here: http://jonathansoma.com/lede/algorithms-2017/classes/fuzziness-matplotlib/fuzzing-matching-in-pandas-with-fuzzywuzzy/
Taking Gustavo's answer and tweaking it to my needs:
def reshape_arr(a, close):
flag = True
while flag is not False:
array = a.sort_values().unique()
l = len(array)
flag = False
for i in range(l):
previous_item = next_item = None
if i > 0:
previous_item = array[i - 1]
if i < (l - 1):
next_item = array[i + 1]
if previous_item is not None:
if abs(array[i] - previous_item) < close:
average = (array[i] + previous_item) / 2
flag = True
#find matching values in a, and replace with the average
a.replace(previous_item, value=average, inplace=True)
a.replace(array[i], value=average, inplace=True)
if next_item is not None:
if abs(next_item - array[i]) < close:
flag = True
average = (array[i] + next_item) / 2
# find matching values in a, and replace with the average
a.replace(array[i], value=average, inplace=True)
a.replace(next_item, value=average, inplace=True)
return a
this will do it if I do something like this:
candlesticks['support'] = reshape_arr(supres_df['support'], 150)
where candlesticks is the main DataFrame that I am using and supres_df is another DataFrame that I am massaging before I apply it to the main one.
it works, but is extremely slow. I am trying to optimize it now.
I added a while loop because after averaging, the averages can become close enough to average out again, so I will loop again, until it doesn't need to average anymore. This is total newbie work, so if you see something silly, please comment.
I am looking for an optimization algorithm that takes a text file encoded with 0s, 1s, and -1s:
1's denoting target cells that requires Wi-Fi coverage
0's denoting cells that are walls
1's denoting cells that are void (do not require Wi-Fi coverage)
Example of text file:
I have created a solution function along with other helper functions, but I can't seem to get the optimal positions of the routers to be placed to ensure proper coverage. There is another file that does the printing, I am struggling with finding the optimal location. I basically need to change the get_random_position function to get the optimal one, but I am unsure how to do that. The area covered by the various routers are:
This is the kind of output I am getting:
Each router covers a square area of at most (2S+1)^2
Type 1: S=5; Cost=180
Type 2: S=9; Cost=360
Type 3: S=15; Cost=480
My code is as follows:
import numpy as np
import time
from random import randint
def is_taken(taken, i, j):
for coords in taken:
if coords[0] == i and coords[1] == j:
return True
return False
def get_random_position(floor, taken , nrows, ncols):
i = randint(0, nrows-1)
j = randint(0, ncols-1)
while floor[i][j] == 0 or floor[i][j] == -1 or is_taken(taken, i, j):
i = randint(0, nrows-1)
j = randint(0, ncols-1)
return (i, j)
def solution(floor):
start_time = time.time()
router_types = [1,2,3]
nrows, ncols = floor.shape
ratio = 0.1
router_scale = int(nrows*ncols*0.0001)
if router_scale == 0:
router_scale = 1
row_ratio = int(nrows*ratio)
col_ratio = int(ncols*ratio)
print('Row : ',nrows, ', Col: ', ncols, ', Router scale :', router_scale)
global_best = [0, ([],[],[])]
taken = []
while True:
found_better = False
best = [global_best[0], (list(global_best[1][0]), list(global_best[1][1]), list(global_best[1][2]))]
for times in range(0, row_ratio+col_ratio):
if time.time() - start_time > 27.0:
print('Time ran out! Using what I got : ', time.time() - start_time)
return global_best[1]
fit = []
for rtype in router_types:
interim = (list(global_best[1][0]), list(global_best[1][1]), list(global_best[1][2]))
for i in range(0, router_scale):
pos = get_random_position(floor, taken, nrows, ncols)
interim[0].append(pos[0])
interim[1].append(pos[1])
interim[2].append(rtype)
fit.append((fitness(floor, interim), interim))
highest_fitness = fit[0]
for index in range(1, len(fit)):
if fit[index][0] > highest_fitness[0]:
highest_fitness = fit[index]
if highest_fitness[0] > best[0]:
best[0] = highest_fitness[0]
best[1] = (highest_fitness[1][0],highest_fitness[1][1], highest_fitness[1][2])
found_better = True
global_best = best
taken.append((best[1][0][-1],best[1][1][-1]))
break
if found_better == False:
break
print('Best:')
print(global_best)
end_time = time.time()
run_time = end_time - start_time
print("Run Time:", run_time)
return global_best[1]
def available_cells(floor):
available = 0
for i in range(0, len(floor)):
for j in range(0, len(floor[i])):
if floor[i][j] != 0:
available += 1
return available
def fitness(building, args):
render = np.array(building, dtype=int, copy=True)
cov_factor = 220
cost_factor = 22
router_types = { # type: [coverage, cost]
1: {'size' : 5, 'cost' : 180},
2: {'size' : 9, 'cost' : 360},
3: {'size' : 15, 'cost' : 480},
}
routers_used = args[-1]
for r, c, t in zip(*args):
size = router_types[t]['size']
nrows, ncols = render.shape
rows = range(max(0, r-size), min(nrows, r+size+1))
cols = range(max(0, c-size), min(ncols, c+size+1))
walls = []
for ri in rows:
for ci in cols:
if building[ri, ci] == 0:
walls.append((ri, ci))
def blocked(ri, ci):
for w in walls:
if min(r, ri) <= w[0] and max(r, ri) >= w[0]:
if min(c, ci) <= w[1] and max(c, ci) >= w[1]:
return True
return False
for ri in rows:
for ci in cols:
if blocked(ri, ci):
continue
if render[ri, ci] == 2:
render[ri, ci] = 4
if render[ri, ci] == 1:
render[ri, ci] = 2
render[r, c] = 5
return (
cov_factor * np.sum(render > 1) -
cost_factor * np.sum([router_types[x]['cost'] for x in routers_used])
)
Here's a suggestion on how to solve the problem; however I don't affirm this is the best approach, and it's certainly not the only one.
Main idea
Your problem can be modelised as a weighted minimum set cover problem.
Good news, this is a well known optimization problem:
It is easy to find algorithm descriptions for approximate solutions
A quick search on the web shows many implementations of approximation algorithms in Python.
Bad news, this is a NP-hard optimization problem:
If you need an exact solution: algorithms will work only for "small" sized problems in a reasonable amount of time(in your case: size of the problem <=> number of "1" cells).
Approximate (a.k.a greedy) algorithms are trade-off between computation requirements, and a risk do deliver far from optimal solutions in certain cases.
Note that the following part does not prove that your problem is NP-hard. The general minimum set cover problem is NP-hard. In your case the subsets have several properties that might help to design a better algorithm. I have no idea how though.
Translating into a cover set problem
Let's define some sets:
U: the set of "1" cells (requiring Wifi).
P(U): the power set of U (the set of subsets of U).
P: the set of cells on which you can place a router (not sure if P=U in your original post).
T: the set of router type (3 values in your case).
R+: positive Real number (used to describe prices).
Let's define a function (pseudo Python):
# Domain of definition : T,P --> R+,P(U)
# This function takes a router type and a position, and returns
# a tuple containing:
# - the price of a router of the given type.
# - the subset of U containing all the position covered by a router
# of the given type placed at the given position.
def weighted_subset(routerType, position):
pass # TODO: implementation
Now, we define a last set, as the image of the function we've just described: S=weighted_subset(T,P). Each element of this set is a subset of U, weighted by a price in R+.
With all this formalism, finding the router types & positions that:
gives coverage to all the desirable locations
minimize the cost
Is equivalent to finding a sub-collection of S:
whose union of their P(U) is equal to U
which minimise the sum of the associated weights
Which is the weighted minimal set cover problem.
Working on below problem,
Problem,
Given a m * n grids, and one is allowed to move up or right, find the different paths between two grid points.
I write a recursive version and a dynamic programming version, but they return different results, and any thoughts what is wrong?
Source code,
from collections import defaultdict
def move_up_right(remaining_right, remaining_up, prefix, result):
if remaining_up == 0 and remaining_right == 0:
result.append(''.join(prefix[:]))
return
if remaining_right > 0:
prefix.append('r')
move_up_right(remaining_right-1, remaining_up, prefix, result)
prefix.pop(-1)
if remaining_up > 0:
prefix.append('u')
move_up_right(remaining_right, remaining_up-1, prefix, result)
prefix.pop(-1)
def move_up_right_v2(remaining_right, remaining_up):
# key is a tuple (given remaining_right, given remaining_up),
# value is solutions in terms of list
dp = defaultdict(list)
dp[(0,1)].append('u')
dp[(1,0)].append('r')
for right in range(1, remaining_right+1):
for up in range(1, remaining_up+1):
for s in dp[(right-1,up)]:
dp[(right,up)].append(s+'r')
for s in dp[(right,up-1)]:
dp[(right,up)].append(s+'u')
return dp[(right, up)]
if __name__ == "__main__":
result = []
move_up_right(2,3,[],result)
print result
print '============'
print move_up_right_v2(2,3)
In version 2 you should be starting your for loops at 0 not at 1. By starting at 1 you are missing possible permutations where you traverse the bottom row or leftmost column first.
Change version 2 to:
def move_up_right_v2(remaining_right, remaining_up):
# key is a tuple (given remaining_right, given remaining_up),
# value is solutions in terms of list
dp = defaultdict(list)
dp[(0,1)].append('u')
dp[(1,0)].append('r')
for right in range(0, remaining_right+1):
for up in range(0, remaining_up+1):
for s in dp[(right-1,up)]:
dp[(right,up)].append(s+'r')
for s in dp[(right,up-1)]:
dp[(right,up)].append(s+'u')
return dp[(right, up)]
And then:
result = []
move_up_right(2,3,[],result)
set(move_up_right_v2(2,3)) == set(result)
True
And just for fun... another way to do it:
from itertools import permutations
list(map(''.join, set(permutations('r'*2+'u'*3, 5))))
The problem with the dynamic programming version is that it doesn't take into account the paths that start from more than one move up ('uu...') or more than one move right ('rr...').
Before executing the main loop you need to fill dp[(x,0)] for every x from 1 to remaining_right+1 and dp[(0,y)] for every y from 1 to remaining_up+1.
In other words, replace this:
dp[(0,1)].append('u')
dp[(1,0)].append('r')
with this:
for right in range(1, remaining_right+1):
dp[(right,0)].append('r'*right)
for up in range(1, remaining_up+1):
dp[(0,up)].append('u'*up)
I would like to query the value of an exponentially weighted moving average at particular points. An inefficient way to do this is as follows. l is the list of times of events and queries has the times at which I want the value of this average.
a=0.01
l = [3,7,10,20,200]
y = [0]*1000
for item in l:
y[int(item)]=1
s = [0]*1000
for i in xrange(1,1000):
s[i] = a*y[i-1]+(1-a)*s[i-1]
queries = [23,68,103]
for q in queries:
print s[q]
Outputs:
0.0355271185019
0.0226018371526
0.0158992102478
In practice l will be very large and the range of values in l will also be huge. How can you find the values at the times in queries more efficiently, and especially without computing the potentially huge lists y and s explicitly. I need it to be in pure python so I can use pypy.
Is it possible to solve the problem in time proportional to len(l)
and not max(l) (assuming len(queries) < len(l))?
Here is my code for doing this:
def ewma(l, queries, a=0.01):
def decay(t0, x, t1, a):
from math import pow
return pow((1-a), (t1-t0))*x
assert l == sorted(l)
assert queries == sorted(queries)
samples = []
try:
t0, x0 = (0.0, 0.0)
it = iter(queries)
q = it.next()-1.0
for t1 in l:
# new value is decayed previous value, plus a
x1 = decay(t0, x0, t1, a) + a
# take care of all queries between t0 and t1
while q < t1:
samples.append(decay(t0, x0, q, a))
q = it.next()-1.0
# take care of all queries equal to t1
while q == t1:
samples.append(x1)
q = it.next()-1.0
# update t0, x0
t0, x0 = t1, x1
# take care of any remaining queries
while True:
samples.append(decay(t0, x0, q, a))
q = it.next()-1.0
except StopIteration:
return samples
I've also uploaded a fuller version of this code with unit tests and some comments to pastebin: http://pastebin.com/shhaz710
EDIT: Note that this does the same thing as what Chris Pak suggests in his answer, which he must have posted as I was typing this. I haven't gone through the details of his code, but I think mine is a bit more general. This code supports non-integer values in l and queries. It also works for any kind of iterables, not just lists since I don't do any indexing.
I think you could do it in ln(l) time, if l is sorted. The basic idea is that the non recursive form of EMA is a*s_i + (1-a)^1 * s_(i-1) + (1-a)^2 * s_(i-2) ....
This means for query k, you find the greatest number in l less than k, and for a estimation limit, use the following, where v is the index in l, l[v] is the value
(1-a)^(k-v) *l[v] + ....
Then, you spend lg(len(l)) time in search + a constant multiple for the depth of your estimation. I'll provide a code sample in a little bit (after work) if you want it, just wanted to get my idea out there while I was thinking about it
here's the code -
v is the dictionary of values at a given time; replace with 1 if it's just a 1 every time...
import math
from bisect import bisect_right
a = .01
limit = 1000
l = [1,5,14,29...]
def find_nearest_lt(l, time):
i = bisect_right(a, x)
if i:
return i-1
raise ValueError
def find_ema(l, time):
i = find_nearest_lt(l, time)
if l[i] == time:
result = a * v[l[i]
i -= 1
else:
result = 0
while (time-l[i]) < limit:
result += math.pow(1-a, time-l[i]) * v[l[i]]
i -= 1
return result
if I'm thinking correctly, the find nearest is l(n), then the while loop is <= 1000 iterations, guaranteed, so it's technically a constant (though a kind of large one). find_nearest was stolen from the page on bisect - http://docs.python.org/2/library/bisect.html
It appears that y is a binary value -- either 0 or 1 -- depending on the values of l. Why not use y = set(int(item) for item in l)? That's the most efficient way to store and look up a list of numbers.
Your code will cause an error the first time through this loop:
s = [0]*1000
for i in xrange(1000):
s[i] = a*y[i-1]+(1-a)*s[i-1]
because i-1 is -1 when i=0 (first pass of loop) and both y[-1] and s[-1] are the last element of the list, not the previous. Maybe you want xrange(1,1000)?
How about this code:
a=0.01
l = [3.0,7.0,10.0,20.0,200.0]
y = set(int(item) for item in l)
queries = [23,68,103]
ewma = []
x = 1 if (0 in y) else 0
for i in xrange(1, queries[-1]):
x = (1-a)*x
if i in y:
x += a
if i == queries[0]:
ewma.append(x)
queries.pop(0)
When it's done, ewma should have the moving averages for each query point.
Edited to include SchighSchagh's improvements.
For example,
The function could be something like def RandABCD(n, .25, .34, .25, .25):
Where n is the length of the string to be generated and the following numbers are the desired probabilities of A, B, C, D.
I would imagine this is quite simple, however i am having trouble creating a working program. Any help would be greatly appreciated.
Here's the code to select a single weighted value. You should be able to take it from here. It uses bisect and random to accomplish the work.
from bisect import bisect
from random import random
def WeightedABCD(*weights):
chars = 'ABCD'
breakpoints = [sum(weights[:x+1]) for x in range(4)]
return chars[bisect(breakpoints, random())]
Call it like this: WeightedABCD(.25, .34, .25, .25).
EDIT: Here is a version that works even if the weights don't add up to 1.0:
from bisect import bisect_left
from random import uniform
def WeightedABCD(*weights):
chars = 'ABCD'
breakpoints = [sum(weights[:x+1]) for x in range(4)]
return chars[bisect_left(breakpoints, uniform(0.0,breakpoints[-1]))]
The random class is quite powerful in python. You can generate a list with the characters desired at the appropriate weights and then use random.choice to obtain a selection.
First, make sure you do an import random.
For example, let's say you wanted a truly random string from A,B,C, or D.
1. Generate a list with the characters
li = ['A','B','C','D']
Then obtain values from it using random.choice
output = "".join([random.choice(li) for i in range(0, n)])
You could easily make that a function with n as a parameter.
In the above case, you have an equal chance of getting A,B,C, or D.
You can use duplicate entries in the list to give characters higher probabilities. So, for example, let's say you wanted a 50% chance of A and 25% chances of B and C respectively. You could have an array like this:
li = ['A','A','B','C']
And so on.
It would not be hard to parameterize the characters coming in with desired weights, to model that I'd use a dictionary.
characterbasis = {'A':25, 'B':25, 'C':25, 'D':25}
Make that the first parameter, and the second being the length of the string and use the above code to generate your string.
For four letters, here's something quick off the top of my head:
from random import random
def randABCD(n, pA, pB, pC, pD):
# assumes pA + pB + pC + pD == 1
cA = pA
cB = cA + pB
cC = cB + pC
def choose():
r = random()
if r < cA:
return 'A'
elif r < cB:
return 'B'
elif r < cC:
return 'C'
else:
return 'D'
return ''.join([choose() for i in xrange(n)])
I have no doubt that this can be made much cleaner/shorter, I'm just in a bit of a hurry right now.
The reason I wouldn't be content with David in Dakota's answer of using a list of duplicate characters is that depending on your probabilities, it may not be possible to create a list with duplicates in the right numbers to simulate the probabilities you want. (Well, I guess it might always be possible, but you might wind up needing a huge list - what if your probabilities were 0.11235442079, 0.4072777384, 0.2297927874, 0.25057505341?)
EDIT: here's a much cleaner generic version that works with any number of letters with any weights:
from bisect import bisect
from random import uniform
def rand_string(n, content):
''' Creates a string of letters (or substrings) chosen independently
with specified probabilities. content is a dictionary mapping
a substring to its "weight" which is proportional to its probability,
and n is the desired number of elements in the string.
This does not assume the sum of the weights is 1.'''
l, cdf = zip(*[(l, w) for l, w in content.iteritems()])
cdf = list(cdf)
for i in xrange(1, len(cdf)):
cdf[i] += cdf[i - 1]
return ''.join([l[bisect(cdf, uniform(0, cdf[-1]))] for i in xrange(n)])
Here is a rough idea of what might suit you
import random as r
def distributed_choice(probs):
r= r.random()
cum = 0.0
for pair in probs:
if (r < cum + pair[1]):
return pair[0]
cum += pair[1]
The parameter probs takes a list of pairs of the form (object, probability). It is assumed that the sum of probabilities is 1 (otherwise, its trivial to normalize).
To use it just execute:
''.join([distributed_choice(probs)]*4)
Hmm, something like:
import random
class RandomDistribution:
def __init__(self, kv):
self.entries = kv.keys()
self.where = []
cnt = 0
for x in self.entries:
self.where.append(cnt)
cnt += kv[x]
self.where.append(cnt)
def find(self, key):
l, r = 0, len(self.where)-1
while l+1 < r:
m = (l+r)/2
if self.where[m] <= key:
l=m
else:
r=m
return self.entries[l]
def randomselect(self):
return self.find(random.random()*self.where[-1])
rd = RandomDistribution( {"foo": 5.5, "bar": 3.14, "baz": 2.8 } )
for x in range(1000):
print rd.randomselect()
should get you most of the way...
Thank you all for your help, I was able to figure something out, mostly with this info.
For my particular need, I did something like this:
import random
#Create a function to randomize a given string
def makerandom(seq):
return ''.join(random.sample(seq, len(seq)))
def randomDNA(n, probA=0.25, probC=0.25, probG=0.25, probT=0.25):
notrandom=''
A=int(n*probA)
C=int(n*probC)
T=int(n*probT)
G=int(n*probG)
#The remainder part here is used to make sure all n are used, as one cannot
#have half an A for example.
remainder=''
for i in range(0, n-(A+G+C+T)):
ramainder+=random.choice("ATGC")
notrandom=notrandom+ 'A'*A+ 'C'*C+ 'G'*G+ 'T'*T + remainder
return makerandom(notrandom)