Finding the probability of a variable in a collection of lists - Python

I have a selection of lists of variables:
import numpy.random as npr
w = [0.02, 0.03, 0.05, 0.07, 0.11, 0.13, 0.17]
x = 1
y = False
z = [0.12, 0.2, 0.25, 0.05, 0.08, 0.125, 0.175]
v = npr.choice(w, x, y, z)
I want to find the probability of the value v being a particular value from this selection, e.g. False or 0.12.
How do I do this?
Here's what I've tried:
import numpy.random as npr
import math
w = [0.02, 0.03, 0.05, 0.07, 0.11, 0.13, 0.17]
x = 1
y = False
z = [0.12, 0.2, 0.25, 0.05, 0.08, 0.125, 0.175]
v = npr.choice(w, x, y, z)
from collections import Counter
c = Counter(0.02, 0.03, 0.05, 0.07, 0.11, 0.13, 0.17,1,False,0.12, 0.2, 0.25, 0.05, 0.08, 0.125, 0.175)
def probability(0.12):
return float(c[v]/len(w,x,y,z))
but I get a SyntaxError saying that 0.12 is invalid syntax.

There are several issues in the code; I think you want the following:
import numpy.random as npr
import math
from collections import Counter

def probability(v=0.12):
    return float(c[v] / len(combined))

w = [0.02, 0.03, 0.05, 0.07, 0.11, 0.13, 0.17]
x = [1]
y = [False]
z = [0.12, 0.2, 0.25, 0.05, 0.08, 0.125, 0.175]

combined = w + x + y + z
v = npr.choice(combined)
c = Counter(combined)

print(probability())
print(probability(v=0.05))
1) def probability(0.12) does not make sense; you have to pass a variable, which can also have a default value (above I use 0.12).
2) len(w, x, y, z) does not make much sense either; you are probably looking for a list that combines all the elements of w, x, y and z. I put all of those in the list combined.
3) You would also have to add a check in case the user passes e.g. v=12345, which is not included in combined (I leave this to you; a possible sketch is shown after the output below).
The above will print
0.0625
0.125
which gives the expected outcome.
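For the check mentioned in point 3, a minimal sketch could look like the following (it reuses combined and c from above; raising an error is only one possible choice, returning 0.0 would be another):
def probability(v=0.12):
    if v not in c:  # value never occurs in the combined list
        raise ValueError(f"{v} is not one of the provided values")
    return float(c[v] / len(combined))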

Normalize vectors in the complex space: mean=0 and std-dev=1/sqrt(n)

I have a vector, or a batch of vectors (stored in a 2D array, by rows).
The vectors are generated with mean=0 and std-dev=1/sqrt(vec_len), and before or after operations they have to be normalized back to the same form.
I want to normalize them in the complex space.
Here is the skeleton of the function:
@staticmethod
def fft_normalize(x, dim=DEF_DIM):
    cx = rfft(x, dim=dim)
    ....
    rv = irfft(cx_proj, dim=dim)
    return rv
Help me fill in the dots.
Here is the real-value normalization that I use:
@staticmethod
def normalize(a, dim=DEF_DIM):
    norm = torch.linalg.norm(a, dim=dim)
    # if torch.eq(norm, 0): return torch.divide(a, st.MIN)
    if dim is not None: norm = norm.unsqueeze(dim)
    return torch.divide(a, norm)
In [70]: st.normalize(x + 3)
Out[70]:
([[0.05, 0.04, 0.05, ..., 0.04, 0.04, 0.04],
[0.04, 0.04, 0.05, ..., 0.05, 0.04, 0.05],
[0.05, 0.04, 0.05, ..., 0.04, 0.05, 0.04]])
In [71]: st.normalize(x + 5)
Out[71]:
([[0.05, 0.04, 0.05, ..., 0.04, 0.04, 0.04],
[0.04, 0.04, 0.05, ..., 0.05, 0.04, 0.04],
[0.05, 0.04, 0.04, ..., 0.04, 0.05, 0.04]])
In [73]: st.normalize(x + 5).len()
Out[73]: ([1.00, 1.00, 1.00])
In [74]: st.normalize(x + 3).len()
Out[74]: ([1., 1., 1.])
In [75]: st.normalize(x).len()
Out[75]: ([1.00, 1.00, 1.00])
# bad, needs normalization
In [76]: (x + 3).len()
Out[76]: ([67.13, 67.13, 67.13])
@staticmethod
def len(a, dim=DEF_DIM): return torch.linalg.norm(a, dim=dim)
I did not want to post this at first, so as not to influence a possibly better solution. So here is one of my attempts; parts are borrowed from what I found.
This only works for 1D vectors ;(
@staticmethod
def fft_normalize(x, dim=DEF_DIM):  # normalize a vector x in the complex domain
    c = rfft(x, dim=dim)
    ri = torch.vstack([c.real, c.imag])
    norm = torch.abs(c)
    print(norm.shape, ri.shape)
    # norm = torch.linalg.norm(ri, dim=dim)
    # if dim is not None: norm = norm.unsqueeze(dim)
    if torch.any(torch.eq(norm, 0)): norm[torch.eq(norm, 0)] = st.MIN  # !fixme
    ri = torch.divide(ri, norm)  # 2D fails here
    c_proj = ri[0, :] + 1j * ri[1, :]
    rv = irfft(c_proj, dim=dim)
    return rv
I adapted the solution of Thibault Cimic ... it seems to work for 1D vectors, but not for 2D:
@staticmethod
def fft_normalize(x, dim=DEF_DIM, dot_dim=None):  # normalize a vector x in the complex domain
    c = rfftn(x, dim=dim)
    c_conj = torch.conj(c)
    if dot_dim is None: dot_dim = st.dot_dims(c, c_conj)
    c_norm = torch.sqrt(torch.tensordot(c, c_conj, dims=dot_dim))
    c_proj = torch.divide(c, c_norm)
    rv = irfftn(c_proj, dim=dim)
    return rv
I'm guessing you want to normalize with the norm associated with the natural complex inner product. Is this what you're trying to do?
def fft_normalize(x, dim=DEF_DIM):  # normalize a vector x in the complex domain
    c = rfft(x, dim=dim)
    c_norm = math.sqrt(c.dot(numpy.conjugate(c)))
    c_proj = c / c_norm
    rv = irfft(c_proj, dim=dim)
    return rv
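For batched (2D) input, a hedged sketch of the same idea with torch might look like the following: it normalizes every row by its complex L2 norm in the rfft domain; fft_normalize_2d and the clamping constant are choices made here for illustration, not part of the original code:
import torch
from torch.fft import rfft, irfft

def fft_normalize_2d(x, dim=-1):
    # normalize each vector along `dim` to unit norm in the complex (rfft) domain
    c = rfft(x, dim=dim)
    norm = torch.linalg.vector_norm(c, dim=dim, keepdim=True)  # complex-aware norm
    norm = torch.clamp(norm, min=torch.finfo(x.dtype).tiny)    # guard against division by zero
    return irfft(c / norm, n=x.shape[dim], dim=dim)
Working directly on the complex tensor (instead of stacking real and imaginary parts) and keeping the reduced dimension with keepdim=True is what lets the same code handle both 1D and 2D input.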

getting values from a CDF

Good morning, everyone. I have a set of values.
Arr = np.array([0.11, 0.14, 0.22, 0.26, 0.31, 0.36, 0.44, 0.69, 0.70, 0.70, 0.70, 0.75, 0.98, 1.40])
I have constructed the CDF function in this way:
def ecdf(a):
    x, counts = np.unique(a, return_counts=True)
    cusum = np.cumsum(counts)
    return x, cusum / cusum[-1]

def plot_ecdf(a):
    x, y = ecdf(a)
    x = np.insert(x, 0, x[0])
    y = np.insert(y, 0, 0.)
    plt.plot(x, y, drawstyle='steps-post')
    plt.grid(True)

ecdf_ = ecdf(Arr)
plot_ecdf(Arr)
Obtaining this figure:
Now I want to divide the space (the y-axis) into 5 parts. To do this I am using the following:
from scipy.stats.qmc import LatinHypercube
engine = LatinHypercube(d=1)
sample = engine.random(n=5)  # array of float64
For example, obtaining these 5 randomly generated values:
0.0886183
0.450613
0.808077
0.753524
0.343108
At this point I would like to obtain the corresponding values on the CDF, as in the picture.
I also observed that the CDF constructed in this way takes a discrete set of values, which may not be optimal for my purpose.
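A hedged sketch of one way to map the sampled probabilities back onto x-values (an empirical quantile lookup; it reuses the ecdf function and Arr from above, and ecdf_inverse is a name introduced here just for illustration):
import numpy as np

def ecdf_inverse(probs, a):
    # for each probability, return the smallest x whose ECDF value reaches it
    x, y = ecdf(a)
    idx = np.searchsorted(y, probs, side='left')
    return x[np.clip(idx, 0, len(x) - 1)]

values = ecdf_inverse(sample.ravel(), Arr)  # sample comes from the LatinHypercube engine
If the discrete steps are a problem, np.interp(probs, y, x) would give a continuous, interpolated inverse instead.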

Distribute values based on sum and list of provided values

I need to generate a list of values from the provided ones that satisfies these requirements:
the sum of all generated values should be equal to total, only providedValues should be used to build the sum, and providedValues and total can be any double.
For example:
total = 1.0
providedValues = [0.5, 0.25]
Values in the output list should be randomly distributed; for example, the output can be [0.5, 0.25, 0.25], [0.25, 0.5, 0.25] or [0.25, 0.25, 0.5].
In case the sum can't equal total:
total = 1.0
providedValues = [0.3]
the algorithm should throw an error.
The implementation language doesn't matter much; I'll try to read any.
This algorithm will return all the possible combinations that sum to total.
import itertools
import numpy as np

def find_combination(total, providedValues):
    i = 1
    rv = []
    while True:
        combs = list(itertools.combinations_with_replacement(providedValues, i))
        validCombs = [comb for comb in combs if np.isclose(sum(comb), total)]
        if validCombs:
            rv.extend(validCombs)
        elif not [comb for comb in combs if sum(comb) <= total]:
            return rv
        i += 1
Output:
>>> find_combination(1.0, [0.5, 0.25])
[(0.5, 0.5), (0.5, 0.25, 0.25), (0.25, 0.25, 0.25, 0.25)]
>>> find_combination(1.0, [0.3])
[]
If you want to get all permutations of the results, you can use
>>> set(itertools.permutations((0.5, 0.25, 0.25)))
{(0.25, 0.25, 0.5), (0.25, 0.5, 0.25), (0.5, 0.25, 0.25)}
For example:
>>> set(y for x in find_combination(1.0, [0.5, 0.25]) for y in itertools.permutations(x))
{(0.25, 0.25, 0.25, 0.25),
(0.25, 0.25, 0.5),
(0.25, 0.5, 0.25),
(0.5, 0.25, 0.25),
(0.5, 0.5)}
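If only a single, randomly ordered result is needed rather than the full set, one hedged option is to enumerate as above and then pick one arrangement at random:
import random

all_perms = list(set(y for x in find_combination(1.0, [0.5, 0.25])
                     for y in itertools.permutations(x)))
print(random.choice(all_perms))  # e.g. (0.25, 0.5, 0.25)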
Here is my solution, based on the assumption that two values are provided; you may want to adapt it to your needs.
from itertools import permutations, combinations

def get_scala(x, y, t):
    # get the list of scale combinations:
    # find a, b such that a*x + b*y == total
    scala_list = []
    amax = int(t // x)  # possible max scale for x
    bmax = int(t // y)  # possible max scale for y
    for i in range(1, amax + 1):
        for j in range(1, bmax + 1):
            if i*x + j*y == t:  # scale combination that adds up to total
                scala_list.append((i, j))
    if scala_list:
        return scala_list
    else:
        print("Warning: cannot add up to the total")

def dist(x, y, scala):
    a, b = scala
    # build a base list with a copies of x and b copies of y, e.g. [x, x, y, y, y]
    bl = [x]*a + [y]*b
    # get permutations, using a set to drop duplicate items
    return set(permutations(bl))

for l in get_scala(0.3, 0.2, 1):
    for d in dist(0.3, 0.2, l):
        print(d)
The output would look like:
(0.2, 0.3, 0.2, 0.3)
(0.2, 0.2, 0.3, 0.3)
(0.3, 0.2, 0.2, 0.3)
(0.3, 0.2, 0.3, 0.2)
(0.3, 0.3, 0.2, 0.2)
(0.2, 0.3, 0.3, 0.2)

Chi-squared for the optimal order of a fit with polynomial

I have the following code, in which DGauss is a function that generates the expected values. The two arrays, on the other hand, allow me to generate a distribution, which I take as the observed values.
Based on the observed values, the code extracts a polynomial (of the seventh degree, for the moment) that describes their trend.
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit
def DGauss(x, I1, I2, sigma1, sigma2):
    return I1*np.exp(-x*x/(2*sigma1*sigma1)) + I2*np.exp(-x*x/(2*sigma2*sigma2))
Pos = np.array([3.28, 3.13, 3.08, 3.03, 2.98, 2.93, 2.88, 2.83, 2.78, 2.73, 2.68,
2.63, 2.58, 2.53, 2.48, 2.43, 2.38, 2.33, 2.28, 2.23, 2.18, 2.13,
2.08, 2.03, 1.98, 1.93, 1.88, 1.83, 1.78, 1.73, 1.68, 1.63, 1.58,
1.53, 1.48, 1.43, 1.38, 1.33, 1.28, 1.23, 1.18, 1.13, 1.08, 1.03,
0.98, 0.93, 0.88, 0.83, 0.78, 0.73, 0.68, 0.63, 0.58, 0.53, 0.48,
0.43, 0.38, 0.33, 0.28, 0.23, 0.18, 0.13, 0.08, 0.03])
Val = np.array([0.00986279, 0.01529543, 0.0242624 , 0.0287456 , 0.03238484,
0.03285927, 0.03945234, 0.04615091, 0.05701618, 0.0637672 ,
0.07194268, 0.07763934, 0.08565687, 0.09615262, 0.1043281 ,
0.11350606, 0.1199406 , 0.1260062 , 0.14093328, 0.15079665,
0.16651464, 0.18065023, 0.1938894 , 0.2047541 , 0.21794024,
0.22806706, 0.23793043, 0.25164404, 0.2635118 , 0.28075974,
0.29568682, 0.30871501, 0.3311846 , 0.34648062, 0.36984661,
0.38540666, 0.40618835, 0.4283945 , 0.45002014, 0.48303911,
0.50746062, 0.53167057, 0.5548792 , 0.57835128, 0.60256181,
0.62566436, 0.65704847, 0.68289386, 0.71332794, 0.73258027,
0.769608 , 0.78769989, 0.81407275, 0.83358852, 0.85210239,
0.87109068, 0.89456217, 0.91618782, 0.93760247, 0.95680234,
0.96919757, 0.9783219 , 0.98486193, 0.9931429 ])
f = np.linspace(-9,9,2*len(Pos))
plt.errorbar(Pos, Val, xerr=0.02, yerr=2.7e-3, fmt='o')
popt, pcov = curve_fit(DGauss, Pos, Val)
plt.plot(f, DGauss(f, *popt), '--', label='Double Gauss')
x = Pos
y = Val
#z, w = np.polyfit(x, y, 7, full=False, cov=True)
p = np.poly1d(z)
u = np.array(p)
xp = np.linspace(1, 6, 100)
_ = plt.plot(xp, p(xp), '-', color='darkviolet')
x = symbols('x')
list = u[::-1]
poly = sum(S("{:7.3f}".format(v))*x**i for i, v in enumerate(list))
eq_latex = sympy.printing.latex(poly)
print(eq_latex)
# LOOP SUGGESTED BY @Fourier
dof = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for i in dof:
    z = np.polyfit(x, y, i, full=False, cov=True)
    chi = np.sum((np.polyval(z, x) - y) ** 2)
    chinorm = chi / i
    plt.plot(chinorm)
What I would like to do now is to make a fit by varying the order of the polynomial to figure out which is the minimum order I need to have a good fit and not exceed the number of free parameters. In particular, I would like to make this fit with different orders and plot the chi-squared, which must be normalized with respect to the number of degrees of freedom.
Could someone help me kindly?
Thanks!
Based on the posted code this should work for your purpose:
chiSquares = []
dofs = 10
for i in np.arange(1, dofs + 1):
    z = np.polyfit(x, y, i, full=False, cov=False)
    chi = np.sum((np.polyval(z, x) - y) ** 2) / np.std(y)  # ideally you should divide this by an error estimate for the Val array
    chinorm = chi / i
    chiSquares.append(chinorm)
plt.plot(np.arange(1, dofs + 1), chiSquares)
If it is not evident from the plot, you can further use the F-test to check how many degrees of freedom are really needed:
import scipy.stats

n = len(y)
for d, (rss1, rss2) in enumerate(zip(chiSquares, chiSquares[1:])):
    p1 = d + 1
    p2 = d + 2
    F = ((rss1 - rss2) / (p2 - p1)) / (rss2 / (n - p2))
    p = 1.0 - scipy.stats.f.cdf(F, p1, p2)
    print('F-stats: {:.3f}, p-value: {:.5f}'.format(F, p))

searching k nearest neighbors in numpy

I'm new to Python. I want to use numpy and sklearn to do KNN. However, there's a NaN in my data. I set the dtype of genfromtxt to None, but then the array looks like this:
[('ADT1_YEAST', 0.58, 0.61, 0.47, 0.13, 0.5, 0.0, 0.48, 0.22, 'MIT')
('ADT2_YEAST', 0.43, 0.67, 0.48, 0.27, 0.5, 0.0, 0.53, 0.22, 'MIT')
('ADT3_YEAST', 0.64, 0.62, 0.49, 0.15, 0.5, 0.0, 0.53, 0.22, 'MIT') ...,
('ZNRP_YEAST', 0.67, 0.57, 0.36, 0.19, 0.5, 0.0, 0.56, 0.22, 'ME2')
('ZUO1_YEAST', 0.43, 0.4, 0.6, 0.16, 0.5, 0.0, 0.53, 0.39, 'NUC')
('G6PD_YEAST', 0.65, 0.54, 0.54, 0.13, 0.5, 0.0, 0.53, 0.22, 'CYT')]
Then I get a "data type not understood" error from the NearestNeighbors function.
Here is my code:
import numpy as np
from sklearn.neighbors import NearestNeighbors

npGem = np.genfromtxt('temp.data', dtype=None)
X = np.array(npGem)
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(X)
Can anyone show me how to get this data read properly? Thanks in advance.
If I understand the problem, you're really asking how to encode the categorical variables so that they can be properly interpreted by the nearest neighbors algorithm. You can do this with sklearn as explained in 4.2.4. Encoding categorical features. If, on the other hand, you have incomplete features, see 4.2.6. Imputation of missing values.
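A hedged sketch of what that could look like with the current sklearn API (SimpleImputer replaced the older Imputer from those doc sections; the small arrays below are made up for illustration, not taken from the yeast data):
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors

# numeric features, possibly containing NaNs, plus a categorical target column
features = np.array([[0.58, 0.61, np.nan, 0.13],
                     [0.43, 0.67, 0.48, 0.27],
                     [0.64, 0.62, 0.49, 0.15]])
labels = np.array(['MIT', 'MIT', 'ME2'])

y = LabelEncoder().fit_transform(labels)                    # strings -> integer codes
X = SimpleImputer(strategy='mean').fit_transform(features)  # fill NaNs with column means

nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(X)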
I think you need to get the data into a matrix properly. I typically do something like this:
import numpy as np

features = []  # list of lists of the feature variables
classes = []   # list of the target variables
with open('temp.data') as f:  # the data file from the question
    for line in f:
        line = line.strip().split()  # split the line into pieces on any whitespace
        features.append(line[1:-1])  # or whatever indices your features are in
        classes.append(line[-1])     # or whatever index your target variable is in

classes = np.array(classes)
features = np.array(features, dtype=float)
