Is there a standard function for Python which outputs True or False probabilistically based on the input of a random number from 0 to 1?
example of what I mean:
def decision(probability):
...code goes here...
return ...True or False...
The above example, if given an input of, say, 0.7, will return True with a 70% probability and False with a 30% probability.
import random
def decision(probability):
    return random.random() < probability
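As a rough sanity check (a quick sketch of my own, not from the original answer), you can estimate the empirical rate over many calls and confirm it sits close to the requested probability:
import random

def decision(probability):
    return random.random() < probability

# the observed frequency should land near 0.7, up to sampling noise
trials = 100000
hits = sum(decision(0.7) for _ in range(trials))
print(hits / trials)  # e.g. ~0.700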
Given a function rand that returns a number between 0 and 1, you can define decision like this:
bool decision(float probability)
{
    return rand() < probability;
}
Assuming that rand() returns a value in the range [0.0, 1.0) (so can output a 0.0, will never output a 1.0).
Just use the PyProbs library. It is very easy to use.
>>> from pyprobs import Probability as pr
>>>
>>> # You can pass a float (e.g. 0.5, 0.157), an int (e.g. 1, 0) or a str (e.g. '50%', '3/11')
>>> pr.prob(50/100)
False
>>> pr.prob(50/100, num=5)
[False, False, False, True, False]
I use this to generate a random boolean in Python with a given probability:
from random import randint
n = 8  # inverse of the probability, i.e. True about 1 time in 8
rand_bool = randint(0, n*n - 1) % n == 0
So, to expand that:
def rand_bool(prob):
    s = str(prob)
    p = s.index('.')
    d = 10**(len(s) - p - 1)  # scale matching the number of decimal digits
    return randint(0, d*d - 1) % d < int(s[p+1:])
I came up with this myself but it seems to work.
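To make the string trick concrete, here is a worked trace for prob = 0.25 (my own walkthrough of how the string parsing maps to the randint call):
from random import randint

# for prob = 0.25:
#   s = '0.25'                       -> the probability as text
#   p = s.index('.') = 1             -> position of the decimal point
#   d = 10**(len(s) - p - 1) = 100   -> scale matching the two decimal digits
#   int(s[p+1:]) = 25                -> the decimals as an integer
# so the call reduces to randint(0, 9999) % 100 < 25, which is True 25% of the time
print(randint(0, 100*100 - 1) % 100 < 25)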
If you want to amass a lot of data, I would suggest using a map:
from numpy import random as rn
p = 0.15
data = rn.random(100)
final_data = list(map(lambda x: x < p, data))
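As a small aside (not part of the original answer), NumPy can do the comparison element-wise on the whole array, so the map is not strictly needed:
import numpy as np

p = 0.15
data = np.random.random(100)
final_data = data < p        # boolean array, one comparison per element
print(final_data.mean())     # empirical fraction of True, roughly p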
I am evaluating an extensive summation by computing each term separately in a for-loop (Python 3.5 + NumPy 1.15.4). However, I obtained a surprising result when comparing manual term-by-term evaluation with the for-loop. See the MWE below.
S = sum(c_i * x^i) for i = 0..n
Main questions:
Where does the difference in the outputs y1 and y2 originate from?
How could I alter the code such that the for-loop yields the expected result (y1==y2)?
Comparing dy1 and dy2:
dy1:
[-1.76004137e-02 3.50290845e+01 1.50326037e+01 -7.25045852e+01
2.08908445e+02 -3.31104542e+02 2.98005855e+02 -1.53154111e+02
4.18203833e+01 -4.68961704e+00 0.00000000e+00]
dy2:
[-1.76004137e-02 3.50290845e+01 1.50326037e+01 -7.25045852e+01
-3.27960559e-01 -4.01636743e-04 2.26525295e-07 4.80637463e-10
1.93967535e-13 -1.93976497e-17 -0.00000000e+00]
dy1==dy2:
[ True True True True False False False False False False True]
Thanks!
MWE:
import numpy as np
coeff = np.array([
[ 0.000000000000E+00, -0.176004136860E-01],
[ 0.394501280250E-01, 0.389212049750E-01],
[ 0.236223735980E-04, 0.185587700320E-04],
[-0.328589067840E-06, -0.994575928740E-07],
[-0.499048287770E-08, 0.318409457190E-09],
[-0.675090591730E-10, -0.560728448890E-12],
[-0.574103274280E-12, 0.560750590590E-15],
[-0.310888728940E-14, -0.320207200030E-18],
[-0.104516093650E-16, 0.971511471520E-22],
[-0.198892668780E-19, -0.121047212750E-25],
[-0.163226974860E-22, 0.000000000000E+00]
]).T
c = coeff[1] # select appropriate coeffs
x = 900 # input
# manual calculation
y = c[0]*x**0 + c[1]*x**1 + c[2]*x**2 + c[3]*x**3 + c[4]*x**4 + \
c[5]*x**5 + c[6]*x**6 + c[7]*x**7 + c[8]*x**8 + c[9]*x**9 + c[10]*x**10
print('y:',y)
# calc terms individually
dy1 = np.zeros(c.size)
dy1[0] = c[0]*x**0
dy1[1] = c[1]*x**1
dy1[2] = c[2]*x**2
dy1[3] = c[3]*x**3
dy1[4] = c[4]*x**4
dy1[5] = c[5]*x**5
dy1[6] = c[6]*x**6
dy1[7] = c[7]*x**7
dy1[8] = c[8]*x**8
dy1[9] = c[9]*x**9
dy1[10] = c[10]*x**10
# calc terms in for loop
dy2 = np.zeros(len(c))
for i in np.arange(len(c)):
    dy2[i] = c[i]*x**i
# summation and print
y1 = np.sum(dy1)
print('y1:',y1)
y2 = np.sum(dy2)
print('y2:',y2)
Output:
y: 37.325915370853856
y1: 37.32591537085385
y2: -22.788859384118823
It seems that raising a Python int to the power of a NumPy integer (of a specific size) converts the result to a NumPy integer of the same size.
Example:
type(900**np.int32(10))
returns numpy.int32 and
type(900**np.int64(10))
returns numpy.int64
From this Stack Overflow question it seems that while Python ints are variable-sized, NumPy integers are not (the size is fixed by the type, for example np.int32 or np.int64). So, while Python's range function returns integers of variable size (Python's int type), np.arange returns integers of a specific type (if not specified, the type is inferred).
Comparing Python integer math with NumPy integer math:
900**10 returns 348678440100000000000000000000
while 900**np.int32(10) returns -871366656
It looks like you get integer overflow via the np.arange function because the NumPy integer dtype (in this case inferred as np.int32) is too small to store the resulting value.
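A quick way to see why this overflows (my own illustration) is to compare the size of the term with the range of an int32:
import numpy as np

print(np.iinfo(np.int32).max)  # 2147483647, the largest value an int32 can hold
print(900**10)                 # 348678440100000000000000000000, far beyond that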
Edit:
In this specific case, using np.arange(len(c), dtype = np.uint64) seems to output the right values:
dy2 = np.zeros(len(c))
for i in np.arange(len(c), dtype=np.uint64):
    dy2[i] = c[i]*x**i
dy1 == dy2
Outputs:
array([ True, True, True, True, True, True, True, True, True,
True, True])
Note: the accuracy might suffer using NumPy in this case (int(900**np.uint64(10)) returns 348678440099999970966892445696, which is less than 900**10), so if that is of importance, I'd still opt to use Python's built-in range function.
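In other words, a minimal sketch of that suggestion (reusing np, c and x from the MWE above): keeping the loop index as a plain Python int avoids the overflow entirely:
# same loop as in the question, but with Python's built-in range,
# so x**i is evaluated with arbitrary-precision Python integers
dy2 = np.zeros(len(c))
for i in range(len(c)):
    dy2[i] = c[i]*x**i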
def large_sum(lst, num):
    def larger(x):
        return (int(x) > num)
    m = list(map(larger, lst))
    print(m)
large_sum([2,4,6,8,10], 7)
# [False, False, False, True, True]
I wish to get the sum of the elements in [2,4,6,8,10] where m is True.
You can use itertools.compress in the following way:
from itertools import compress

def large_sum(lst, num):
    def larger(x):
        return (int(x) > num)
    m = list(map(larger, lst))
    return m

l = [2,4,6,8,10]
n = 7
res = large_sum(l, n)
print(sum(compress(l, res)))
I kept the original logic, but I would suggest using @chris' method as it would be better efficiency-wise:
l = [2,4,6,8,10]
n = 7
sum(i for i in l if i > n)
The main thing is to use filter instead of map:
def large_sum(lst, num):
    def larger(x):
        return x > num
    return sum(filter(larger, lst))
print(large_sum([2,4,6,8,10], 7))
# 18
map transforms each value in the list, in this case into a boolean. The original values are no longer there, and if you sum the booleans, True is converted to 1 and False to 0, which is why you get 2.
filter, on the other hand, returns the original values, but only those for which the predicate (larger in this case) returns True, and omits the others. Then you just sum those.
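A quick side-by-side (my own illustration) makes the difference visible:
lst, num = [2, 4, 6, 8, 10], 7

print(list(map(lambda x: x > num, lst)))     # [False, False, False, True, True]
print(sum(map(lambda x: x > num, lst)))      # 2  -> the booleans are summed as 0/1
print(list(filter(lambda x: x > num, lst)))  # [8, 10] -> the original values are kept
print(sum(filter(lambda x: x > num, lst)))   # 18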
Before I ask my question, I provide you with the code.
Code
from scipy import *
x = randn(10)
cum_x = cumsum(x)
#The objective is to recover x using cum_x and the diff function.
y = append(cum_x[0],diff(cum_x))
#Now, y should be equal to x but this is not confirmed by the function in1d
test = in1d(x,y)
The variable test is not an array of all True boolean values, even though y and x are clearly the same. What is the problem here?
Thank you in advance.
If you use set_printoptions to increase the precision, you will see some differences:
from scipy import *
set_printoptions(30)
x = randn(10)
cum_x = cumsum(x)
#The objective is to recover x using cum_x and the diff function.
y = append(cum_x[0], diff(cum_x))
print(x)
print("\n")
print(y)
#Now, y should be equal to x but this is not confirmed by the function in1d
test = in1d(x, y)
print(test)
Output:
[ 0.54816314147543721002620031868 0.14319052613251953554041051575
0.489110961092741158839913850898 -0.093011827554544138085823590245
-0.58370623188476589149331630324 -0.40395493550429123486011917521
0.387387395892057895263604905267 1.001637373359834937147638811439
-1.486778459872974744726548124163 1.446772274227251076084144187917]
[ 0.54816314147543721002620031868 0.143190526132519591051561747008
0.48911096109274110332876261964 -0.093011827554544179719187013688
-0.58370623188476589149331630324 -0.40395493550429123486011917521
0.387387395892057895263604905267 1.001637373359834937147638811439
-1.486778459872974744726548124163 1.446772274227251076084144187917]
[ True False False False True True True True True True]
What you probably want is allclose. Interestingly, setting the dtype to np.float128 or np.longdouble on my Ubuntu system does not lose precision, and in1d returns all True:
cum_x = cumsum(x,dtype=np.longdouble)
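For completeness, a minimal sketch of the allclose check mentioned above (exact equality is the wrong test for floats produced by different sequences of operations):
import numpy as np

x = np.random.randn(10)
cum_x = np.cumsum(x)
y = np.append(cum_x[0], np.diff(cum_x))

print(np.allclose(x, y))   # True: equal up to floating-point tolerance
print(np.isclose(x, y))    # element-wise version, if you want the individual flags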
The R ppoints function is described as:
Ordinates for Probability Plotting
Description:
Generates the sequence of probability points ‘(1:m - a)/(m +
(1-a)-a)’ where ‘m’ is either ‘n’, if ‘length(n)==1’, or
‘length(n)’.
Usage:
ppoints(n, a = ifelse(n <= 10, 3/8, 1/2))
...
I've been trying to replicate this function in Python and I have a couple of doubts.
1. The first m in (1:m - a)/(m + (1-a)-a) is always an integer: int(n) (i.e. the integer part of n) if length(n)==1, and length(n) otherwise.
2. The second m in the same equation is NOT an integer if length(n)==1 (it takes the real value of n), and it IS an integer (length(n)) otherwise.
3. The n in a = ifelse(n <= 10, 3/8, 1/2) is the real number n if length(n)==1, and the integer length(n) otherwise.
These points are not made clear at all in the description, and I'd very much appreciate it if someone could confirm that this is the case.
Added:
Well, this was initially posted at https://stats.stackexchange.com/ because I was hoping to get the input of statisticians who work with the ppoints function. Since it has been migrated here, I'll paste below the function I wrote to replicate ppoints in Python. I've tested it and both seem to give back the same results, but it'd be great if someone could clarify the points made above, because they are not made at all clear by the function's description.
def ppoints(vector):
    '''
    Mimics R's function 'ppoints'.
    '''
    m_range = int(vector[0]) if len(vector)==1 else len(vector)
    n = vector[0] if len(vector)==1 else len(vector)
    a = 3./8. if n <= 10 else 1./2
    m_value = n if len(vector)==1 else m_range
    pp_list = [((m+1)-a)/(m_value+(1-a)-a) for m in range(m_range)]
    return pp_list
I would implement this with numpy:
import numpy as np
def ppoints(n, a):
    """numpy analogue of R's `ppoints` function
    see details at http://stat.ethz.ch/R-manual/R-patched/library/stats/html/ppoints.html
    :param n: array type or number"""
    try:
        n = float(len(n))
    except TypeError:
        n = float(n)
    return (np.arange(n) + 1 - a)/(n + 1 - 2*a)
Sample output:
>>> ppoints(5, 1./2)
array([ 0.1, 0.3, 0.5, 0.7, 0.9])
>>> ppoints(5, 1./4)
array([ 0.13636364, 0.31818182, 0.5 , 0.68181818, 0.86363636])
>>> n = 10
>>> a = 3./8. if n <= 10 else 1./2
>>> ppoints(n, a)
array([ 0.06097561, 0.15853659, 0.25609756, 0.35365854, 0.45121951,
0.54878049, 0.64634146, 0.74390244, 0.84146341, 0.93902439])
One can use R-fiddle to test the implementation.
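As a quick cross-check (my own sketch; the OP's list-based function is renamed ppoints_list here purely to avoid the name clash with the NumPy version above), the two Python implementations agree:
import numpy as np

# ppoints_list: the OP's list-based implementation above, renamed
# ppoints: the NumPy implementation above
n = 5
a = 3./8. if n <= 10 else 1./2.
print(np.allclose(ppoints_list([1, 2, 3, 4, 5]), ppoints(n, a)))  # True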
I have a big data set of floating point numbers. I iterate through them and evaluate np.log(x) for each of them.
I get
RuntimeWarning: divide by zero encountered in log
I would like to get around this and return 0 if this error occurs.
I am thinking of defining a new function:
def safe_ln(x):
    # returns: ln(x) but replaces -inf with 0
    l = np.log(x)
    # if l == -inf:
    l = 0
    return l
Basically, I need a way of testing whether the output is -inf, but I don't know how to proceed.
Thank you for your help!
You are using an np function, so I can safely guess that you are working on a NumPy array?
Then the most efficient way to do this is to use the where function instead of a for loop:
myarray = np.random.randint(10, size=10)
result = np.where(myarray > 0, np.log(myarray), 0)
Otherwise you can simply use the log function and then patch the holes:
myarray = np.random.randint(10, size=10)
result = np.log(myarray)
result[result == -np.inf] = 0
The np.log function correctly returns -inf when applied to a value of 0, so are you sure you want to return 0? If somewhere you have to revert to the original values, you are going to run into problems, turning zeros into ones...
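To make that last remark concrete (my own illustration): if the zeros are mapped to a log-value of 0 and the log is later undone with exp, the original zeros come back as ones:
import numpy as np

a = np.array([0.0, 1.0, 10.0])
safe = np.log(np.where(a > 0, a, 1.0))  # zeros replaced so that their "log" is 0
print(np.exp(safe))                     # [ 1.  1. 10.] -- the original 0 is now 1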
Since the log of x=0 is minus infinity, I'd simply check whether the input value is zero and return whatever you want there:
def safe_ln(x):
    if x <= 0:
        return 0
    return math.log(x)
EDIT: small edit: you should check for all values smaller than or equal to 0.
EDIT 2: np.log is of course meant for a NumPy array; for single values you should use math.log. This is how the above function looks with NumPy:
def safe_ln(x, minval=0.0000000001):
    return np.log(x.clip(min=minval))
You can do this (note that by default np.log(0) only emits a RuntimeWarning rather than raising, so the error state has to be raised for the except clause to trigger):
def safe_ln(x):
    try:
        # promote the divide-by-zero warning to an exception so it can be caught
        with np.errstate(divide='raise'):
            l = np.log(x)
    except FloatingPointError:
        l = 0
    return l
I like to use sys.float_info.min as follows:
>>> import numpy as np
>>> import sys
>>> arr = np.linspace(0.0, 1.0, 3)
>>> print(arr)
[0. 0.5 1. ]
>>> arr[arr < sys.float_info.min] = sys.float_info.min
>>> print(arr)
[2.22507386e-308 5.00000000e-001 1.00000000e+000]
>>> np.log10(arr)
array([-3.07652656e+02, -3.01029996e-01, 0.00000000e+00])
Other answers have also introduced small positive values, but I prefer to use the smallest representable positive value to make the approximation more accurate.
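A small aside (my own note): the same constant is exposed by NumPy itself, which is handy when you are already working with arrays:
import sys
import numpy as np

# smallest positive normalized float64, identical to sys.float_info.min
print(np.finfo(np.float64).tiny)                         # 2.2250738585072014e-308
print(np.finfo(np.float64).tiny == sys.float_info.min)   # True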
The answer given by Enrico is nice, but both solutions result in a warning:
RuntimeWarning: divide by zero encountered in log
As an alternative, we can still use the where function but only execute the main computation where it is appropriate:
# data (as in Enrico's answer)
myarray = np.random.randint(10, size=10)

# answer from Enrico...
result = np.where(myarray > 0, np.log(myarray), 0)

# alternative implementation -- a bit more typing but avoids warnings.
loc = np.where(myarray > 0)
result2 = np.zeros_like(myarray, dtype=float)
result2[loc] = np.log(myarray[loc])

# check it is giving the right solution:
print(np.allclose(result, result2))
My use case was for division, but the principle is clearly the same:
x = np.random.randint(10, size=10)
divisor = np.ones(10,)
divisor[3] = 0 # make one divisor invalid
y = np.zeros_like(divisor, dtype=float)
loc = np.where(divisor>0) # (or !=0 if your data could have -ve values)
y[loc] = x[loc] / divisor[loc]
Use exception handling:
In [27]: def safe_ln(x):
   ....:     try:
   ....:         return math.log(x)
   ....:     except ValueError:  # math.log raises ValueError for x <= 0
   ....:         return float("-inf")
   ....:
In [28]: safe_ln(0)
Out[28]: -inf
In [29]: safe_ln(1)
Out[29]: 0.0
In [30]: safe_ln(-100)
Out[30]: -inf
You could also catch the warning itself; note that by default NumPy only warns here (it does not raise), so the warning must first be turned into an error:
import warnings

def safe_ln(x):
    # returns: ln(x) but replaces -inf with 0
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        try:
            l = np.log(x)
        except RuntimeWarning:
            l = 0
    return l
For those looking for an np.log solution that takes an np.ndarray and nudges up only the zero values:
import numpy as np

def smarter_nextafter(x: np.ndarray) -> np.ndarray:
    safe_x = np.where(x != 0, x, np.nextafter(x, 1))
    return np.log(safe_x)

def clip_usage(x: np.ndarray, safe_min: float | None = None) -> np.ndarray:
    # Inspiration: https://stackoverflow.com/a/13497931/
    clipped_x = x.clip(min=safe_min or np.finfo(x.dtype).tiny)
    return np.log(clipped_x)

def inplace_usage(x: np.ndarray, safe_min: float | None = None) -> np.ndarray:
    # Inspiration: https://stackoverflow.com/a/62292638/
    x[x == 0] = safe_min or np.finfo(x.dtype).tiny
    return np.log(x)
Or if you don't mind nudging all values and like bad big-O runtimes:
def brute_nextafter(x: np.ndarray) -> np.ndarray:
    # Just for reference, don't use this
    while not x.all():
        x = np.nextafter(x, 1)
    return np.log(x)