Iterate through a data frame to generate random numbers in Python

Starting with this dataframe, I want to generate 100 random numbers for each row, using the hmean column for loc and the hstd column for scale.
I am starting with a data frame that I convert to an array, and I want to iterate through the entire data frame to produce the output shown further below.
My code below only returns the answer for row zero.
Name amax hmean hstd amin
0 Bill 22.924545 22.515861 0.375822 22.110000
1 Bob 26.118182 24.713880 0.721507 23.738400
2 Becky 23.178606 22.722464 0.454028 22.096752
This code provides one row of output, instead of three
from scipy import stats
import pandas as pd
def h2f(df, n):
    for index, row in df.iterrows():
        list1 = []
        nr = df.as_matrix()
        ff = stats.norm.rvs(loc=nr[index,2], scale=nr[index,3], size = n)
        list1.append(ff)
        return list1
df2 = h2f(data, 100)
pd.DataFrame(df2)
This is the output of my code
0 1 2 3 4 ... 99 100
0 22.723833 22.208324 22.280701 22.416486 22.620035 22.55817
This is the desired output
0 1 2 3 ... 99 100
0 22.723833 22.208324 22.280701 22.416486 22.620035
1 21.585776 22.190145 22.206638 21.927285 22.561882
2 22.357906 22.680952 21.4789 22.641407 22.341165

Dedent return list1 so it is not in the for-loop.
Otherwise, the function returns after only one pass through the loop.
Also move list1 = [] outside the for-loop so list1 does not get re-initialized with every pass through the loop:
import io
from scipy import stats
import pandas as pd
def h2f(df, n):
    list1 = []
    for index, row in df.iterrows():
        mean, std = row['hmean'], row['hstd']
        ff = stats.norm.rvs(loc=mean, scale=std, size=n)
        list1.append(ff)
    return list1
content = '''\
Name amax hmean hstd amin
0 Bill 22.924545 22.515861 0.375822 22.110000
1 Bob 26.118182 24.713880 0.721507 23.738400
2 Becky 23.178606 22.722464 0.454028 22.096752'''
df = pd.read_table(io.StringIO(content), sep=r'\s+')
df2 = pd.DataFrame(h2f(df, 100))
print(df2)
PS. It is inefficient to call nr = df.as_matrix() with each pass through the loop.
Since nr never changes, call it at most once, before entering the for-loop.
Even better, just use row['hmean'] and row['hstd'] to obtain the desired numbers.
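If the per-row loop is not needed at all, scipy can broadcast array-valued loc and scale, so one call draws n samples for every row at once. A minimal sketch, assuming the same hmean/hstd columns as above (the h2f_vectorized name is just for illustration):
from scipy import stats
import pandas as pd
def h2f_vectorized(df, n):
    # loc and scale are per-row means/stds; size=(n, len(df)) broadcasts
    # against them, giving n draws for every row in a single call
    samples = stats.norm.rvs(loc=df['hmean'].values,
                             scale=df['hstd'].values,
                             size=(n, len(df)))
    # transpose so each output row lines up with a row of df
    return pd.DataFrame(samples.T, index=df['Name'])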

Related

How to move with range in data frame using python [duplicate]

I have a large pandas dataframe of time-series data.
I currently manipulate this dataframe to create a new, smaller dataframe that is the rolling average of every 10 rows, i.e. a rolling window technique, like this:
import numpy as np
import pandas as pd

def create_new_df(df):
    features = []
    x = df['X'].astype(float)
    i = x.index.values
    time_sequence = [i] * 10
    idx = np.array(time_sequence).T.flatten()[:len(x)]
    x = x.groupby(idx).mean()
    x.name = 'X'
    features.append(x)
    new_df = pd.concat(features, axis=1)
    return new_df
Code to test:
columns = ['X']
df_ = pd.DataFrame(columns=columns)
df_ = df_.fillna(0) # with 0s rather than NaNs
data = np.array([np.arange(20)]*1).T
df = pd.DataFrame(data, columns=columns)
test = create_new_df(df)
print test
Output:
X
0 4.5
1 14.5
However, I want the function to make the new dataframe using a sliding window with a 50% overlap
So the output would look like this:
X
0 4.5
1 9.5
2 14.5
How can I do this?
Here's what I've tried:
from itertools import tee, izip
def window(iterable, size):
    iters = tee(iterable, size)
    for i in xrange(1, size):
        for each in iters[i:]:
            next(each, None)
    return izip(*iters)

for each in window(df, 20):
    print list(each)  # doesn't have the desired sliding window effect
Some might also suggest using the pandas rolling_mean() method, but if so, I can't see how to use this function with window overlap.
Any help would be much appreciated.
I think pandas rolling techniques are fine here. Note that starting with version 0.18.0 of pandas, you would use rolling().mean() instead of rolling_mean().
>>> df=pd.DataFrame({ 'x':range(30) })
>>> df = df.rolling(10).mean() # version 0.18.0 syntax
>>> df[4::5] # take every 5th row
x
4 NaN
9 4.5
14 9.5
19 14.5
24 19.5
29 24.5
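The same idea generalizes: pick a window size, set the step to half of it for a 50% overlap, and keep only the rows where a full window has been averaged. A small sketch under those assumptions:
window, step = 10, 5  # step = window // 2 gives the 50% overlap
df = pd.DataFrame({'x': range(20)})
rolled = df.rolling(window).mean()
print(rolled[window - 1::step])  # keeps only the complete windows: 4.5, 9.5, 14.5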

Pandas select rows by index then append

import pandas as pd
import numpy as np
data_dir = 'data_r14.csv'
data = pd.read_csv(data_dir)
# print(data)
signals = data['signal']
value_counts = signals.value_counts()
buy_count = value_counts[1]
signals_code = [1, 2]
buy_sell_rows = data.loc[data['signal'].isin(signals_code)]
data_without_signals = data[~data['signal'].isin(signals_code)]
random_0_indexes = np.random.choice(data_without_signals.index.values, buy_count)
value_counts2 = data_without_signals['signal'].value_counts()
# print(value_counts2)
for index in random_0_indexes:
    row = data.loc[index, :]
    # df = row.to_frame()
    print(row)
    buy_sell_rows.append(row)
# print(buy_sell_rows)
# print(signals.loc[index, :])
# print(random_0_rows)
print(buy_sell_rows)
# print(buy_sell_rows['signal'].value_counts())
So I have a dataframe with a column named signal where the values are either 0, 1, or 2, and I want to balance them by having an equal number of rows for each value, because they are very unbalanced: I have only 1984 rows with a non-zero value and over 20000 rows with a zero value.
So I created a new dataframe where all the values are zeroes and called it data_without_signals, then I get a random list of indexes from it, and then I run a loop to fetch each of those rows and append it to another dataframe I created called buy_sell_rows, which holds only the non-zero rows. But the issue is that no row actually gets appended.
As said in my comment, I think your general approach could be simplified by randomly sampling the different signals:
# my test signal of 0s, 1s and 2s
test = pd.DataFrame({"data" : [0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2]})
# get the lowest size of any group, which is the size all groups should be reduced to
max_size = test.groupby("data")["data"].count().min()
# sample
output = (test
          .groupby(["data"])
          .agg(sample=("data", lambda x: x.sample(max_size).to_list()))
          .explode("sample")
          .reset_index(drop=True)
          )
and the output for this test is:
  sample
0      0
1      0
2      0
3      1
4      1
5      1
6      2
7      2
8      2
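If a newer pandas is available (GroupBy.sample was added in 1.1), the same balancing can be done without agg/explode. A sketch, assuming the same test frame as above:
# smallest group size, then draw that many rows from every signal group
min_size = test.groupby("data")["data"].count().min()
balanced = test.groupby("data").sample(n=min_size).reset_index(drop=True)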

suggestion on how to solve an infinite loop problem (python-pandas)

I have a data frame with 384 rows (and an additional dummy one at the beginning).
Each row has 4 variables I entered manually, 3 fields calculated from those 4 variables,
and 3 that compare each calculated variable to the previous row. Each field can have 1 of two values (basically True/False).
Final goal - I want to arrange the data frame so that each of the 64 possible combinations of the 6 calculated fields (2^6) occurs 6 times (2^6*6=384).
Each iteration builds a frequency table (pivot) and, if one of the counts differs from 6, it breaks and randomizes the order again.
The problem is that there are 384!-12*6! possible combinations, and my computer has been running the following script for over 4 days without finding a solution.
import pandas as pd
from numpy import random

# a function that calculates if a row is congruent or in-congruent
def set_cong(df):
    if df["left"] > df["right"] and df["left_size"] > df["right_size"] or df["left"] < df["right"] and df["left_size"] < df["right_size"]:
        return "Cong"
    else:
        return "InC"

# open file and calculate the basic fields
DF = pd.read_csv("generator.csv")
DF["distance"] = abs(DF.right - DF.left)
DF["CR"] = DF.left > DF.right
DF["Cong"] = DF.apply(set_cong, axis=1)

again = 1
# main loop to try and find optimal order
while again == 1:
    # make a copy of the DF to not have to load it each iteration
    df = DF.copy()
    again = 0
    df["rand"] = [[random.randint(low=1, high=100000)] for i in range(df.shape[0])]
    # as 3 of the fields are calculated based on the previous row, the first one is a dummy and when sorted needs to stay first
    df.rand.loc[0] = 0
    Sorted = df.sort_values(['rand'])
    Sorted["Cong_n1"] = Sorted.Cong.eq(Sorted.Cong.shift())
    Sorted["Side_n1"] = Sorted.CR.eq(Sorted.CR.shift())
    Sorted["Dist_n1"] = Sorted.distance.eq(Sorted.distance.shift())
    # here the dummy is deleted
    Sorted = Sorted.drop(0, axis=0)
    grouped = Sorted.groupby(['distance', 'CR', 'Cong', 'Cong_n1', 'Dist_n1', "Side_n1"])
    for name, group in grouped:
        if group.shape[0] != 6:
            again = 1
            break

Sorted.to_csv("Edos.csv", sep="\t", index=False)
print("bye")
the data frame looks like this:
left right size_left size_right distance cong CR distance_n1 cong_n1 side_n1
1 6 22 44 5 T F dummy dummy dummy
5 4 44 22 1 T T F T F
2 3 44 22 1 F F T F F

How to generalize this calculation with a pandas DataFrame to any number of columns?

I have a file with some data that looks like
1 2 3 4
2 3 4 5
3 4 5 6
4 5 6 7
I can process this data and do math on it just fine:
import sys
import numpy as np
import pandas as pd
def main():
    if(len(sys.argv) != 2):
        print "Takes one filename as argument"
        sys.exit()
    file_name = sys.argv[1]
    data = pd.read_csv(file_name, sep=" ", header=None)
    data.columns = ["timestep", "mux", "muy", "muz"]
    t = data["timestep"].count()
    c = np.zeros(t)
    for i in range(0, t):
        for j in range(0, i+1):
            c[i-j] += data["mux"][i-j] * data["mux"][i]
            c[i-j] += data["muy"][i-j] * data["muy"][i]
            c[i-j] += data["muz"][i-j] * data["muz"][i]
    for i in range(t):
        print c[i]/(t-i)
The expected result for my sample input above is
42.5
62.0
84.5
110.0
This math is finding the time correlation function for my data, which is the time-average of all permutations of the pairs of products in each column.
I would like to generalize this program to
work on n number of columns (in the i/j loop for example), and
be able to read in the column names from the file, so as to not have them hard-coded in
Which numpy or pandas methods can I use to accomplish this?
We can reduce it to one loop by using array slicing and the sum ufunc to operate along the rows of the dataframe, which in the process makes it generic enough to cover any number of columns, like so -
a = data.values
t = data["timestep"].count()
c = np.zeros(t)
for i in range(t):
    c[:i+1] += (a[:i+1,1:]*a[i,1:]).sum(axis=1)
Explanation
1) a[:i+1,1:] is the slice of all rows up to the i+1-th row and all columns starting from the second column, i.e. mux, muy and so on.
2) Similarly, for [i,1:], that's the i-th row and all columns from second column onwards.
To keep it "pandas-way", simply replace a[ with data.iloc[.
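Putting it together, here is a sketch of the fully generalized script. It assumes the data file now carries a header row naming its columns (so the names do not need to be hard-coded) and that everything after the first column is a component to include; the file path comes from the command line as in the original script.
import sys
import numpy as np
import pandas as pd

data = pd.read_csv(sys.argv[1], sep=" ")  # header row supplies the column names
a = data.values
t = len(data)
c = np.zeros(t)
for i in range(t):
    # multiply each earlier row's components with row i's components and sum
    c[:i+1] += (a[:i+1, 1:] * a[i, 1:]).sum(axis=1)
for i in range(t):
    print(c[i] / (t - i))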

Python equivalent of R's code

I posted a question along the same lines yesterday. This is a slightly modified version of it. previous question here.
I have 2 dataframes as follows:
data1 looks like this:
id address
1 11123451
2 78947591
data2 looks like the following:
lowerbound_address upperbound_address place
78392888 89000000 X
10000000 20000000 Y
I want to create another column in data1 called "place" which contains the place the id is from. There will be many ids coming from the same place. And some ids don't have a match.
The addresses here are float values.
What I am actually looking for in Python is an equivalent of this in R. It's easier to code the following in R. But I am unsure of how to code this in Python. Can someone help me with this?
data_place = rep(NA, nrow(data1))
for (i in 1:nrow(data1)) {
  tmp = as.character(data2[data1$address[i] >= data2$lowerbound_address & data1$address[i] <= data2$upperbound_address, "place"])
  if (length(tmp) == 1) {data_place[i] = tmp}
}
data1$place = data_place
Something like this would work.
import pandas as pd
import numpy as np
# The below section is only used to import data
from io import StringIO
data = """
id address
1 11123451
2 78947591
3 50000000
"""
data2 = """
lowerbound_address upperbound_address place
78392888 89000000 X
10000000 20000000 Y
"""
# The above section is only used to import data
df = pd.read_csv(StringIO(data), delimiter='\s+')
df2 = pd.read_csv(StringIO(data2), delimiter='\s+')
df['new'] = np.nan
df.loc[(df['address'] > df2['lowerbound_address'][0]) & (df['address'] < df2['upperbound_address'][0]), 'new'] = 'X'
df.loc[(df['address'] > df2['lowerbound_address'][1]) & (df['address'] < df2['upperbound_address'][1]), 'new'] = 'Y'
In addition to pandas, we used numpy for np.nan.
All I did was create a new column and assign NaN to it, then set it to either 'X' or 'Y' based on the upper and lower boundaries in the second dataframe (last two lines).
Final results:
id address new
0 1 11123451 Y
1 2 78947591 X
2 3 50000000 NaN
Do a merge_asof and then replace the place with NaN wherever the address falls outside its matched bounds.
data1.sort_values('address', inplace = True)
data2.sort_values('lowerbound_address', inplace=True)
data3 = pd.merge_asof(data1, data2, left_on='address', right_on='lowerbound_address')
data3['place'] = data3['place'].where(data3.address <= data3.upperbound_address)
data3 = data3.drop(['lowerbound_address', 'upperbound_address'], axis=1)
print(data3)
Output
id address place
0 1 11123451 Y
1 3 50000000 NaN
2 2 78947591 X
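Another option, shown here only as a sketch and assuming the bounds in data2 do not overlap, is to build an IntervalIndex from the bounds and look each address up in it:
import numpy as np
import pandas as pd

intervals = pd.IntervalIndex.from_arrays(data2['lowerbound_address'],
                                         data2['upperbound_address'],
                                         closed='both')
idx = intervals.get_indexer(data1['address'])  # -1 where no interval matches
data1['place'] = np.where(idx >= 0, data2['place'].values[idx], np.nan)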
