I would like to change the format of my output for the following code.
import pandas as pd

# Load the three tables; each is expected to carry the join key "xx".
x = pd.read_csv('x.csv')
y = pd.read_csv('y.csv')
z = pd.read_csv('z.csv')

# Left-join y, then z, onto x using the shared key.
# NOTE: the name `list` shadows the builtin; it is kept only because the
# discussion that follows refers to this variable by that name.
list = pd.merge(x, y, how='left', on=['xx'])
list = pd.merge(list, z, how='left', on=['xx'])

columns_to_keep = ['yy', 'zz', 'uu']
list = list.set_index(['xx'])
list = list[columns_to_keep]

# Ascending sort on the index. The old `by=` keyword no longer exists on
# sort_index, and every other argument that was passed was a default.
list = list.sort_index(axis=0, ascending=True)

# Given a path, to_csv opens and closes the file itself.
list.to_csv('write.csv', header=True, index=True, index_label='xx')
from this:
id date user_id user_name
1 8/13/2007 1 a1
2 1/8/2007 2 a2
2 1/8/2007 3 a3
3 12/14/2007 4 a4
4 3/6/2008 5 a5
4 4/14/2009 6 a6
4 5/30/2008 7 a7
4 5/30/2008 8 a8
5 6/17/2007 9 a9
to this:
id date user_id user_name
1 8/13/2007 1 a1
2 1/8/2007 2;3 a2;a3
3 12/14/2007 4 a4
4 3/6/2008 5;6;7;8 a5;a6;a7;a8
5 6/17/2007 9 a9
I think the following should work on the final DataFrame (`list`). I would suggest not using "list" as a variable name, though: it shadows Python's built-in `list`, which you may want to use somewhere else. So in my code I will use "df" instead of "list":
# Collapse all rows that share an index value into one row:
#   * "date" keeps the value from the first row of the group;
#   * every other column becomes the ";"-joined string of the group's values,
#     in their original row order.
aggregations = {
    col: 'first' if col == 'date' else ';'.join
    for col in df.columns
}
# Cast to str first so that str.join also accepts numeric columns.
finaldf = df.astype(str).groupby(level=0).agg(aggregations)
print(finaldf)
Related
I have this dataframe:
# Column data for the example: two label columns plus the sex of each row.
record = dict(
    F1=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'],
    F2=['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7'],
    Sex=['F', 'M', 'F', 'M', 'M', 'M', 'F'],
)
# Build the frame from the column mapping.
df = pd.DataFrame(data=record)
I would like to create for example 2 samples of this dataframe while keeping a fixed ratio of 50-50 on the Sex column.
I tried like this:
df_dict ={}
for i in range(2):
df_dict['df{}'.format(i)] = df.sample(frac=0.50, random_state=123)
But the output I get does not seem to match my expectation:
df_dict["df0"]
# Output:
F1 F2 Sex
1 x2 a2 M
3 x4 a4 M
4 x5 a5 M
0 x1 a1 F
Any help ?
Might not be the best idea, but I believe it might help you to solve your problem somehow:
n = 2
# Sample each sex separately: frac=0.5 draws half of the subgroup and
# iloc[:n] caps the draw at n rows.
fDf = df[df["Sex"] == "F"].sample(frac=0.5, random_state=123).iloc[:n]
mDf = df[df["Sex"] == "M"].sample(frac=0.5, random_state=123).iloc[:n]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two samples.
pd.concat([fDf, mDf])
Output
F1 F2 Sex
0 x1 a1 F
2 x3 a3 F
5 x6 a6 M
1 x2 a2 M
This should also work
n = 2
df.groupby('Sex', group_keys=False).apply(lambda x: x.sample(n))
Don't use `frac`, which gives you a fraction of each group; use `n`, which gives you a fixed number of rows per group:
df.groupby('Sex').sample(n=2)
example output:
F1 F2 Sex
2 x3 a3 F
0 x1 a1 F
3 x4 a4 M
4 x5 a5 M
using a custom ratio
ratios = {'F':0.4, 'M':0.6} # sum should be 1
# total number desired
total = 4
# note that the exact number in the output depends
# on the rounding method to convert to int
# round should give the correct number but floor/ceil might
# under/over-sample
# see below for an example
s = pd.Series(ratios)*total
# convert to integer (chose your method, ceil/floor/round...)
s = np.ceil(s).astype(int)
df.groupby('Sex').apply(lambda x: x.sample(n=s[x.name])).droplevel(0)
example output:
F1 F2 Sex
0 x1 a1 F
6 x7 a7 F
4 x5 a5 M
3 x4 a4 M
1 x2 a2 M
I need some help to modify my function and how to apply it in order to iterate an ifelse condition through multiple features.
Suppose we have the following table t1
import pandas as pd
# Column data for the example table t1.
names = {
    'name': ['Jon', 'Bill', 'Maria', 'Emma'],
    'feature1': [2, 3, 4, 5],
    'feature2': [1, 2, 3, 4],
    'feature3': [1, 2, 3, 4],
}
# Fix the column order explicitly when building the frame.
column_order = ['name', 'feature1', 'feature2', 'feature3']
t1 = pd.DataFrame(names, columns=column_order)
I want to create 3 new columns based on an ifelse condition. Here is how I am doing it for the first feature:
# Define the conditions
def ifelsefunction(row):
if row['feature1'] >=3:
return 1
elif row['feature1'] ==2:
return 2
else:
return 0
# Apply the condition
t1['ft1'] = t1.apply(ifelsefunction, axis=1)
I would like to write the function into something iterable like this
def ifelsefunction(row, feature):
if row[feature] >=3:
return 1
elif row[feature] ==2:
return 2
else:
return 0
t1['ft1_score'] = t1.apply(ifelsefunction(row, 'feature1'), axis=1)
t1['ft2_score'] = t1.apply(ifelsefunction(row, 'feature2'), axis=1)
t1['ft3_score'] = t1.apply(ifelsefunction(row, 'feature3'), axis=1)
---- EDIT ----
Thanks for the answers, I may have over-simplified the actual problem.
How do I do the same for this conditions?
def ifelsefunction(var1, var2):
mask1 = (var1 >=3) and (var1<var2)
mask2 = var1 == 2
return np.select([mask1,mask2], [var1*0.7, var1*var2], default=0)
I think it is best to avoid loops here. Use `numpy.select` to test the conditions, and assign the result only to the columns selected from the list. To pass the input DataFrame to the function, `DataFrame.pipe` is used:
# Define the conditions
def ifelsefunction(df):
    """Score every value of ``df`` element-wise.

    Returns a NumPy array with the same shape as ``df`` that holds 1 where
    the value is >= 3, 2 where the value equals 2, and 0 everywhere else.
    """
    m1 = df >= 3
    m2 = df == 2
    # np.select takes the first condition that matches for each element.
    return np.select([m1, m2], [1, 2], default=0)
cols = ['feature1','feature2','feature3']
t1[cols] = t1[cols].pipe(ifelsefunction)
#alternative
#t1[cols] = ifelsefunction(t1[cols])
print (t1)
name feature1 feature2 feature3
0 Jon 2 0 0
1 Bill 1 2 2
2 Maria 1 1 1
3 Emma 1 1 1
For new columns use:
# Define the conditions
def ifelsefunction(df):
    """Score every value of ``df`` element-wise.

    Returns a NumPy array with the same shape as ``df`` that holds 1 where
    the value is >= 3, 2 where the value equals 2, and 0 everywhere else.
    """
    m1 = df >= 3
    m2 = df == 2
    # np.select takes the first condition that matches for each element.
    return np.select([m1, m2], [1, 2], default=0)
cols = ['feature1','feature2','feature3']
new = [f'{x}_score' for x in cols]
t1[new] = t1[cols].pipe(ifelsefunction)
#alternative
#t1[new] = ifelsefunction(t1[cols])
print (t1)
name feature1 feature2 feature3 feature1_score feature2_score \
0 Jon 2 1 1 2 0
1 Bill 3 2 2 1 2
2 Maria 4 3 3 1 1
3 Emma 5 4 4 1 1
feature3_score
0 0
1 2
2 1
3 1
EDIT:
You can change function like:
def ifelsefunction(df, var1, var2):
    """Compute a row-wise score from two columns of ``df``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    var1, var2 : labels of the columns to compare.

    Returns
    -------
    NumPy array with one value per row:
    ``df[var1] * 0.7`` where ``df[var1] >= 3`` and ``df[var1] < df[var2]``;
    otherwise ``df[var1] * df[var2]`` where ``df[var1] == 2``; otherwise 0.
    """
    # Element-wise `&` is required here; `and` cannot combine two Series.
    mask1 = (df[var1] >= 3) & (df[var1] < df[var2])
    mask2 = df[var1] == 2
    return np.select(
        [mask1, mask2],
        [df[var1] * 0.7, df[var1] * df[var2]],
        default=0,
    )
t1['new'] = ifelsefunction(t1, 'feature3','feature1')
print (t1)
name feature1 feature2 feature3 new
0 Jon 2 1 1 0.0
1 Bill 3 2 2 6.0
2 Maria 4 3 3 2.1
3 Emma 5 4 4 2.8
Try using apply
t1["ft1_score"] = t1.feature1.apply(lambda x: 1 if x >= 3 else (2 if x == 2 else 0))
t1["ft2_score"] = t1.feature2.apply(lambda x: 1 if x >= 3 else (2 if x == 2 else 0))
t1["ft3_score"] = t1.feature3.apply(lambda x: 1 if x >= 3 else (2 if x == 2 else 0))
from itertools import product
import pandas as pd
df = pd.DataFrame.from_records(product(range(10), range(10)))
df = df.sample(90)
df.columns = "c1 c2".split()
df = df.sort_values(df.columns.tolist()).reset_index(drop=True)
# c1 c2
# 0 0 0
# 1 0 1
# 2 0 2
# 3 0 3
# 4 0 4
# .. .. ..
# 85 9 4
# 86 9 5
# 87 9 7
# 88 9 8
# 89 9 9
#
# [90 rows x 2 columns]
How do I quickly find, identify, and remove the last duplicate of all symmetric pairs in this data frame?
An example of symmetric pair is that '(0, 1)' is equal to '(1, 0)'. The latter should be removed.
The algorithm must be fast, so it is recommended to use numpy. Converting to python object is not allowed.
You can sort the values, then groupby:
a= np.sort(df.to_numpy(), axis=1)
df.groupby([a[:,0], a[:,1]], as_index=False, sort=False).first()
Option 2: If you have a lot of pairs c1, c2, groupby can be slow. In that case, we can assign new values and filter by drop_duplicates:
a= np.sort(df.to_numpy(), axis=1)
(df.assign(one=a[:,0], two=a[:,1]) # one and two can be changed
.drop_duplicates(['one','two']) # taken from above
.reindex(df.columns, axis=1)
)
One way is using np.unique with return_index=True and use the result to index the dataframe:
a = np.sort(df.values)
_, ix = np.unique(a, return_index=True, axis=0)
print(df.iloc[ix, :])
c1 c2
0 0 0
1 0 1
20 2 0
3 0 3
40 4 0
50 5 0
6 0 6
70 7 0
8 0 8
9 0 9
11 1 1
21 2 1
13 1 3
41 4 1
51 5 1
16 1 6
71 7 1
...
frozenset
mask = pd.Series(map(frozenset, zip(df.c1, df.c2))).duplicated()
df[~mask]
I will do
df[~pd.DataFrame(np.sort(df.values,1)).duplicated().values]
Using pandas `crosstab` and numpy `triu`:
s = pd.crosstab(df.c1, df.c2)
# True on and above the main diagonal of the c1-by-c2 count table.
# (The np.bool alias was removed in NumPy 1.24; the builtin bool replaces it.)
upper = np.triu(np.ones(s.shape)).astype(bool)
# `&` binds tighter than `==`, so the mask was always evaluated as
# `(upper & s) == 0`; the parentheses only make that explicit.
# NOTE(review): a pair that occurs only below the diagonal is masked as well —
# confirm that this is the intended result.
s = s.mask((upper & s) == 0).stack().reset_index()
Here's one NumPy based one for integers -
def remove_symm_pairs(df):
    """Drop rows whose unordered value pair already occurred in an earlier row.

    Works on integer data only. ``(0, 1)`` and ``(1, 0)`` count as the same
    pair; the first occurrence is kept and the surviving rows stay in their
    original order.

    Returns a new DataFrame built from the surviving rows' values, so it has
    default column labels and a default index.
    """
    a = df.to_numpy(copy=False)
    if a.shape[0] == 0:
        # Nothing to deduplicate; the min/max reductions below would raise.
        return pd.DataFrame(a)
    # Sorting inside each row makes symmetric pairs identical.
    b = np.sort(a, axis=1)
    # Shift each column to start at 0: ravel_multi_index rejects negatives.
    b = b - b.min(axis=0)
    # Encode every sorted pair as one integer key.
    idx = np.ravel_multi_index(b.T, tuple(b.max(axis=0) + 1))
    # A stable sort keeps equal keys in original row order, so the first
    # element of each run of equal keys is the earliest occurrence.
    sidx = idx.argsort(kind='mergesort')
    p = idx[sidx]
    m = np.r_[True, p[:-1] != p[1:]]
    a_out = a[np.sort(sidx[m])]
    df_out = pd.DataFrame(a_out)
    return df_out
If you want to keep the index data as it is, use return df.iloc[np.sort(sidx[m])].
For generic numbers (ints/floats, etc.), we will use a view-based one -
# https://stackoverflow.com/a/44999009/ #Divakar
def view1D(a):  # a is array
    """View each row of the 2-D array ``a`` as a single ``np.void`` element.

    Returns a 1-D array with one element per row of ``a``; each element is
    ``a.dtype.itemsize * a.shape[1]`` bytes wide, i.e. the raw bytes of
    that row.
    """
    # The reinterpreting view needs each row's bytes to be contiguous.
    a = np.ascontiguousarray(a)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel()
and simply replace the step to get idx with idx = view1D(b) in remove_symm_pairs.
If this needs to be fast, and if your variables are integer, then the following trick may help: let v,w be the columns of your vector; construct [v+w, np.abs(v-w)] =: [x, y]; then sort this matrix lexicographically, remove duplicates, and finally map it back to [v, w] = [(x+y), (x-y)]/2.
I have set the outcome variable y as a column in a csv. It loads properly and works when I print just y, but when I use y = y[x:] I start getting NaN as values.
y = previous_games_stats['Unnamed: 7'] #outcome variable (win/loss)
y = y[9:] #causes NaN for outcome variables
Then later in the file I print the outcome column. final_df is a dataframe which does not yet have the outcome variable set, so I set it below:
final_df['outcome'] = y
print(final_df['outcome'])
But the outcome is:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
8 NaN
9 L
It looks like the last value is correct (they should all be 'W' or 'L').
How can I line up my data frames properly so I do not get NaN?
Entire Code:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
np.random.seed(0)
from array import array
iris=load_iris()
previous_games_stats = pd.read_csv('stats/2016-2017 CANUCKS STATS.csv', header=1)
numGamesToLookBack = 10;
axis=1) #Predictor variables
X = previous_games_stats[['GF', 'GA']]
count = 0
final_df = pd.DataFrame(columns=['GF', 'GA'])
#final_y = pd.DataFrame(columns=['Unnamed: 7'])
y = previous_games_stats['Unnamed: 7'] #outcome variable (win/loss)
y = y[numGamesToLookBack-1:]
for game in range(0, 10):
X = previous_games_stats[['GF', 'GA']]
X = X[count:numGamesToLookBack] #num games to look back
stats_feature_names = list(X.columns.values)
df = pd.DataFrame(iris.data, columns=iris.feature_names)
stats_df = pd.DataFrame(X, columns=stats_feature_names).sum().to_frame().T
final_df = final_df.append(stats_df, ignore_index=True)
count+=1
numGamesToLookBack+=1
print("final_df:\n", final_df)
stats_target_names = np.array(['Win', 'Loss']) #don't need?...just a label it looks like
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
final_df['outcome'] = y
final_df['outcome'].update(y) #ADDED UPDATE TO FIX NaN
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75 #for iris
final_df['is_train'] = np.random.uniform(0, 1, len(final_df)) <= .65
train, test = df[df['is_train']==True], df[df['is_train']==False]
stats_train = final_df[final_df['is_train']==True]
stats_test = final_df[final_df['is_train']==False]
features = df.columns[:4]
stats_features = final_df.columns[:2]
y = pd.factorize(train['species'])[0]
stats_y = pd.factorize(stats_train['outcome'])[0]
clf = RandomForestClassifier(n_jobs=2, random_state=0)
stats_clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(train[features], y)
stats_clf.fit(stats_train[stats_features], stats_y)
stats_clf.predict_proba(stats_test[stats_features])[0:10]
preds = iris.target_names[clf.predict(test[features])]
stats_preds = stats_target_names[stats_clf.predict(stats_test[stats_features])]
pd.crosstab(stats_test['outcome'], stats_preds, rownames=['Actual Outcome'], colnames=['Predicted Outcome'])
print("~~~confusion matrix~~~\nColumns represent what we predicted for the outcome of the game, and rows represent the actual outcome of the game.\n")
print(pd.crosstab(stats_test['outcome'], stats_preds, rownames=['Actual Outcome'], colnames=['Predicted Outcome']))
This is expected: `y` has no entries for the first 9 index labels, so when it is assigned back, those rows get NaN.
If the column is new and `y` has the same length as the DataFrame, assign the NumPy array instead, so that the indices are not aligned:
final_df['outcome'] = y.values
If the lengths differ, it is a bit more complicated, because the assigned values must have the same length as the DataFrame:
df = pd.DataFrame({'a':range(10), 'b':range(20,30)}).astype(str).radd('a')
print (df)
a b
0 a0 a20
1 a1 a21
2 a2 a22
3 a3 a23
4 a4 a24
5 a5 a25
6 a6 a26
7 a7 a27
8 a8 a28
9 a9 a29
y = df['a']
y = y[4:]
print (y)
4 a4
5 a5
6 a6
7 a7
8 a8
9 a9
Name: a, dtype: object
len(final_df) < len(y):
Truncate `y` to the length of `final_df`, then convert it to a NumPy array so that the indices are not aligned:
final_df = pd.DataFrame({'new':range(100, 105)})
final_df['s'] = y.iloc[:len(final_df)].values
print (final_df)
new s
0 100 a4
1 101 a5
2 102 a6
3 103 a7
4 104 a8
len(final_df) > len(y):
Create a new Series indexed by the first `len(y)` index values of the DataFrame:
final_df1 = pd.DataFrame({'new':range(100, 110)})
final_df1['s'] = pd.Series(y.values, index=final_df1.index[:len(y)])
print (final_df1)
new s
0 100 a4
1 101 a5
2 102 a6
3 103 a7
4 104 a8
5 105 a9
6 106 NaN
7 107 NaN
8 108 NaN
9 109 NaN
I just recently made the switch from R to python and have been having some trouble getting used to data frames again as opposed to using R's data.table. The problem I've been having is that I'd like to take a list of strings, check for a value, then sum the count of that string- broken down by user. So I would like to take this data:
A_id B C
1: a1 "up" 100
2: a2 "down" 102
3: a3 "up" 100
3: a3 "up" 250
4: a4 "left" 100
5: a5 "right" 102
And return:
A_id_grouped sum_up sum_down ... over_200_up
1: a1 1 0 ... 0
2: a2 0 1 0
3: a3 2 0 ... 1
4: a4 0 0 0
5: a5 0 0 ... 0
Before I did it with the R code (using data.table)
>DT[ ,list(A_id_grouped, sum_up = sum(B == "up"),
+ sum_down = sum(B == "down"),
+ ...,
+ over_200_up = sum(up == "up" & < 200), by=list(A)];
However all of my recent attempts with Python have failed me:
DT.agg({"D": [np.sum(DT[DT["B"]=="up"]),np.sum(DT[DT["B"]=="up"])], ...
"C": np.sum(DT[(DT["B"]=="up") & (DT["C"]>200)])
})
Thank you in advance! it seems like a simple question however I couldn't find it anywhere.
To complement unutbu's answer, here's an approach using apply on the groupby object.
>>> df.groupby('A_id').apply(lambda x: pd.Series(dict(
sum_up=(x.B == 'up').sum(),
sum_down=(x.B == 'down').sum(),
over_200_up=((x.B == 'up') & (x.C > 200)).sum()
)))
over_200_up sum_down sum_up
A_id
a1 0 0 1
a2 0 1 0
a3 1 0 2
a4 0 0 0
a5 0 0 0
There might be a better way; I'm pretty new to pandas, but this works:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A_id': 'a1 a2 a3 a3 a4 a5'.split(),
                   'B': 'up down up up left right'.split(),
                   'C': [100, 102, 100, 250, 100, 102]})
# Flag the rows that are "up" with C above 200 so they can be summed per group.
df['D'] = (df['B'] == 'up') & (df['C'] > 200)
grouped = df.groupby(['A_id'])


def sum_up(grp):
    """Count the entries equal to 'up' in one group's values."""
    return np.sum(grp == 'up')


def sum_down(grp):
    """Count the entries equal to 'down' in one group's values."""
    return np.sum(grp == 'down')


def over_200_up(grp):
    """Sum one group's boolean flags, i.e. count the True entries."""
    return np.sum(grp)


result = grouped.agg({'B': [sum_up, sum_down],
                      'D': [over_200_up]})
# Keep only the function name from each (column, function) label.
result.columns = [col[1] for col in result.columns]
print(result)
yields
sum_up sum_down over_200_up
A_id
a1 1 0 0
a2 0 1 0
a3 2 0 1
a4 0 0 0
a5 0 0 0
An old question; I feel a better way, and avoiding the apply, would be to create a new dataframe, before grouping and aggregating:
df = df.set_index('A_id')
outcome = {'sum_up' : df.B.eq('up'),
'sum_down': df.B.eq('down'),
'over_200_up' : df.B.eq('up') & df.C.gt(200)}
outcome = pd.DataFrame(outcome).groupby(level=0).sum()
outcome
sum_up sum_down over_200_up
A_id
a1 1 0 0
a2 0 1 0
a3 2 0 1
a4 0 0 0
a5 0 0 0
Another option would be to unstack before grouping; however, I feel it is a longer, unnecessary process:
(df
.set_index(['A_id', 'B'], append = True)
.C
.unstack('B')
.assign(gt_200 = lambda df: df.up.gt(200))
.groupby(level='A_id')
.agg(sum_up=('up', 'count'),
sum_down =('down', 'count'),
over_200_up = ('gt_200', 'sum')
)
)
sum_up sum_down over_200_up
A_id
a1 1 0 0
a2 0 1 0
a3 2 0 1
a4 0 0 0
a5 0 0 0
Here is what I recently learned, using `DataFrame.assign` and NumPy's `where` function:
df3=
A_id B C
1: a1 "up" 100
2: a2 "down" 102
3: a3 "up" 100
3: a3 "up" 250
4: a4 "left" 100
5: a5 "right" 102
# Add one 0/1 indicator column per condition, then total them per A_id.
df3.assign(
    sum_up=np.where(df3['B'] == 'up', 1, 0),
    sum_down=np.where(df3['B'] == 'down', 1, 0),
    over_200_up=np.where((df3['B'] == 'up') & (df3['C'] > 200), 1, 0),
).groupby('A_id', as_index=False).agg(
    # The string 'sum' selects the groupby sum directly; passing the builtin
    # `sum` is deprecated in pandas 2.1+.
    {'sum_up': 'sum', 'sum_down': 'sum', 'over_200_up': 'sum'}
)
outcome=
A_id sum_up sum_down over_200_up
0 a1 1 0 0
1 a2 0 1 0
2 a3 2 0 1
3 a4 0 0 0
4 a5 0 0 0
This resembles the SQL `CASE` expression, if you are familiar with it and want to apply the same logic in pandas:
select a,
sum(case when B='up' then 1 else 0 end) as sum_up
....
from table
group by a