I am trying to map the zip codes of a given data frame to the regions provided by a second data frame.
The regions are defined by ranges of integers (for example, the range 1000-1299 is Noord-Holland, 1300-1379 is Flevoland, and so on). The data looks like this:
df1
zip_code state_name
0 7514 None
1 7891 None
2 2681 None
3 7606 None
4 5051 None
5 2611 None
6 4341 None
7 1851 None
8 1861 None
9 2715 None
df2
zpcd1 zpcd2 region
0 1000 1299 Noord-Holland
1 1300 1379 Flevoland
2 1380 1384 Noord-Holland
3 1390 1393 Utrecht
4 1394 1494 Noord-Holland
5 1396 1496 Utrecht
6 1398 1425 Noord-Holland
7 1426 1427 Utrecht
8 1428 1429 Zuid-Holland
9 1430 2158 Noord-Holland
The duplicate regions are fine, because one region can have several zip code ranges.
The question is: How do I map the zip code values in df1 to the ranges defined in df2 in order to assign the region name to that row?
I tried
def region_map(row):
    global df2
    if row['zip_code'] in range(nlreg.zpcd1, nlreg.zpcd2, 1):
        return df2.region

df1['state_name'] = df1.apply(lambda row: region_map(row))
but it returns a KeyError: 'zip_code'.
Thank you in advance
EDIT
I got the result that I was searching for using
df2['zip_c_range'] = list(zip(df2.zpcd1, df2.zpcd2))
for i, v in tqdm(df1.zip_code.items()):
    for x, z in df2.zip_c_range.items():
        if v in range(*z):
            df1['state_name'][i] = df2.region[x]
but I am sure that there is a better solution using lambda.
I think what you're trying to do is the following (nlreg being df2):
def region_map(zc):
    match = nlreg.loc[(nlreg['zpcd1'] <= zc) & (zc <= nlreg['zpcd2']), 'region']
    # return the first matching region name, or None if no range contains zc
    return match.iloc[0] if not match.empty else None

df1['state_name'] = df1['zip_code'].apply(lambda z: region_map(z))
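As an aside, the original apply raised KeyError: 'zip_code' because DataFrame.apply defaults to axis=0, so the function receives whole columns rather than rows; passing axis=1 avoids that. For larger frames, a vectorized sketch is possible (assuming pandas 1.2+ for how='cross'; it also tolerates the overlapping ranges in the sample df2 by keeping the first match per zip code):
import pandas as pd

# Conditional join: pair every zip code with every range, keep the pairs where
# the zip code falls inside the range, then take the first match per zip code.
pairs = df1[['zip_code']].merge(df2, how='cross')
matched = pairs[pairs['zip_code'].between(pairs['zpcd1'], pairs['zpcd2'])]
matched = matched.drop_duplicates(subset='zip_code')

# Attach the matched region as state_name; unmatched zip codes stay NaN.
df1 = df1.drop(columns='state_name').merge(
    matched[['zip_code', 'region']].rename(columns={'region': 'state_name'}),
    on='zip_code', how='left')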
I have a dataframe like the one shown below
id,Name,country,amount,qty
1,ABC,USA,123,4500
1,ABC,USA,156,3210
1,BCE,USA,687,2137
1,DEF,UK,456,1236
1,ABC,nan,216,324
1,DEF,nan,12678,11241
1,nan,nan,637,213
1,BCE,nan,213,543
1,XYZ,KOREA,432,321
1,XYZ,AUS,231,321
sf = pd.read_clipboard(sep=',')
I would like to do the below
a) Get the top 3 based on amount for each id and other selected columns such as Name and country. Meaning, we first get the top 3 based on id and Name, and later we again get the top 3 based on id and country.
b) Find out how much each of the top 3 items contributes to the total amount for each unique id.
So, I tried the below
sf_name = sf.groupby(['id','Name'],dropna=False)['amount'].sum().nlargest(3).reset_index().rename(columns={'amount':'Name_amount'})
sf_country = sf.groupby(['id','country'],dropna=False)['amount'].sum().nlargest(3).reset_index().rename(columns={'amount':'country_amount'})
sf_name['total'] = sf.groupby('id')['amount'].sum()
sf_country['total'] = sf.groupby('id')['amount'].sum()
sf_name['name_pct_total'] = (sf_name['Name_amount']/sf_name['total'])*100
sf_country['country_pct_total'] = (sf_country['country_amount']/sf_country['total'])*100
As you can see, I am repeating the same operation for each column.
But in my real dataframe, I have to do this group by id, find the top 3, and compute the pct_total % for another 8 columns (along with Name and country).
Is there any efficient, elegant and scalable solution that you can share?
I expect my output to be like the one below.
Update - full error
KeyError Traceback (most recent call last)
C:\Users\Test\AppData\Local\Temp/ipykernel_8720/1850446854.py in <module>
----> 1 df_new.groupby(['unique_key','Resale Customer'],dropna=False)['Revenue Resale EUR'].sum().nlargest(3).reset_index(level=1, name=f'{c}_revenue')
~\Anaconda3\lib\site-packages\pandas\core\series.py in nlargest(self, n, keep)
3834 dtype: int64
3835 """
-> 3836 return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()
3837
3838 def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
~\Anaconda3\lib\site-packages\pandas\core\algorithms.py in nlargest(self)
1135 @final
1136 def nlargest(self):
-> 1137 return self.compute("nlargest")
1138
1139 @final
~\Anaconda3\lib\site-packages\pandas\core\algorithms.py in compute(self, method)
1181
1182 dropped = self.obj.dropna()
-> 1183 nan_index = self.obj.drop(dropped.index)
1184
1185 if is_extension_array_dtype(dropped.dtype):
~\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\Anaconda3\lib\site-packages\pandas\core\series.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4769 dtype: float64
4770 """
-> 4771 return super().drop(
4772 labels=labels,
4773 axis=axis,
~\Anaconda3\lib\site-packages\pandas\core\generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4277 for axis, labels in axes.items():
4278 if labels is not None:
-> 4279 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4280
4281 if inplace:
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _drop_axis(self, labels, axis, level, errors, consolidate, only_slice)
4321 new_axis = axis.drop(labels, level=level, errors=errors)
4322 else:
-> 4323 new_axis = axis.drop(labels, errors=errors)
4324 indexer = axis.get_indexer(new_axis)
4325
~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in drop(self, codes, level, errors)
2234 for level_codes in codes:
2235 try:
-> 2236 loc = self.get_loc(level_codes)
2237 # get_loc returns either an integer, a slice, or a boolean
2238 # mask
~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in get_loc(self, key, method)
2880 if keylen == self.nlevels and self.is_unique:
2881 try:
-> 2882 return self._engine.get_loc(key)
2883 except TypeError:
2884 # e.g. test_partial_slicing_with_multiindex partial string slicing
~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.BaseMultiIndexCodesEngine.get_loc()
~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.UInt64HashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.UInt64HashTable.get_item()
KeyError: 8937472
The simplest approach is to loop over the column names in a list; for the percentage, use GroupBy.transform to get the sum per id and divide the amount column by it:
dfs = []
cols = ['Name','country']
for c in cols:
    df = (sf.groupby(['id',c],dropna=False)['amount'].sum()
            .nlargest(3)
            .reset_index(level=1, name=f'{c}_amount'))
    df[f'{c}_pct_total'] = (df[f'{c}_amount'].div(df.groupby('id',dropna=False)[f'{c}_amount']
                                                    .transform('sum'))*100)
    dfs.append(df)

df = pd.concat(dfs, axis=1)
print (df)
Name Name_amount Name_pct_total country country_amount \
id
1 DEF 13134 89.365177 NaN 13744
1 BCE 900 6.123699 USA 966
1 XYZ 663 4.511125 UK 456
country_pct_total
id
1 90.623764
1 6.369511
1 3.006726
Testing with the Resale Customer column name:
print (sf)
id Resale Customer country amount qty
0 1 ABC USA 123 4500
1 1 ABC USA 156 3210
2 1 BCE USA 687 2137
3 1 DEF UK 456 1236
4 1 ABC NaN 216 324
5 1 DEF NaN 12678 11241
6 1 NaN NaN 637 213
7 1 BCE NaN 213 543
8 1 XYZ KOREA 432 321
9 1 XYZ AUS 231 321
Test columns names:
print (sf.columns)
Index(['id', 'Resale Customer', 'country', 'amount', 'qty'], dtype='object')
dfs = []
cols = ['Resale Customer','country']
for c in cols:
    df = (sf.groupby(['id',c],dropna=False)['amount'].sum()
            .nlargest(3)
            .reset_index(level=1, name=f'{c}_amount'))
    df[f'{c}_pct_total'] = (df[f'{c}_amount'].div(df.groupby('id',dropna=False)[f'{c}_amount']
                                                    .transform('sum'))*100)
    dfs.append(df)

df = pd.concat(dfs, axis=1)
print (df)
Resale Customer Resale Customer_amount Resale Customer_pct_total country \
id
1 DEF 13134 89.365177 NaN
1 BCE 900 6.123699 USA
1 XYZ 663 4.511125 UK
country_amount country_pct_total
id
1 13744 90.623764
1 966 6.369511
1 456 3.006726
A solution with melt is possible, but it is more complicated:
df = sf.melt(id_vars=['id', 'amount'], value_vars=['Name','country'])
df = (df.groupby(['id','variable', 'value'],dropna=False)['amount']
        .sum()
        .sort_values(ascending=False)
        .groupby(level=[0,1],dropna=False)
        .head(3)
        .to_frame()
        .assign(pct_total=lambda x: x['amount'].div(x.groupby(level=[0,1],dropna=False)['amount'].transform('sum')).mul(100),
                g=lambda x: x.groupby(level=[0,1],dropna=False).cumcount())
        .set_index('g', append=True)
        .reset_index('value')
        .unstack(1)
        .sort_index(level=1, axis=1)
        .droplevel(1)
      )
df.columns = df.columns.map(lambda x: f'{x[1]}_{x[0]}')
print (df)
Name_amount Name_pct_total Name_value country_amount country_pct_total \
id
1 13134 89.365177 DEF 13744 90.623764
1 900 6.123699 BCE 966 6.369511
1 663 4.511125 XYZ 456 3.006726
country_value
id
1 NaN
1 USA
1 UK
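Note that in both solutions the percentage is taken against the sum of the three kept values per id; if it should instead be relative to the full per-id total (as in the question's own attempt with sf.groupby('id')['amount'].sum()), divide by a total computed on the original frame - a sketch based on the same loop:
dfs = []
total = sf.groupby('id')['amount'].sum()
for c in ['Name', 'country']:
    df = (sf.groupby(['id', c], dropna=False)['amount'].sum()
            .nlargest(3)
            .reset_index(level=1, name=f'{c}_amount'))
    # divide by the per-id total over all rows, not just the top 3
    df[f'{c}_pct_total'] = df[f'{c}_amount'].div(total).mul(100)
    dfs.append(df)

df = pd.concat(dfs, axis=1)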
df1.fillna(0)
Montant vente Marge
0 778283.75 13.63598
1 312271.20 9.26949
2 163214.65 14.50288
3 191000.20 9.55818
4 275970.00 12.76534
... ... ...
408 2999.80 14.60610
409 390.00 0.00000
410 699.00 26.67334
411 625.00 30.24571
412 0.00 24.79797
x = df1.iloc[:,1:3] # first for rows and second for columns
x
Marge
0 13.63598
1 9.26949
2 14.50288
3 9.55818
4 12.76534
... ...
408 14.60610
409 NaN
410 26.67334
411 30.24571
412 24.79797
413 rows × 1 columns
Why does line 409 have a 0.00000 value first, and then, after iloc, NaN?
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
You should learn which functions mutate the data frame and which don't. For example, fillna does not mutate the dataframe by default; it returns a new one. Either assign the result back:
df1 = df1.fillna(0)
or
df1.fillna(0, inplace=True)
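A minimal sketch of the difference, using a small hypothetical frame built from the values in rows 409-410 above:
import numpy as np
import pandas as pd

df1 = pd.DataFrame({'Montant vente': [390.0, 699.0], 'Marge': [np.nan, 26.67334]})

df1.fillna(0)                       # returns a new frame, result is discarded
print(df1['Marge'].isna().any())    # True - df1 still contains NaN

df1 = df1.fillna(0)                 # assign the result back
print(df1['Marge'].isna().any())    # False - the NaN is now replaced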
I am trying to sort one column after another. In doing so, I deliberately create duplicates.
This is what my df looks like at the moment:
ticket magic Id
0 193454845 1311 1313
1 193454846 1927 1311
2 193454847 1810 1927
3 193454852 1313 NaN
What I want:
ticket magic Id
0 193454845 1311 1311
1 193454846 1927 1927
2 193454847 1810 NaN
3 193454852 1313 1313
The column "magic" and "Id" should be identical if no NaN.
Does anyone have an idea?
Thank you very much!
I guess the value in the id column for magic = 1313 should be NaN instead.
data['id'] = np.where(data['Id'].isna(), np.nan, data['magic'])
Update:
data.merge(data[['Id']], left_on='magic', right_on='Id', how='left',suffixes=['_x','']).drop(columns='Id_x')
ticket magic Id
0 193454845 1311 1311.0
1 193454846 1927 1927.0
2 193454847 1810 NaN
3 193454852 1313 1313.0
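An equivalent sketch without the merge (assuming, as above, that 'Id' should hold the 'magic' value whenever that value occurs anywhere in the 'Id' column, and NaN otherwise):
import numpy as np

# Id = magic where the magic value exists somewhere in the Id column, else NaN
data['Id'] = np.where(data['magic'].isin(data['Id']), data['magic'], np.nan)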
Context
So I'm iterating through a bunch of files where each file is a subject, and in each file there are 3 columns, each representing the x,y,z axis at a certain point (the lengths across files are not the same). I want to put all of them into a multi-index PD df.
What I've tried
I found this post and when I do it, it seems to work
d_ = dict()
DATA_ROOT = "../sample_data/chest_mounted/"
cutoff_min = 0

for fileName in os.listdir(DATA_ROOT):
    if ".csv" in fileName and '.swp' not in fileName:
        with open(DATA_ROOT + fileName) as f:
            data = np.asarray(list(map(lambda x: x.strip().split(",")[1:-1], f.readlines())), dtype=np.int)
            subj_key = "Subject_" + str(fileName.split(".")[0])
            d_[subj_key] = pd.DataFrame(data, columns=['x_acc', 'y_acc', 'z_acc'])

df = pd.concat(d_.values(), keys=d_.keys())
When I do df.head() it looks exactly like what I want (I think?)
x_acc y_acc z_acc
Subject_1 0 1502 2215 2153
1 1667 2072 2047
2 1611 1957 1906
3 1601 1939 1831
4 1643 1965 1879
The Problem
However, when I try to index by Subject_x I get an error. Instead, I have to first do something like
df["x_acc"]["Subject_1"]
where I access x_acc first and then Subject_1.
Questions
1) I had the impression that I was creating a multi-index, but trying df["x_acc"]["Subject_1"] suggests that is not the case. How do I transform it into one?
2) Is there any way to change the index so that I access by Subject first?
Use loc for selection - first by the MultiIndex level and then by column name - or xs, which is implemented for simple selections:
df = df.loc['Subject_1', 'x_acc']
print (df)
0 1502
1 1667
2 1611
3 1601
4 1643
Name: x_acc, dtype: int64
df = df.xs('Subject_1')
print (df)
x_acc y_acc z_acc
0 1502 2215 2153
1 1667 2072 2047
2 1611 1957 1906
3 1601 1939 1831
4 1643 1965 1879
And for more complicated selections use slicers:
idx = pd.IndexSlice
df = df.loc['Subject_1', idx['x_acc','y_acc']]
print (df)
x_acc y_acc
0 1502 2215
1 1667 2072
2 1611 1957
3 1601 1939
4 1643 1965
Also, it seems your code can be simplified with read_csv:
d_ = dict()
DATA_ROOT = "../sample_data/chest_mounted/"
cutoff_min = 0

for fileName in os.listdir(DATA_ROOT):
    if ".csv" in fileName and '.swp' not in fileName:
        subj_key = "Subject_" + str(fileName.split(".")[0])
        # read from the full path; if the files have extra leading/trailing
        # columns (as the [1:-1] slice above suggests), usecols may be needed
        d_[subj_key] = pd.read_csv(DATA_ROOT + fileName, names=['x_acc', 'y_acc', 'z_acc'])

df = pd.concat(d_)
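As a side note, if selecting a whole subject with plain bracket indexing is ever preferred over loc, a related sketch: concatenating the same dict along the columns puts the subject on the outer column level (shorter recordings are padded with NaN, since the files have different lengths):
wide = pd.concat(d_, axis=1)        # columns become a (subject, axis) MultiIndex
print(wide['Subject_1'].head())     # x_acc, y_acc, z_acc for that subject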