Pandas KeyError when working on split data frame - python

I want to perform some operations on a pandas data frame that is split into chunks. After splitting the data frame, I iterate over the chunks, but after the first iteration runs fine, I get an error (see below). I have gone through some similar questions (1 and 2), but they don't quite address my issue. Kindly help me resolve this, as I don't fully understand it.
import pandas as pd

tupList = [('Eisenstadt', 'Paris', '1', '2'), ('London', 'Berlin', '1', '3'), ('Berlin', 'stuttgat', '1', '4'),
           ('Liverpool', 'Southampton', '1', '5'), ('Tirana', 'Blackpool', '1', '6'), ('blackpool', 'tirana', '1', '7'),
           ('Paris', 'Lyon', '1', '8'), ('Manchester', 'Nice', '1', '10'), ('Orleans', 'Madrid', '1', '12'),
           ('Lisbon', 'Stockholm', '1', '12')]
cities = pd.DataFrame(tupList, columns=['Origin', 'Destination', 'O_Code', 'D_code'])

# purpose - splits the DataFrame into chunks of max size chunkSize (the last one is smaller)
def splitDataFrameIntoSmaller(df, chunkSize=3):
    listOfDf = list()
    numberChunks = len(df) // chunkSize + 1
    for i in range(numberChunks):
        listOfDf.append(df[i*chunkSize:(i+1)*chunkSize])
    return listOfDf

citiesChunks = splitDataFrameIntoSmaller(cities)
for ind, cc in enumerate(citiesChunks):
    cc["distance"] = 0
    cc["time"] = 0
    for i in xrange(len(cc)):
        al = cc['Origin'][i]
        bl = cc['Destination'][i]
        '...'  # truncating to make it readable
    cc.to_csv('out.csv', sep=',', encoding='utf-8')
Traceback (most recent call last):
File ..., line 39, in <module>
al = cc['Origin'][i]
File ..., line 603, in __getitem__
result = self.index.get_value(self, key)
File ..., line 2169, in get_value
tz=getattr(series.dtype, 'tz', None))
File "pandas\index.pyx", line 98, in pandas.index.IndexEngine.get_value (pandas\index.c:3557)
File "pandas\index.pyx", line 106, in pandas.index.IndexEngine.get_value (pandas\index.c:3240)
File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:4279)
File "pandas\src\hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:8564)
File "pandas\src\hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:8508)
KeyError: 0L
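The KeyError arises because slicing a DataFrame keeps the original index labels, and cc['Origin'][i] is a label-based lookup: the second chunk carries the labels 3, 4, 5, so label 0 does not exist in it. A quick demonstration with the frame above:
chunk = cities[3:6]
print (chunk.index.tolist())
[3, 4, 5]
print (chunk['Origin'].iloc[0])  # positional access works
Liverpool
#print (chunk['Origin'][0])      # label access raises KeyError: 0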

You can first floor-divide the index values and then build the chunks with a list comprehension: loop over the unique index values, select with loc, and finally reset_index to remove the duplicated index:
cities.index = cities.index // 3
print (cities)
       Origin  Destination O_Code D_code
0  Eisenstadt        Paris      1      2
0      London       Berlin      1      3
0      Berlin     stuttgat      1      4
1   Liverpool  Southampton      1      5
1      Tirana    Blackpool      1      6
1   blackpool       tirana      1      7
2       Paris         Lyon      1      8
2  Manchester         Nice      1     10
2     Orleans       Madrid      1     12
3      Lisbon    Stockholm      1     12
citiesChunks = [cities.loc[[x]].reset_index(drop=True) for x in cities.index.unique()]
#print (citiesChunks)
print (citiesChunks[0])
       Origin Destination O_Code D_code
0  Eisenstadt       Paris      1      2
1      London      Berlin      1      3
2      Berlin    stuttgat      1      4
Finally, use iterrows if you need to loop over the DataFrame:
#write columns to file first
cols = ['Origin', 'Destination', 'O_Code', 'D_code', 'distance', 'time']
df = pd.DataFrame(columns=cols)
df.to_csv('out.csv', encoding='utf-8', index=False)

for ind, cc in enumerate(citiesChunks):
    cc["distance"] = 0
    cc["time"] = 0
    for i, val in cc.iterrows():
        al = cc.loc[i, 'Origin']
        bl = cc.loc[i, 'Destination']
        '...'  # truncating to make it readable
    cc.to_csv('out.csv', encoding='utf-8', mode='a', header=None, index=False)
    print (cc.to_csv(encoding='utf-8'))
,Origin,Destination,O_Code,D_code,distance,time
0,Eisenstadt,Paris,1,2,0,0
1,London,Berlin,1,3,0,0
2,Berlin,stuttgat,1,4,0,0
,Origin,Destination,O_Code,D_code,distance,time
0,Liverpool,Southampton,1,5,0,0
1,Tirana,Blackpool,1,6,0,0
2,blackpool,tirana,1,7,0,0
,Origin,Destination,O_Code,D_code,distance,time
0,Paris,Lyon,1,8,0,0
1,Manchester,Nice,1,10,0,0
2,Orleans,Madrid,1,12,0,0
,Origin,Destination,O_Code,D_code,distance,time
0,Lisbon,Stockholm,1,12,0,0
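As an aside, the same chunking can be written as one list comprehension over iloc, which also resets each chunk's index (so positional and label lookups agree) and avoids the empty trailing chunk the helper produces when len(df) is an exact multiple of chunkSize:
chunkSize = 3
citiesChunks = [cities.iloc[i:i + chunkSize].reset_index(drop=True)
                for i in range(0, len(cities), chunkSize)]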


Finding the distance between 2 points fails

I'm cycling through points in a geodataframe by index, in such a way that I compare index 0 and 1, then 1 and 2, then 2 and 3, and so on. The purpose is to compare the two points: if the points occupy the same location, pass; else, draw a line between the two points and summarize some stats. I figured that if I compared the distance between the two points and got 0, that pair would be skipped. What I have done before was to pass the two points in a single geodataframe into a function that returns a value for distance. They are in a projected CRS with units of metres.
def getdist(pt_pair):
    shift_pt = pt_pair.shift()
    return pt_pair.distance(shift_pt)[1]
When I pass my 2 points to the function, the first 2 return 0.0, the next returns nan, and then I get this error.
Traceback (most recent call last):
File "C:/.../PycharmProjects/.../vessel_track_builder.py", line 33, in <module>
print(getdist(set_pts))
File "C:/.../PycharmProjects/.../vessel_track_builder.py", line 19, in getdist
if math.isnan(mdist1.distance(shift_pt)[1]):
File "C:\OSGEO4~1\apps\Python37\lib\site-packages\pandas\core\series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "C:\OSGEO4~1\apps\Python37\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 997, in
pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1004, in
pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 1
Process finished with exit code 1
I thought this might be an error in the point geometry, so I added an "if nan, return 0" branch to the function.
def getdist(pt_pair):
    shift_pt = pt_pair.shift()
    if math.isnan(pt_pair.distance(shift_pt)[1]):
        return 0
    else:
        return pt_pair.distance(shift_pt)[1]
The result is 0.0, 0, then the aforementioned error.
I added a print statement of my geodataframes but didn't see anything out of the ordinary.
index ... MMSI MONTH geometry
0 92 ... 123 4 POINT (2221098.494 1668358.870)
1 39 ... 123 4 POINT (2221098.494 1668358.870)
[2 rows x 12 columns]
index ... MMSI MONTH geometry
1 39 ... 456 4 POINT (2221098.494 1668358.870)
2 3231 ... 456 4 POINT (2221098.494 1668358.870)
[2 rows x 12 columns]
index ... MMSI MONTH geometry
2 3231 ... 789 4 POINT (2221098.494 1668358.870)
3 1032 ... 789 4 POINT (2221098.494 1668358.870)
I tried it on some test data with simple points and it went through them fine, so I am wondering if there is something in how I am passing the geodataframe to the function. Since I am trying to compare each point to the one after it, I am using the index to keep the order; could that be the issue?
for mmsi in points_gdf.MMSI.unique():
    track_pts = points_gdf[(points_gdf.MMSI == mmsi)].sort_values(['POSITION_UTC_DATE']).reset_index()
    print(track_pts.shape[0])
    for index, row in track_pts.iterrows():
        if index + 1 < track_pts.shape[0]:
            set_pts = track_pts[(track_pts.index == index) | (track_pts.index == index + 1)]
            print(set_pts)
            print(getdist(set_pts))
        else:
            sys.exit()
I am noticing the index header; when I look at the data in QGIS there is no index column, the first column is OBJECTID, and the data is stored in a file geodatabase. Could the index column be causing the issue?
Instead of looping through each pair of points, do this once:
dist_to_next_point = track_pts.distance(track_pts.shift()).dropna()
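Wired into the loop from the question, that one vectorized call replaces both the pair slicing and getdist; a sketch using the question's own column names:
for mmsi in points_gdf.MMSI.unique():
    track_pts = points_gdf[points_gdf.MMSI == mmsi].sort_values('POSITION_UTC_DATE').reset_index(drop=True)
    # distance from each point to the previous one; the first row is NaN, hence dropna
    dist_to_next_point = track_pts.distance(track_pts.shift()).dropna()
    # zero distances mark co-located points, which the question wants to skip
    moves = dist_to_next_point[dist_to_next_point > 0]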

Pandas groupby based on column value

I have the following dataframe, dfgeo:
              x            y         z  zt  n  k  pv                         geometry        dist
0   6574878.210  4757530.610  1152.588   1  8  4  90  POINT (6574878.210 4757530.610)    0.000000
1   6574919.993  4757570.314  1174.724   0            POINT (6574919.993 4757570.314)   57.638760
2   6575020.518  4757665.839  1177.339   0            POINT (6575020.518 4757665.839)  138.673362
3   6575239.548  4757873.972  1160.156   1  8  4  90  POINT (6575239.548 4757873.972)  302.148120
4   6575351.603  4757980.452  1202.418   0            POINT (6575351.603 4757980.452)  154.577856
5   6575442.780  4758067.093  1199.297   0            POINT (6575442.780 4758067.093)  125.777217
6   6575538.217  4758157.782  1192.914   1  8  4  90  POINT (6575538.217 4758157.782)  131.653772
7   6575594.625  4758240.033  1217.442   0            POINT (6575594.625 4758240.033)   99.735096
8   6575738.820  4758450.289  1174.477   0            POINT (6575738.820 4758450.289)  254.950551
9   6575850.937  4758613.772  1123.852   1  8  4  90  POINT (6575850.937 4758613.772)  198.234490
10  6575984.323  4758647.118  1131.761   0            POINT (6575984.323 4758647.118)  137.491020
11  6576204.312  4758702.115  1119.407   0            POINT (6576204.312 4758702.115)  226.759410
12  6576303.976  4758727.031  1103.064   0            POINT (6576303.976 4758727.031)  102.731300
13  6576591.496  4758798.910   1114.06   0            POINT (6576591.496 4758798.910)  296.368590
14  6576736.965  4758835.277  1120.285   1  8  4  90  POINT (6576736.965 4758835.277)  149.945952
I am trying to group by the zt values and summarize the dist column. I have tried this:
def summarize(group):
    s = group['zt'].eq(1).cumsum()
    return group.groupby(s).agg(
        D=('dist', 'sum')
    )

dfzp = dfgeo.apply(summarize)
But I get the following error on the last line of code:
s = group['zt'].eq(1).cumsum()
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 135, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index_class_helper.pxi", line 109, in pandas._libs.index.Int64Engine._check_type
KeyError: 'zt'
Any help in resolving this is appreciated.
If you need to pass the DataFrame to the function, call it directly:
dfzp=summarize(dfgeo)
Or use DataFrame.pipe:
dfzp=dfgeo.pipe(summarize)
If you use DataFrame.apply, the function is called once per column (or once per row with axis=1), so summarize receives a single Series and group['zt'] becomes a label lookup that fails with KeyError: 'zt'.
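A minimal, self-contained sketch of the same pattern (made-up zt and dist values, not the question's data), showing that the direct call and pipe give the same result:
import pandas as pd

dfgeo = pd.DataFrame({'zt': [1, 0, 0, 1, 0],
                      'dist': [0.0, 10.0, 20.0, 5.0, 15.0]})

def summarize(group):
    # a new group starts at every row where zt == 1
    s = group['zt'].eq(1).cumsum()
    return group.groupby(s).agg(D=('dist', 'sum'))

print (dfgeo.pipe(summarize))
       D
zt
1   30.0
2   20.0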

Python convert conditional array to dataframe

I have a dataframe with the following information:
    ticker        date      close       gap
0      BHP  1981-07-31   0.945416 -0.199458
1      BHP  1981-08-31   0.919463 -0.235930
2      BHP  1981-09-30   0.760040 -0.434985
3      BHP  1981-10-30   0.711842 -0.509136
4      BHP  1981-11-30   0.778578 -0.428161
..     ...         ...        ...       ...
460    BHP  2019-11-29  38.230000  0.472563
461    BHP  2019-12-31  38.920000  0.463312
462    BHP  2020-01-31  39.400000  0.459691
463    BHP  2020-02-28  33.600000  0.627567
464    BHP  2020-03-31  28.980000  0.784124
I developed the following code to find the rows where 'gap' crosses 0:
zero_crossings = np.where(np.diff(np.sign(BHP_data['gap'])))[0]
This returns:
array([ 52, 54, 57, 75, 79, 86, 93, 194, 220, 221, 234, 235, 236,
238, 245, 248, 277, 379, 381, 382, 383, 391, 392, 393, 395, 396],
dtype=int64)
I need to be able to do the following:
calculate the number of months between points where 'gap' crosses 0
remove items where the number of months is <12
average the remaining months
However, I don't know how to turn this ndarray into something useful that I can make the calculations from. When I try:
pd.DataFrame(zero_crossings)
I get the following df, which only returns the index:
    0
0  52
1  54
2  57
3  75
4  79
5  86
.. ..
Please help...
I just extended your code a bit to get the zero_crossings into the original dataframe, as required.
import pandas as pd
import numpy as np
BHP_data = pd.DataFrame({'gap': [-0.199458, 0.472563, 0.463312, 0.493318, -0.509136, 0.534985, 0.784124]})
BHP_data['zero_crossings'] = 0
zero_crossings = np.where(np.diff(np.sign(BHP_data['gap'])))[0]
print(zero_crossings) # [0 3 4]
# Updates the column to 1 based on the 0 crossing
BHP_data.loc[zero_crossings, 'zero_crossings'] = 1
print(BHP_data)
Output
        gap  zero_crossings
0 -0.199458               1
1  0.472563               0
2  0.463312               0
3  0.493318               1
4 -0.509136               1
5  0.534985               0
6  0.784124               0
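From here, the three remaining steps need only the crossing positions themselves: since the rows are monthly, the difference between consecutive crossing indices is the number of months between crossings. A hedged sketch, assuming one row per month as in the question's data:
import numpy as np

months_between = np.diff(zero_crossings)          # months between consecutive crossings
long_gaps = months_between[months_between >= 12]  # remove gaps shorter than 12 months
average_gap = long_gaps.mean() if long_gaps.size else np.nan
print(average_gap)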

How to convert label data with None values to OneHot using LabelBinarizer sklearn

I have label data in which some of the values are np.nan.
I want to convert the data to one-hot vectors using LabelBinarizer, with np.nan converting to a zero array.
But I get an error. I can convert the data successfully with get_dummies from pandas.
I can't use the get_dummies function because the train and the test data come in different files at different times. I want to use a sklearn model so that I can save it and use the model later.
Code example:
In[11]: df = pd.DataFrame({'CITY':['London','NYC','Manchester',np.nan],'Country':['UK','US','UK','AUS']})
In[12]: df
Out[12]:
         CITY Country
0      London      UK
1         NYC      US
2  Manchester      UK
3         NaN     AUS
In[13]: pd.get_dummies(df['CITY'])
Out[13]:
   London  Manchester  NYC
0       1           0    0
1       0           0    1
2       0           1    0
3       0           0    0
In[14]: from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
In[15]: lb.fit_transform(df['CITY'])
Traceback (most recent call last):
File "/home/oshrib/.conda/envs/on_target/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-16-d0afb38b2695>", line 1, in <module>
lb.fit_transform(df['CITY'])
File "/home/oshrib/.conda/envs/on_target/lib/python3.5/site-packages/sklearn/preprocessing/label.py", line 307, in fit_transform
return self.fit(y).transform(y)
File "/home/oshrib/.conda/envs/on_target/lib/python3.5/site-packages/sklearn/preprocessing/label.py", line 276, in fit
self.y_type_ = type_of_target(y)
File "/home/oshrib/.conda/envs/on_target/lib/python3.5/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
File "/home/oshrib/.conda/envs/on_target/lib/python3.5/site-packages/numpy/lib/arraysetops.py", line 223, in unique
return _unique1d(ar, return_index, return_inverse, return_counts)
File "/home/oshrib/.conda/envs/on_target/lib/python3.5/site-packages/numpy/lib/arraysetops.py", line 283, in _unique1d
ar.sort()
TypeError: unorderable types: float() < str()
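The traceback boils down to np.unique trying to sort a mix of float (the NaN) and str, which Python 3 refuses to do. A minimal sketch of one possible workaround (the '__missing__' sentinel is an invented name, not part of sklearn): fill NaN with a sentinel class before fitting, then drop that class's column, so NaN rows come out as all-zero vectors and the fitted binarizer can still be saved and reused on later test files:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

df = pd.DataFrame({'CITY': ['London', 'NYC', 'Manchester', np.nan]})

lb = LabelBinarizer()
encoded = lb.fit_transform(df['CITY'].fillna('__missing__'))
keep = lb.classes_ != '__missing__'  # boolean mask that drops the sentinel column
onehot = encoded[:, keep]            # the NaN row is now all zeros
print(onehot)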

Slice pandas dataframe json column into columns

I have a data frame which has two columns in JSON format, like this:
author           biblio                                                     series
Mehrdad Vahabi   {'volume': 68, 'month': 'January',                         {'handle': 'RePEc:aka:aoecon', 'name': 'Oeconomica'}
                 'name': 'János Kornai', 'issue': 's',
                 'handle': 'n:v:68:y:2018:i', 'year': '2018',
                 'pages': '27-52', 'doi': ''}
Michael Bailey   {'c_date': '2017', 'number': '23608',                      {'handle': '', 'name': ''}
                 'handle': 'RePEc:nbr:nberwo:23608', 'name': 'Measuring'}
I want my data frame to look like this:
author biblio.volume biblio.month biblio.name biblio.issue biblio.handle biblio.year biblio.pages biblio.doi biblio.c_date biblio.number series.handle series.name
Mehrdad Vahabi 68 January János Kornai s n:v:68:y:2018:i 2018 27-52 NA NA RePEc:aka:aoecon Oeconomica
Michael Bailey NA NA Measuring NA nberwo:23608 NA NA NA 2017 23608
I tried to do it using the answers in this question, but none of them works for me.
How can I do it?
[EDIT]
Here is a sample of the data
[EDIT]
Following @jezrael's solution I get this:
df1 = pd.DataFrame(df['biblio'].values.tolist())
df1.columns = 'biblio.'+ df1.columns
df2 = pd.DataFrame(df['series'].values.tolist())
df2.columns = 'series.'+ df2.columns
col = df.columns.difference(['biblio','series'])
df = pd.concat([df[col], df1, df2],axis=1)
print (df)
Traceback (most recent call last):
File "dfs.py", line 8, in <module>
df1.columns = 'bibliographic.'+ df1.columns
File "/Users/danielotero/anaconda3/lib/python3.6/site-
packages/pandas/core/indexes/range.py", line 583, in _evaluate_numeric_binop
other = self._validate_for_numeric_binop(other, op, opstr)
File "/Users/danielotero/anaconda3/lib/python3.6/site-
packages/pandas/core/indexes/base.py", line 3961, in
_validate_for_numeric_binop
raise TypeError("can only perform ops with scalar values")
TypeError: can only perform ops with scalar values
And with json_normalize:
Traceback (most recent call last):
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2525, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "dfs.py", line 7, in <module>
df = json_normalize(d)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/io/json/normalize.py", line 192, in json_normalize
if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2139, in __getitem__
return self._getitem_column(key)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2146, in _getitem_column
return self._get_item_cache(key)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 1842, in _get_item_cache
values = self._data.get(item)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py", line 3843, in get
loc = self.items.get_loc(item)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2527, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
Following @Jhon H's solution, I get this:
Traceback (most recent call last):
File "dfs.py", line 7, in <module>
jsonSeries = df[['bibliographic']].tolist()
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 3614, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'tolist'
Create a new DataFrame from each dict column with the DataFrame constructor, and finally concat them all together:
df1 = pd.DataFrame(df['biblio'].values.tolist())
df1.columns = 'biblio.'+ df1.columns
df2 = pd.DataFrame(df['series'].values.tolist())
df2.columns = 'series.'+ df2.columns
col = df.columns.difference(['biblio','series'])
df = pd.concat([df[col], df1, df2],axis=1)
print (df)
author biblio.c_date biblio.doi biblio.handle \
0 Mehrdad Vahabi NaN n:v:68:y:2018:i
1 Michael Bailey 2017 NaN RePEc:nbr:nberwo:23608
biblio.issue biblio.month biblio.name biblio.number biblio.pages \
0 s January Janos Kornai NaN 27-52
1 NaN NaN Measuring 23608 NaN
biblio.volume biblio.year series.handle series.name
0 68.0 2018 RePEc:aka:aoecon Oeconomica
1 NaN NaN
EDIT:
If the input is json, it is possible to use json_normalize:
from pandas.io.json import json_normalize
d = [{"author":"Mehrdad Vahabi","biblio":{"volume":68,"month":"January","name":"Janos Kornai","issue":"s","handle":"n:v:68:y:2018:i","year":"2018","pages":"27-52","doi":""},"series":{"handle":"RePEc:aka:aoecon","name":"Oeconomica"}},{"author":"Michael Bailey","biblio":{"c_date":"2017","number":"23608","handle":"RePEc:nbr:nberwo:23608","name":"Measuring"},"series":{"handle":"","name":""}}]
df = json_normalize(d)
print (df)
author biblio.c_date biblio.doi biblio.handle \
0 Mehrdad Vahabi NaN n:v:68:y:2018:i
1 Michael Bailey 2017 NaN RePEc:nbr:nberwo:23608
biblio.issue biblio.month biblio.name biblio.number biblio.pages \
0 s January Janos Kornai NaN 27-52
1 NaN NaN Measuring 23608 NaN
biblio.volume biblio.year series.handle series.name
0 68.0 2018 RePEc:aka:aoecon Oeconomica
1 NaN NaN
EDIT: There is a problem: your dictionaries are strings, so it is first necessary to use ast.literal_eval to convert them:
import ast
df = pd.read_csv('probe.csv')
#print (df)
df1 = pd.DataFrame(df['bibliographic'].apply(ast.literal_eval).values.tolist())
df1.columns = 'bibliographic.'+ df1.columns
df2 = pd.DataFrame(df['series'].apply(ast.literal_eval).values.tolist())
df2.columns = 'series.'+ df2.columns
col = df.columns.difference(['bibliographic','series'])
df = pd.concat([df[col], df1, df2],axis=1)
You need to process the columns individually and join them all together to get the format that you need. Here is a simple example that you could follow:
import pandas as pd
records = [{'col1':'v1','col2':{'a1':1,'b1':1},'col3':{'c1':1,'d1':1}},
{'col1':'v2','col2':{'a1':2,'b1':2},'col3':{'c1':2,'d1':2}}]
sample_df = pd.DataFrame(records)
sample_df
col1 col2 col3
0 v1 {'a1': 1, 'b1': 1} {'c1': 1, 'd1': 1}
1 v2 {'a1': 2, 'b1': 2} {'c1': 2, 'd1': 2}
col2_expanded = sample_df.col2.apply(lambda x:pd.Series(x))
col2_expanded.columns = ['{}.{}'.format('col2',i) for i in col2_expanded]
col2_expanded
col2.a1 col2.b1
0 1 1
1 2 2
col3_expanded = sample_df.col3.apply(lambda x:pd.Series(x))
col3_expanded.columns = ['{}.{}'.format('col3',i) for i in col3_expanded]
col3_expanded
col3.c1 col3.d1
0 1 1
1 2 2
final = pd.concat([sample_df[['col1']],col2_expanded,col3_expanded],axis=1)
final
col1 col2.a1 col2.b1 col3.c1 col3.d1
0 v1 1 1 1 1
1 v2 2 2 2 2
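The same idea generalizes to any number of dict columns with a small helper; a sketch building on the example above:
def expand(df, col):
    # one Series per dict key, prefixed with the source column name
    out = df[col].apply(pd.Series)
    out.columns = ['{}.{}'.format(col, c) for c in out.columns]
    return out

final = pd.concat([sample_df[['col1']]] +
                  [expand(sample_df, c) for c in ('col2', 'col3')], axis=1)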
