I have an HDF5 file that contains a table where the time column is in datetime64[ns] format.
I want to get all the rows that are older than thresh. How can I do that? This is what I've tried:
thresh = pd.datetime.strptime('2018-03-08 14:19:41','%Y-%m-%d %H:%M:%S').timestamp()
hdf = pd.read_hdf(STORE, 'gh1', where = 'time>thresh' )
I get the following error:
Traceback (most recent call last):
File "<ipython-input-80-fa444735d0a9>", line 1, in <module>
runfile('/home/joao/github/control_panel/controlpanel/controlpanel/reading_test.py', wdir='/home/joao/github/control_panel/controlpanel/controlpanel')
File "/home/joao/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "/home/joao/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/home/joao/github/control_panel/controlpanel/controlpanel/reading_test.py", line 15, in <module>
hdf = pd.read_hdf(STORE, 'gh1', where = 'time>thresh' )
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/io/pytables.py", line 370, in read_hdf
return store.select(key, auto_close=auto_close, **kwargs)
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/io/pytables.py", line 717, in select
return it.get_result()
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/io/pytables.py", line 1457, in get_result
results = self.func(self.start, self.stop, where)
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/io/pytables.py", line 710, in func
columns=columns, **kwargs)
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/io/pytables.py", line 4141, in read
if not self.read_axes(where=where, **kwargs):
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/io/pytables.py", line 3340, in read_axes
self.selection = Selection(self, where=where, **kwargs)
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/io/pytables.py", line 4706, in __init__
self.condition, self.filter = self.terms.evaluate()
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/core/computation/pytables.py", line 556, in evaluate
self.condition = self.terms.prune(ConditionBinOp)
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/core/computation/pytables.py", line 118, in prune
res = pr(left.value, right.value)
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/core/computation/pytables.py", line 113, in pr
encoding=self.encoding).evaluate()
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/core/computation/pytables.py", line 327, in evaluate
values = [self.convert_value(v) for v in rhs]
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/core/computation/pytables.py", line 327, in <listcomp>
values = [self.convert_value(v) for v in rhs]
File "/home/joao/anaconda3/lib/python3.6/site-packages/pandas/core/computation/pytables.py", line 185, in convert_value
v = pd.Timestamp(v)
File "pandas/_libs/tslib.pyx", line 390, in pandas._libs.tslib.Timestamp.__new__
File "pandas/_libs/tslib.pyx", line 1549, in pandas._libs.tslib.convert_to_tsobject
File "pandas/_libs/tslib.pyx", line 1735, in pandas._libs.tslib.convert_str_to_tsobject
ValueError: could not convert string to Timestamp
The where string is parsed by pandas/PyTables, which here ended up trying to convert the literal text 'thresh' itself into a Timestamp (hence the ValueError); pass the datetime as a quoted string inside the condition instead.
Demo:
creating a sample DataFrame (100,000 rows):
In [9]: N = 10**5
In [10]: dates = pd.date_range('1980-01-01', freq='99T', periods=N)
In [11]: df = pd.DataFrame({'date':dates, 'val':np.random.rand(N)})
In [12]: df
Out[12]:
date val
0 1980-01-01 00:00:00 0.985215
1 1980-01-01 01:39:00 0.452295
2 1980-01-01 03:18:00 0.780096
3 1980-01-01 04:57:00 0.004596
4 1980-01-01 06:36:00 0.515051
... ... ...
99995 1998-10-27 15:45:00 0.509954
99996 1998-10-27 17:24:00 0.046636
99997 1998-10-27 19:03:00 0.026678
99998 1998-10-27 20:42:00 0.660652
99999 1998-10-27 22:21:00 0.839426
[100000 rows x 2 columns]
writing it to an HDF5 file (indexing the date column):
In [13]: df.to_hdf('d:/temp/test.h5', 'test', format='t', data_columns=['date'])
reading from the HDF5 file conditionally on the indexed column:
In [14]: x = pd.read_hdf('d:/temp/test.h5', 'test', where="date > '1998-10-27 15:00:00'")
In [15]: x
Out[15]:
date val
99995 1998-10-27 15:45:00 0.509954
99996 1998-10-27 17:24:00 0.046636
99997 1998-10-27 19:03:00 0.026678
99998 1998-10-27 20:42:00 0.660652
99999 1998-10-27 22:21:00 0.839426
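Applied back to the question, a minimal sketch (assuming STORE and the 'gh1' key from the question, with time stored as a data column): format the threshold into the where string as a quoted timestamp literal instead of referencing a Python variable.
import pandas as pd

STORE = 'store.h5'  # hypothetical path; use the question's STORE
thresh = '2018-03-08 14:19:41'
# the question asks for rows older than thresh, hence the < comparison
hdf = pd.read_hdf(STORE, 'gh1', where="time < '{}'".format(thresh))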
I have a dataframe of the type:
date TICKER x1 x2 ... Z Y month x3
0 1999-12-31 A UN Equity 52.1330 51.9645 ... 0.0052 NaN 12 NaN
1 1999-12-31 AA UN Equity 92.9415 92.8715 ... 0.0052 NaN 12 NaN
2 1999-12-31 ABC UN Equity 3.6843 3.6539 ... 0.0052 NaN 12 NaN
3 1999-12-31 ABF UN Equity 22.0625 21.9375 ... 0.0052 NaN 12 NaN
4 1999-12-31 ABM UN Equity 10.2188 10.1250 ... 0.0052 NaN 12 NaN
I would like to run an OLS regression with the formula 'Y ~ x1 + x2:x3' for each group in ['TICKER','year','month'] (year is a column that does not appear here), using statsmodels.formula.api as smf. I therefore use:
data.groupby(['TICKER','year','month']).apply(lambda x: smf.ols(formula='Y ~ x1 + x2:x3', data=x))
However, I get the following error:
IndexError: tuple index out of range
Any idea why?
The full traceback is:
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 894, in apply
result = self._python_apply_general(f, self._selected_obj)
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 928, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, data, self.axis)
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\pandas\core\groupby\ops.py", line 238, in apply
res = f(group)
File "<input>", line 1, in <lambda>
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\base\model.py", line 195, in from_formula
mod = cls(endog, exog, *args, **kwargs)
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\regression\linear_model.py", line 872, in __init__
super(OLS, self).__init__(endog, exog, missing=missing,
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\regression\linear_model.py", line 703, in __init__
super(WLS, self).__init__(endog, exog, missing=missing,
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\regression\linear_model.py", line 190, in __init__
super(RegressionModel, self).__init__(endog, exog, **kwargs)
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\base\model.py", line 237, in __init__
super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\base\model.py", line 77, in __init__
self.data = self._handle_data(endog, exog, missing, hasconst,
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\base\model.py", line 101, in _handle_data
data = handle_data(endog, exog, missing, hasconst, **kwargs)
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\base\data.py", line 672, in handle_data
return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\base\data.py", line 71, in __init__
arrays, nan_idx = self.handle_missing(endog, exog, missing,
File "C:\Users\xxxx\PycharmProjects\non_parametric\venv\lib\site-packages\statsmodels\base\data.py", line 247, in handle_missing
if combined_nans.shape[0] != nan_mask.shape[0]:
IndexError: tuple index out of range
I see that your Y column has a lot of NaNs, so you need to ensure that each subgroup has enough complete observations for the regression to work.
So if I use some example data:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

np.random.seed(123)
data = pd.concat([
pd.DataFrame({'TICKER':np.random.choice(['A','B','C'],30),
'year':np.random.choice([2000,2001],30),
'month':np.random.choice([1,2],30)}),
pd.DataFrame(np.random.normal(0,1,(30,4)),columns=['Y','x1','x2','x3'])
],axis=1)
data.loc[:6,'Y'] = np.nan
If I run your code on the data frame above, I get the same error.
So if we use only the complete cases (the rows relevant to your regression):
complete_ix = data[['Y','x1','x2','x3']].dropna().index
data.loc[complete_ix].groupby(['TICKER','year','month']).apply(lambda x: smf.ols(formula='Y ~ x1 + x2:x3', data=x))
It works:
TICKER year month
A 2000 2 <statsmodels.regression.linear_model.OLS objec...
2001 1 <statsmodels.regression.linear_model.OLS objec...
2 <statsmodels.regression.linear_model.OLS objec...
B 2000 1 <statsmodels.regression.linear_model.OLS objec...
2 <statsmodels.regression.linear_model.OLS objec...
2001 1 <statsmodels.regression.linear_model.OLS objec...
C 2000 1 <statsmodels.regression.linear_model.OLS objec...
2 <statsmodels.regression.linear_model.OLS objec...
I have a table containing production data on parts and the variables that were recorded during their production.
FORMAT:
Part | Variable1 | Variable2 | ...
-----+-----------+-----------+----
   1 |     X     |     X
   1 |     X     |     X
   2 |     X     |     X
   2 |     X     |     X
   2 |     X     |     X
   2 |     X     |     X
   2 |     X     |     X
   2 |     X     |     X
   2 |     X     |     X
   3 |     X     |     X
   3 |     X     |     X
   3 |     X     |     X
I can group these by part with
dfg = df.groupby("Part")  # where df is my dataframe of production data
I also have stored all the Part numbers in the part_num array
part_num = df['Part'].unique()
Out:
array([ 615, 629, 901, 908, 911, 959, 969, 1024, 1025, 1058, 1059,
1092, 1097, 1104, 1105, 1114, 1115, 1117, 1147, 1161, 1171, 1172,
1173, 1174, 1175, 1176, 1177, 1188, 1259, 1307, 1308, 1309, 1310,
1311, 1312, 1313, 1322, 1339, 1340, 1359, 1383, 1384, 1389, 1393,
1394, 1398, 1399, 1402, 1404, 1413, 1414, 1417, 1441, 1449, 1461,
1462, 1463, 1488, 1489, 1490, 1491, 1508, 1509, 1514, 1541, 1542,
1543, 1544, 1545, 1554, 1555, 1559, 1586, 1589, 1601, 1606, 1607,
1618, 1620, 1636, 1659, 1664, 1665, 1667, 1668, 1673, 1674, 1676,
1677, 1679, 1680, 1681, 1687, 1688, 1690, 1704, 1706, 1711, 1714,
1717, 1718, 1723, 1724, 1729, 1731, 1732, 1745, 1747, 1748, 1749,
1753, 1754, 1755, 1756, 1757, 1758, 1759, 1760, 1761, 1762, 1763,
1764, 1765, 1766, 1767, 1768, 1769, 1773, 1774, 1779, 1780, 1783,
1784, 1785, 1787, 1789, 1790, 1791, 1792, 1800, 1845], dtype=int64)
How do I create a dataframe for each part number group?
So you want a dataframe for each unique 'part number group'.
Do a groupby on the index and then store each dataframe in a dict (or list) iteratively.
dummy data
>>> df = pd.DataFrame(np.random.randint(1, 10, (8, 7)),
columns=['a1', 'a2', 'a3', 'b1', 'b2', 'b3', 'b4'], index=[1, 1, 2, 2, 2, 3, 5, 5])
>>> df.head(10)
a1 a2 a3 b1 b2 b3 b4
1 5 3 8 8 1 7 1
1 8 8 8 7 2 5 8
2 4 8 1 9 2 7 5
2 1 8 4 4 1 8 9
2 4 7 4 4 3 9 5
3 3 6 9 3 8 9 2
5 9 5 2 1 7 6 1
5 3 9 1 5 5 8 1
grouped dict of dataframes
>>> grouped_dict = {k: v for k, v in df.groupby(df.index)}
>>> grouped_dict[3] # part number 3 dataframe
a1 a2 a3 b1 b2 b3 b4
3 3 6 9 3 8 9 2
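Applied to the question's data, the same pattern groups on the 'Part' column instead of the index (a sketch; 615 is simply the first value in the asker's part_num array):
# dict of per-part DataFrames, keyed by part number
part_frames = {part: grp for part, grp in df.groupby('Part')}
part_frames[615]  # the rows recorded for part 615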
import quandl

df = quandl.get('NSE/TATAMOTORS', start_date='2000-01-01', end_date='2018-05-10')
df = df.drop(['Last', 'Total Trade Quantity', 'Turnover (Lacs)'], axis=1)
df.head(10)
OUTPUT -
Open High Low Close
Date
2003-12-26 435.80 440.50 431.65 438.60
2003-12-29 441.00 449.70 441.00 447.80
2003-12-30 450.00 451.90 430.10 442.40
2003-12-31 446.00 459.30 443.55 452.05
2004-01-01 453.25 457.90 451.50 454.45
2004-01-02 458.00 460.35 454.05 456.40
2004-01-05 458.00 465.00 450.60 454.85
2004-01-06 460.00 465.00 448.50 454.45
2004-01-07 451.40 454.70 438.10 446.45
2004-01-08 449.00 466.95 449.00 464.75
from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(df, order=(5,1,0))
OUTPUT -
Traceback (most recent call last):
File "<ipython-input-90-799de8e60d6f>", line 1, in <module>
model = ARIMA(df, order=(5,1,0))
File "D:\A\lib\site-packages\statsmodels\tsa\arima_model.py", line 1000, in __new__
mod.__init__(endog, order, exog, dates, freq, missing)
File "D:\A\lib\site-packages\statsmodels\tsa\arima_model.py", line 1024, in __init__
self.data.ynames = 'D.' + self.endog_names
TypeError: must be str, not list
So I converted the index column containing dates to a proper column with:
df = df.reset_index()
df.head(10)
Out[92]:
Date Open High Low Close
0 2003-12-26 435.80 440.50 431.65 438.60
1 2003-12-29 441.00 449.70 441.00 447.80
2 2003-12-30 450.00 451.90 430.10 442.40
3 2003-12-31 446.00 459.30 443.55 452.05
4 2004-01-01 453.25 457.90 451.50 454.45
5 2004-01-02 458.00 460.35 454.05 456.40
6 2004-01-05 458.00 465.00 450.60 454.85
7 2004-01-06 460.00 465.00 448.50 454.45
8 2004-01-07 451.40 454.70 438.10 446.45
9 2004-01-08 449.00 466.95 449.00 464.75
Then when I run this line:
from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(df, order=(5,1,0))
OUTPUT -
Traceback (most recent call last):
File "<ipython-input-94-799de8e60d6f>", line 1, in <module>
model = ARIMA(df, order=(5,1,0))
File "D:\A\lib\site-packages\statsmodels\tsa\arima_model.py", line 1000, in __new__
mod.__init__(endog, order, exog, dates, freq, missing)
File "D:\A\lib\site-packages\statsmodels\tsa\arima_model.py", line 1015, in __init__
super(ARIMA, self).__init__(endog, (p, q), exog, dates, freq, missing)
File "D:\A\lib\site-packages\statsmodels\tsa\arima_model.py", line 452, in __init__
super(ARMA, self).__init__(endog, exog, dates, freq, missing=missing)
File "D:\A\lib\site-packages\statsmodels\tsa\base\tsa_model.py", line 43, in __init__
super(TimeSeriesModel, self).__init__(endog, exog, missing=missing)
File "D:\A\lib\site-packages\statsmodels\base\model.py", line 212, in __init__
super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
File "D:\A\lib\site-packages\statsmodels\base\model.py", line 63, in __init__
**kwargs)
File "D:\A\lib\site-packages\statsmodels\base\model.py", line 88, in _handle_data
data = handle_data(endog, exog, missing, hasconst, **kwargs)
File "D:\A\lib\site-packages\statsmodels\base\data.py", line 630, in handle_data
**kwargs)
File "D:\A\lib\site-packages\statsmodels\base\data.py", line 76, in __init__
self.endog, self.exog = self._convert_endog_exog(endog, exog)
File "D:\A\lib\site-packages\statsmodels\base\data.py", line 471, in _convert_endog_exog
raise ValueError("Pandas data cast to numpy dtype of object. "
ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
HELP?
ARIMA expects an array-like object; if, instead of a 2D array (the DataFrame), we use a 1D array (a single column's values), it will work.
Try:
ARIMA(df['Close'].values, order=(5,1,0))
where df has a datetime index and you select a single column:
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10 entries, 2003-12-26 to 2004-01-08
Data columns (total 4 columns):
Open 10 non-null float64
High 10 non-null float64
Low 10 non-null float64
Close 10 non-null float64
dtypes: float64(4)
memory usage: 400.0 bytes
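A hedged variant that keeps the dates: pass the 'Close' column as a Series (still one-dimensional) so statsmodels can pick up the DatetimeIndex; a sketch against the old statsmodels.tsa.arima_model API shown in the traceback.
from statsmodels.tsa.arima_model import ARIMA

# df as above: DatetimeIndex plus float columns; select one Series, not the frame
model = ARIMA(df['Close'], order=(5, 1, 0))
results = model.fit(disp=0)  # disp=0 silences optimizer output in this old API
print(results.summary())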
I have a data frame which has two columns in JSON format, like this:
author: Mehrdad Vahabi
biblio: {'volume': 68, 'month': 'January', 'name': 'János Kornai', 'issue': 's',
        'handle': 'n:v:68:y:2018:i', 'year': '2018', 'pages': '27-52', 'doi': ''}
series: {'handle': 'RePEc:aka:aoecon', 'name': 'Oeconomica'}

author: Michael Bailey
biblio: {'c_date': '2017', 'number': '23608', 'handle': 'RePEc:nbr:nberwo:23608',
        'name': 'Measuring'}
series: {'handle': '', 'name': ''}
I want my data frame to look like this:
author          biblio.volume  biblio.month  biblio.name   biblio.issue  biblio.handle           biblio.year  biblio.pages  biblio.doi  biblio.c_date  biblio.number  series.handle     series.name
Mehrdad Vahabi  68             January       János Kornai  s             n:v:68:y:2018:i         2018         27-52                     NA             NA             RePEc:aka:aoecon  Oeconomica
Michael Bailey  NA             NA            Measuring     NA            RePEc:nbr:nberwo:23608  NA           NA            NA          2017           23608
I tried to do it using the answers to this question, but none of them worked for me.
How can I do it?
[EDIT]
Here is a sample of the data
[EDIT]
Following @jezrael's solution I get this:
df1 = pd.DataFrame(df['biblio'].values.tolist())
df1.columns = 'biblio.'+ df1.columns
df2 = pd.DataFrame(df['series'].values.tolist())
df2.columns = 'series.'+ df2.columns
col = df.columns.difference(['biblio','series'])
df = pd.concat([df[col], df1, df2],axis=1)
print (df)
Traceback (most recent call last):
File "dfs.py", line 8, in <module>
df1.columns = 'bibliographic.'+ df1.columns
File "/Users/danielotero/anaconda3/lib/python3.6/site-
packages/pandas/core/indexes/range.py", line 583, in _evaluate_numeric_binop
other = self._validate_for_numeric_binop(other, op, opstr)
File "/Users/danielotero/anaconda3/lib/python3.6/site-
packages/pandas/core/indexes/base.py", line 3961, in
_validate_for_numeric_binop
raise TypeError("can only perform ops with scalar values")
TypeError: can only perform ops with scalar values
And with json_normalize:
Traceback (most recent call last):
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2525, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "dfs.py", line 7, in <module>
df = json_normalize(d)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/io/json/normalize.py", line 192, in json_normalize
if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2139, in __getitem__
return self._getitem_column(key)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2146, in _getitem_column
return self._get_item_cache(key)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 1842, in _get_item_cache
values = self._data.get(item)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py", line 3843, in get
loc = self.items.get_loc(item)
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2527, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
Following @Jhon H's solution, I get this:
Traceback (most recent call last):
File "dfs.py", line 7, in <module>
jsonSeries = df[['bibliographic']].tolist()
File "/Users/danielotero/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 3614, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'tolist'
Create a new DataFrame from each dict column with the DataFrame constructor, and finally concat them all together:
df1 = pd.DataFrame(df['biblio'].values.tolist())
df1.columns = 'biblio.'+ df1.columns
df2 = pd.DataFrame(df['series'].values.tolist())
df2.columns = 'series.'+ df2.columns
col = df.columns.difference(['biblio','series'])
df = pd.concat([df[col], df1, df2],axis=1)
print (df)
author biblio.c_date biblio.doi biblio.handle \
0 Mehrdad Vahabi NaN n:v:68:y:2018:i
1 Michael Bailey 2017 NaN RePEc:nbr:nberwo:23608
biblio.issue biblio.month biblio.name biblio.number biblio.pages \
0 s January Janos Kornai NaN 27-52
1 NaN NaN Measuring 23608 NaN
biblio.volume biblio.year series.handle series.name
0 68.0 2018 RePEc:aka:aoecon Oeconomica
1 NaN NaN
EDIT:
If the input is JSON, it is possible to use json_normalize:
from pandas.io.json import json_normalize
d = [{"author":"Mehrdad Vahabi","biblio":{"volume":68,"month":"January","name":"Janos Kornai","issue":"s","handle":"n:v:68:y:2018:i","year":"2018","pages":"27-52","doi":""},"series":{"handle":"RePEc:aka:aoecon","name":"Oeconomica"}},{"author":"Michael Bailey","biblio":{"c_date":"2017","number":"23608","handle":"RePEc:nbr:nberwo:23608","name":"Measuring"},"series":{"handle":"","name":""}}]
df = json_normalize(d)
print (df)
author biblio.c_date biblio.doi biblio.handle \
0 Mehrdad Vahabi NaN n:v:68:y:2018:i
1 Michael Bailey 2017 NaN RePEc:nbr:nberwo:23608
biblio.issue biblio.month biblio.name biblio.number biblio.pages \
0 s January Janos Kornai NaN 27-52
1 NaN NaN Measuring 23608 NaN
biblio.volume biblio.year series.handle series.name
0 68.0 2018 RePEc:aka:aoecon Oeconomica
1 NaN NaN
EDIT: The problem is that your dictionaries are strings, so it is first necessary to convert them with ast.literal_eval:
import ast
df = pd.read_csv('probe.csv')
#print (df)
df1 = pd.DataFrame(df['bibliographic'].apply(ast.literal_eval).values.tolist())
df1.columns = 'bibliographic.'+ df1.columns
df2 = pd.DataFrame(df['series'].apply(ast.literal_eval).values.tolist())
df2.columns = 'series.'+ df2.columns
col = df.columns.difference(['bibliographic','series'])
df = pd.concat([df[col], df1, df2],axis=1)
You need to process the columns individually and join them all together to get the format that you need. Here is a simple example that you could follow:
import pandas as pd
records = [{'col1':'v1','col2':{'a1':1,'b1':1},'col3':{'c1':1,'d1':1}},
{'col1':'v2','col2':{'a1':2,'b1':2},'col3':{'c1':2,'d1':2}}]
sample_df = pd.DataFrame(records)
sample_df
col1 col2 col3
0 v1 {'a1': 1, 'b1': 1} {'c1': 1, 'd1': 1}
1 v2 {'a1': 2, 'b1': 2} {'c1': 2, 'd1': 2}
col2_expanded = sample_df.col2.apply(lambda x:pd.Series(x))
col2_expanded.columns = ['{}.{}'.format('col2',i) for i in col2_expanded]
col2_expanded
col2.a1 col2.b1
0 1 1
1 2 2
col3_expanded = sample_df.col3.apply(lambda x:pd.Series(x))
col3_expanded.columns = ['{}.{}'.format('col3',i) for i in col3_expanded]
col3_expanded
col3.c1 col3.d1
0 1 1
1 2 2
final = pd.concat([sample_df[['col1']],col2_expanded,col3_expanded],axis=1)
final
col1 col2.a1 col2.b1 col3.c1 col3.d1
0 v1 1 1 1 1
1 v2 2 2 2 2
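If there are several dict columns, a hedged generalization of the same idea expands every column whose values are all dicts (a sketch, reusing sample_df from above):
# find columns whose every value is a dict, expand each, and keep the rest
dict_cols = [c for c in sample_df.columns
             if sample_df[c].map(lambda v: isinstance(v, dict)).all()]
expanded = [sample_df[c].apply(pd.Series).add_prefix(c + '.') for c in dict_cols]
final = pd.concat([sample_df.drop(columns=dict_cols)] + expanded, axis=1)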
I simply want to get the index column.
import pandas as pd
df1=pd.read_csv(path1, index_col='ID')
df1.head()
VAR1 VAR2 VAR3 OUTCOME
ID
28677 28 1 0.0 0
27170 59 1 0.0 1
39245 65 1 0.0 1
31880 19 1 0.0 0
41441 24 1 0.0 1
I can get ordinary columns, like:
df1["VAR1"]
ID
28677 28
27170 59
39245 65
31880 19
41441 24
31070 77
39334 63
....
38348 23
38278 52
28177 58
but I cannot get the index column:
>>> df1["ID"]
Traceback (most recent call last):
File "C:\Anaconda3\lib\site-packages\pandas\indexes\base.py", line 2134, in get_loc
return self._engine.get_loc(key)
File "pandas\index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas\index.c:4433)
File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:4279)
File "pandas\src\hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13742)
File "pandas\src\hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13696)
KeyError: 'ID'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2059, in __getitem__
return self._getitem_column(key)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2066, in _getitem_column
return self._get_item_cache(key)
File "C:\Anaconda3\lib\site-packages\pandas\core\generic.py", line 1386, in _get_item_cache
values = self._data.get(item)
File "C:\Anaconda3\lib\site-packages\pandas\core\internals.py", line 3543, in get
loc = self.items.get_loc(item)
File "C:\Anaconda3\lib\site-packages\pandas\indexes\base.py", line 2136, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas\index.c:4433)
File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:4279)
File "pandas\src\hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13742)
File "pandas\src\hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13696)
KeyError: 'ID'
I want to get the index column as a list.
How do I do it?
Why do I get the error?
And if I want to merge two dataframes on the index column, how do I do that?
The first column is the index, so to select it use:
print (df1.index)
Int64Index([28677, 27170, 39245, 31880, 41441], dtype='int64', name='ID')
But if the index may be a MultiIndex, use get_level_values:
print (df1.index.get_level_values('ID'))
Int64Index([28677, 27170, 39245, 31880, 41441], dtype='int64', name='ID')
You can use the df.index property:
df.index  # or df.index.values for a numpy array
pd.Series(df.index)  # if you need it as a Series
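For the merge part of the question, a minimal sketch (df2 is a hypothetical second frame that also uses ID as its index):
# join aligns on the index by default
merged = df1.join(df2, how='inner')
# the equivalent with merge, matching index to index
merged = pd.merge(df1, df2, left_index=True, right_index=True)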