Python class method chaining: Unexpected results with pandas - python

While investigating method chaining within a python class for a project I'm starting, I encountered this unexpected error. Wonder if someone can explain what might be happening in method3() below with the error thrown by the commented-out code which calculates df['col6'] using df['col5'] ... for some reason any reference to col5 seems to throw errors.
import pandas as pd
class mchain:
def __init__(self, df):
self._df = df
#property
def df(self):
return self._df
def method1(self, msg):
print(msg)
self.df['col4'] = self.df.apply(lambda x: x['col1']+x['col3'], axis=1)
return self
def method2(self, msg):
print(msg)
self.df['col5'] = self.df.apply(lambda x: x['col4']**2, axis=1)
return self
def method3(self, msg):
print(msg)
self.df['col6'] = self.df.apply(lambda x: x['col4']*2, axis=1)
#self.df['col6'] = self.df.apply(lambda x: x['col5']*2, axis=1)
print(self.df.columns)
return self
def method4(self):
print(self.df.head().to_string())
return self
if __name__ == '__main__':
df = pd.DataFrame({'col1': [1,2,3,4,5], 'col2': ['a', 'b', 'c', 'd', 'e'], 'col3': [3,3,2,4,6]})
a = mchain(df)
print(a.method1('beginning').method3('end'))
print(a.method1('beginning').method2('middle').method3('end').method4())
Here's the error traceback when I uncomment the code in method3 ..
(calculator) [Mon Sep 20 11:20:18]:Project python3 ./utils/methodchainingtest.py
beginning
end
Traceback (most recent call last):
File "/usr/local/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc
return self._engine.get_loc(casted_key)
File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 4554, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 4562, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'col5'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/zzz/Documents/repos/Project/./utils/methodchainingtest.py", line 40, in <module>
print(a.method1('beginning').method3('end'))
File "/Users/zzz/Documents/repos/Project/./utils/methodchainingtest.py", line 27, in method3
self.df['col6'] = self.df.apply(lambda x: x['col5']*2, axis=1)
File "/usr/local/lib/python3.9/site-packages/pandas/core/frame.py", line 7768, in apply
return op.get_result()
File "/usr/local/lib/python3.9/site-packages/pandas/core/apply.py", line 185, in get_result
return self.apply_standard()
File "/usr/local/lib/python3.9/site-packages/pandas/core/apply.py", line 276, in apply_standard
results, res_index = self.apply_series_generator()
File "/usr/local/lib/python3.9/site-packages/pandas/core/apply.py", line 290, in apply_series_generator
results[i] = self.f(v)
File "/Users/zzz/Documents/repos/Project/./utils/methodchainingtest.py", line 27, in <lambda>
self.df['col6'] = self.df.apply(lambda x: x['col5']*2, axis=1)
File "/usr/local/lib/python3.9/site-packages/pandas/core/series.py", line 853, in __getitem__
return self._get_value(key)
File "/usr/local/lib/python3.9/site-packages/pandas/core/series.py", line 961, in _get_value
loc = self.index.get_loc(label)
File "/usr/local/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3082, in get_loc
raise KeyError(key) from err
KeyError: 'col5'
(calculator) [Mon Sep 20 11:34:31]:Project

Related

Invalid comparison between dtype=datetime64[ns, UTC-03:00] and datetime

This problem is happening in an etl creation, It's a problem of date transformation, using panda. Can someone help me?
df_delivered = df.loc[(df['state'] == 2) | (df['state'] == 4) | (df['state'] == 7) | (df['direction'] == 1) | (df['delivered_at'] > last_update)]
df_delivered['Delivered'] = 1
df1 = df.merge(df_delivered, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df1.drop(df1.filter(regex='_y$').columns, axis=1, inplace=True)
df = df1
and have been given this error:
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/arrays/datetimelike.py", line 1008, in \_cmp_method
other = self.\_validate_comparison_value(other)
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/arrays/datetimelike.py", line 543, in \_validate_comparison_value
raise InvalidComparison(other) from err
pandas.core.arrays.datetimelike.InvalidComparison: 2019-01-01 00:00:00
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/task/task_runner/standard_task_runner.py", line 85, in \_start_by_fork
args.func(args, dag=self.dag)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/cli/cli_parser.py", line 48, in command
return func(\*args, \*\*kwargs)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/utils/cli.py", line 92, in wrapper
return f(\*args, \*\*kwargs)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/cli/commands/task_command.py", line 292, in task_run
\_run_task_by_selected_method(args, dag, ti)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/cli/commands/task_command.py", line 107, in \_run_task_by_selected_method
\_run_raw_task(args, ti)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/cli/commands/task_command.py", line 180, in \_run_raw_task
ti.\_run_raw_task(
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/utils/session.py", line 70, in wrapper
return func(\*args, session=session, \*\*kwargs)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/models/taskinstance.py", line 1332, in \_run_raw_task
self.\_execute_task_with_callbacks(context)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/models/taskinstance.py", line 1458, in \_execute_task_with_callbacks
result = self.\_execute_task(context, self.task)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/models/taskinstance.py", line 1509, in \_execute_task
result = execute_callable(context=context)
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/operators/python.py", line 151, in execute
return_value = self.execute_callable()
File "/home/airflow/.local/lib/python3.9/site-packages/airflow/operators/python.py", line 162, in execute_callable
return self.python_callable(\*self.op_args, \*\*self.op_kwargs)
File "/opt/airflow/etl/dw_reports/etl_communication_customer_engagement.py", line 62, in etl_communication_customer_engagement
df_delivered = df.loc\[(df\['state'\] == 2) | (df\['state'\] == 4) | (df\['state'\] == 7) | (df\['direction'\] == 1) | (df\['delivered_at'\] \> last_update)\]
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/ops/common.py", line 69, in new_method
return method(self, other)
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/arraylike.py", line 48, in __gt__
return self.\_cmp_method(other, operator.gt)
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/series.py", line 5502, in \_cmp_method
res_values = ops.comparison_op(lvalues, rvalues, op)
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/ops/array_ops.py", line 270, in comparison_op
res_values = op(lvalues, rvalues)
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/ops/common.py", line 69, in new_method
return method(self, other)
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/arraylike.py", line 48, in __gt__
return self.\_cmp_method(other, operator.gt)
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/arrays/datetimelike.py", line 1010, in \_cmp_method
return invalid_comparison(self, other, op)
File "/home/airflow/.local/lib/python3.9/site-packages/pandas/core/ops/invalid.py", line 34, in invalid_comparison
raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}")
TypeError: Invalid comparison between dtype=datetime64\[ns, UTC-03:00\] and datetime
\[2023-01-09, 13:54:11 UTC\] {local_task_job.py:154} INFO - Task exited with return code 1
\[2023-01-09, 13:54:11 UTC\] {local_task_job.py:264} INFO - 0 downstream tasks scheduled from follow-on schedule check\`
seems like this comparison might be problematic:
(df['delivered_at'] > last_update)
if last_update here is passed value it needs to be of the same/comparable type as data in df['delivered_at'] column.
btw. this can be achieved with:
df.loc[:, ('delivered_at')] = pandas.to_datetime(df.loc[:, ('delivered_at')])

iterate a dataframe

I'm trying to iterate a dataframe to call queries in mongodb from a list and save each query in a csv file. I have the connection with no errors, but when I iterate it just creates the frist file (0.csv) and I have an error for the second row of the dataframe.
This is my code:
sql = [
('tran','transactions',{"den": "00100002773060"}),
('tran','Data',{'name': 'john'}),
]
df = pd.DataFrame(sql, columns = ["database", "entity", "sql"])
for i in range(len(df)):
database = df.iloc[i]["database"]
entity=df.iloc[i]["entity"]
myquery=df.iloc[i]["sql"]
collection = client[database][entity]
try:
mydoc = list(collection.find(myquery))
if len(mydoc) > 0:
df = pd.DataFrame(mydoc)
df.pop("_id")
df.to_csv(str(i) + '.csv')
print("file saved")
except:
print("error on file")
and this the error
Traceback (most recent call last):
File "/home/r/Desktop/table_csv/entorno_virtual/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3629, in get_loc
return self._engine.get_loc(casted_key)
File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'database'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "getSql.py", line 12, in <module>
database = df.iloc[i]["database"]
File "/home/r/Desktop/table_csv/entorno_virtual/lib/python3.8/site-packages/pandas/core/series.py", line 958, in __getitem__
return self._get_value(key)
File "/home/r/Desktop/table_csv/entorno_virtual/lib/python3.8/site-packages/pandas/core/series.py", line 1069, in _get_value
loc = self.index.get_loc(label)
File "/home/r/Desktop/table_csv/entorno_virtual/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3631, in get_loc
raise KeyError(key) from err
KeyError: 'database'
from what I can see here you are changing your df variable here
df = pd.DataFrame(mydoc)
probably just rename it

saving coordinates from Dataframe as Polygons (shapely.geometry) AttributeError

I want to create a Polygon from a list of coordinates:
import pandas as pd
from shapely.geometry import Point, Polygon
data = pd.read_csv('path.csv', sep=';')
the data is in the following format
Suburb
features_geometry_x
features_geometry_y
1
50.941840
6.9595637
1
50.941845
6.9595698
3
50.94182
6.9595632
4
50.9418837
6.9595958
with several rows for suburb 1, 3 and 4
#create a polygon
I = data.loc[data['Suburb'] == 1]
I['coordinates'] = list(zip(I['features_geometry_x'], I['features_geometry_y']))
poly_i = Polygon(I['coordinates'])
the code above works fine but if I do the same thing for suburb 3 and 4 it yields the following error:
L = data.loc[data['Suburb'] == 3]
L['coordinates'] = list(zip(L['features_geometry_x'], L['features_geometry_y']))
poly_l = Polygon(L['coordinates'])
File "shapely/speedups/_speedups.pyx", line 252, in shapely.speedups._speedups.geos_linearring_from_py
File "/Users/Jojo/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py", line 5487, in getattr
return object.getattribute(self, name)
AttributeError: 'Series' object has no attribute 'array_interface'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/Jojo/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3361, in get_loc
return self._engine.get_loc(casted_key)
File "pandas/_libs/index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 2131, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 2140, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/var/folders/j6/wgg72kmx145f3krf14nzjfq40000gn/T/ipykernel_4092/214655495.py", line 3, in
poly_l = Polygon(Lindenthal['coordinates'])
File "/Users/Jojo/opt/anaconda3/lib/python3.8/site-packages/shapely/geometry/polygon.py", line 261, in init
ret = geos_polygon_from_py(shell, holes)
File "/Users/Jojo/opt/anaconda3/lib/python3.8/site-packages/shapely/geometry/polygon.py", line 539, in geos_polygon_from_py
ret = geos_linearring_from_py(shell)
File "shapely/speedups/_speedups.pyx", line 344, in shapely.speedups._speedups.geos_linearring_from_py
File "/Users/Jojo/opt/anaconda3/lib/python3.8/site-packages/pandas/core/series.py", line 942, in getitem
return self._get_value(key)
File "/Users/Jojo/opt/anaconda3/lib/python3.8/site-packages/pandas/core/series.py", line 1051, in _get_value
loc = self.index.get_loc(label)
File "/Users/Jojo/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 0
Please help :)
I think the issue here is that you need more than one data point to create a polygon where as your suburb 2 and 3 each got only a single point.

Issues to access to a list in Pandas dataframe - RESOLVED

I am currently working with list in a dataframe defined as follow :
class DATA():
def __init__(self):
self.BDD_EEC = pd.DataFrame([],columns = ['Nom','Nmbre Param','Param', 'Units','Fixe','Min Param', 'Max Param','Entry', 'Equation'])
## R+R/C
n=0
self.BDD_EEC.at[n,'Nom'] = 'Re+R1/C1'
self.BDD_EEC.at[n,'Nombre Param'] = 3
self.BDD_EEC.at[n,'Param'] = ['Re','C1','R1']
self.BDD_EEC.at[n,'Units'] = ['Ohm','F','Ohm']
self.BDD_EEC.at[n,'Fixe'] = [0,0,0]
self.BDD_EEC.at[n,'Min Param'] = [0,1e-10,0]
self.BDD_EEC.at[n,'Max Param'] = [1000,1,1e10]
self.BDD_EEC.at[n,'Equation'] = 'Re+(R1)/(1+1j*2*np.pi*f*R1*Q1)'
#### R+R/Q
n = 1
self.BDD_EEC.at[n,'Nom'] = 'Re+R1/Q1'
self.BDD_EEC.at[n,'Nombre Param'] = 4
self.BDD_EEC.at[n,'Param'] = ['Re','Q1', 'a1', 'R1']
self.BDD_EEC.at[n,'Units'] = ['Ohm','F.s^(a-1)','--','Ohm']
self.BDD_EEC.at[n,'Fixe'] = [0,0,0,0]
self.BDD_EEC.at[n,'Min Param'] = [0.001, 1e-10, 0.001, 0.001]
self.BDD_EEC.at[n,'Max Param'] = [200, 1, 1, 1e6]
self.BDD_EEC.at[n,'Equation'] = 'Re+(R1)/(1+1j*2*np.pi*f*np.power(R1*Q1,a1))'
After creating the object 'mes_datas', I try to access to the list ['Param'] thanks to the name, it works well :
mes_datas.BDD_EEC['Param'][mes_datas.BDD_EEC['Nom'] == 'Re+R1/C1'][0]
Out[26]: ['Re', 'C1', 'R1']
but with the other name, I have this error :
mes_datas.BDD_EEC['Param'][mes_datas.BDD_EEC['Nom'] == 'Re+R1/Q1'][0]
Traceback (most recent call last):
File "C:\Users\cboissy\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2895, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1032, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1039, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<ipython-input-27-1bad18a3e796>", line 1, in <module>
mes_datas.BDD_EEC['Param'][mes_datas.BDD_EEC['Nom'] == 'Re+R1/Q1'][0]
File "C:\Users\cboissy\anaconda3\lib\site-packages\pandas\core\series.py", line 882, in __getitem__
return self._get_value(key)
File "C:\Users\cboissy\anaconda3\lib\site-packages\pandas\core\series.py", line 989, in _get_value
loc = self.index.get_loc(label)
File "C:\Users\cboissy\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2897, in get_loc
raise KeyError(key) from err
KeyError: 0
I am quite confuse... Does anyone can help ?
Thanks
You're getting KeyError because the returning index is 1, not 0. You can use .iat[0] to get first result:
print(mes_datas.BDD_EEC.loc[mes_datas.BDD_EEC["Nom"] == "Re+R1/Q1", "Param"].iat[0])
Prints:
['Re', 'Q1', 'a1', 'R1']

Pandas error when using resample and groupby to apply function

I am new to python. I used to code in R and work with period_apply function. So I tried the following approaches in python below.
First, I do not understand what the errors are trying to tell me.
Second, I do not understand why I only get errors with groupby if I include the first row of the data. Yet with resample, i get error no matter whether I include the first row or not.
Third, how do I resolve this problem, please do not tell me skip the first row, because I work with a much much bigger dataset
Data
Best_Bid Best_Ask
Timestamp
2019-05-02 11:59:59.602 29636.0 29638.0
2019-05-02 12:59:00.033 NaN NaN
2019-05-02 12:59:00.033 NaN NaN
2019-05-02 12:59:00.033 NaN NaN
2019-05-02 12:59:00.033 NaN NaN
2019-05-02 12:59:00.033 NaN NaN
2019-05-02 12:59:00.033 NaN NaN
{'Best_Bid': {Timestamp('2019-05-02 11:59:59.602000'): 29636.0,
Timestamp('2019-05-02 12:59:00.033000'): nan},
'Best_Bid_Q': {Timestamp('2019-05-02 11:59:59.602000'): 4.0,
Timestamp('2019-05-02 12:59:00.033000'): nan},
'Best_Ask': {Timestamp('2019-05-02 11:59:59.602000'): 29638.0,
Timestamp('2019-05-02 12:59:00.033000'): nan}}
And I am trying to apply the below function(I know I could have just done .agg({'Best_Bid':['last']}) but this is a simplified version of my original code).
Function
def func(x):
best_bid = (x['Best_Bid'])[-1]
best_ask = (x['Best_Ask'])[-1]
return pd.Series([best_bid,best_ask], index=['bbbid', 'aaask'])
groupby and grouper
If I skip the first row and run. Things work fine.
df.iloc[1:,:].groupby(pd.Grouper(freq='180S',closed='right',label='right',base=-0.0001)).apply(func)
bbbid aaask
Timestamp
2019-05-02 12:59:59.999899904 NaN NaN
However, if i include the first row, I got the following error.
df.groupby(pd.Grouper(freq='180S',closed='right',label='right',base=-0.0001)).apply(func)
Traceback (most recent call last):
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 471, in pandas._libs.index.DatetimeEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 997, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1004, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: -1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-191-85550b07b869>", line 1, in <module>
df.iloc[371448:371455,0:3].groupby(pd.Grouper(freq='180S',closed='right',label='right',base=-0.0001)).apply(func)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\groupby.py", line 735, in apply
result = self._python_apply_general(f)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\groupby.py", line 751, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\ops.py", line 206, in apply
res = f(group)
File "<ipython-input-104-c57c7e2b6885>", line 2, in func
best_bid = (x['Best_Bid'])[-1]
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\datetimes.py", line 651, in get_value
value = Index.get_value(self, series, key)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4411, in get_value
return libindex.get_value_at(s, key)
File "pandas\_libs\index.pyx", line 44, in pandas._libs.index.get_value_at
File "pandas\_libs\index.pyx", line 45, in pandas._libs.index.get_value_at
File "pandas\_libs\util.pxd", line 98, in pandas._libs.util.get_value_at
File "pandas\_libs\util.pxd", line 89, in pandas._libs.util.validate_indexer
IndexError: index out of bounds
Traceback (most recent call last):
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 471, in pandas._libs.index.DatetimeEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 997, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1004, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: -1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-191-85550b07b869>", line 1, in <module>
df.iloc[371448:371455,0:3].groupby(pd.Grouper(freq='180S',closed='right',label='right',base=-0.0001)).apply(func)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\groupby.py", line 735, in apply
result = self._python_apply_general(f)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\groupby.py", line 751, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\ops.py", line 206, in apply
res = f(group)
File "<ipython-input-104-c57c7e2b6885>", line 2, in func
best_bid = (x['Best_Bid'])[-1]
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\datetimes.py", line 651, in get_value
value = Index.get_value(self, series, key)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4411, in get_value
return libindex.get_value_at(s, key)
File "pandas\_libs\index.pyx", line 44, in pandas._libs.index.get_value_at
File "pandas\_libs\index.pyx", line 45, in pandas._libs.index.get_value_at
File "pandas\_libs\util.pxd", line 98, in pandas._libs.util.get_value_at
File "pandas\_libs\util.pxd", line 89, in pandas._libs.util.validate_indexer
IndexError: index out of bounds
Resample
I got the following error regardless of including the first row or not.
df.resample(rule='180S',closed='right',label='right',base=-0.0001).agg(func)
Traceback (most recent call last):
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4411, in get_value
return libindex.get_value_at(s, key)
File "pandas\_libs\index.pyx", line 44, in pandas._libs.index.get_value_at
File "pandas\_libs\index.pyx", line 45, in pandas._libs.index.get_value_at
File "pandas\_libs\util.pxd", line 98, in pandas._libs.util.get_value_at
File "pandas\_libs\util.pxd", line 83, in pandas._libs.util.validate_indexer
TypeError: 'str' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\datetimes.py", line 651, in get_value
value = Index.get_value(self, series, key)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4419, in get_value
raise e1
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 473, in pandas._libs.index.DatetimeEngine.get_loc
File "pandas\_libs\index.pyx", line 479, in pandas._libs.index.DatetimeEngine._date_check_type
KeyError: 'Best_Bid'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pandas\_libs\tslibs\conversion.pyx", line 520, in pandas._libs.tslibs.conversion.convert_str_to_tsobject
File "pandas\_libs\tslibs\parsing.pyx", line 228, in pandas._libs.tslibs.parsing.parse_datetime_string
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\dateutil\parser\_parser.py", line 1374, in parse
return DEFAULTPARSER.parse(timestr, **kwargs)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\dateutil\parser\_parser.py", line 649, in parse
raise ParserError("Unknown string format: %s", timestr)
dateutil.parser._parser.ParserError: Unknown string format: Best_Bid
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\datetimes.py", line 660, in get_value
return self.get_value_maybe_box(series, key)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\datetimes.py", line 675, in get_value_maybe_box
key = Timestamp(key)
File "pandas\_libs\tslibs\timestamps.pyx", line 418, in pandas._libs.tslibs.timestamps.Timestamp.__new__
File "pandas\_libs\tslibs\conversion.pyx", line 292, in pandas._libs.tslibs.conversion.convert_to_tsobject
File "pandas\_libs\tslibs\conversion.pyx", line 523, in pandas._libs.tslibs.conversion.convert_str_to_tsobject
ValueError: could not convert string to Timestamp
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-190-d2caa0c5152a>", line 1, in <module>
df.iloc[371448:371455,0:3].resample(rule='180S',closed='right',label='right',base=-0.0001).agg(func)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\resample.py", line 285, in aggregate
result = self._groupby_and_aggregate(how, grouper, *args, **kwargs)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\resample.py", line 359, in _groupby_and_aggregate
result = grouped._aggregate_item_by_item(how, *args, **kwargs)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\generic.py", line 1172, in _aggregate_item_by_item
result[item] = colg.aggregate(func, *args, **kwargs)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\generic.py", line 269, in aggregate
result = self._aggregate_named(func, *args, **kwargs)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\groupby\generic.py", line 452, in _aggregate_named
output = func(group, *args, **kwargs)
File "<ipython-input-104-c57c7e2b6885>", line 2, in func
best_bid = (x['Best_Bid'])[-1]
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "C:\Users\testUser\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\datetimes.py", line 662, in get_value
raise KeyError(key)
KeyError: 'Best_Bid'
From what I can tell you are getting an error with the (x['Best_Bid'])[-1] because it is returning a KeyError: -1
Your apply function is iterating though each element (x) from the column (Best_Bid and Bid_Ask) and trying to grab the last index from the element (x) which doesn't make sense.
I don't have your dataset in front of me to work with but I would try this code to see if it works.
gdf = df.groupby(pd.Grouper(freq='180S',closed='right',label='right',base=-0.0001)).copy()
print(gdf['Best_Bid'][gdf.index[-1]],gdf.index[-1])
print(gdf['Best_Ask'][gdf.index[-1]],gdf.index[-1])
Now this code can definitely be simplified but it should work for all rows and it will be much faster than the .apply method if it is a large dataset.

Categories

Resources