Builtin KeyError while using pandas datareader to extract data - Python

I'm using a loop to extract data with pandas datareader. The first two iterations work properly, but from the third iteration on the code starts raising a builtin KeyError, which is unexpected. Since the first two iterations work fine, why does the error only appear from the third one, and how can I fix it?
import pandas as pd
import datetime as dt
import pandas_datareader as web
#====================================================
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
#############
prev=15
endDate=dt.datetime.today().date()
sDate=endDate-pd.to_timedelta(prev,unit='d')
#############
def get_price(tickers): #input is a list or Series
    result=pd.DataFrame()
    for i in tickers:
        df=pd.DataFrame()
        df['Adj Close']=web.DataReader(i,'yahoo',sDate,endDate)['Adj Close']
        df['MA']=df['Adj Close'].rolling(5).mean()
        df.sort_values(ascending=False,inplace=True,by="Date")
        df['Higher?']=df['Adj Close']>df['MA']
        df['Higher?']=df['Higher?'].astype(int)
        result['{}'.format(i)]=df['Higher?']
    return result
#=============================================================================
base_url = "http://www.sectorspdr.com/sectorspdr/IDCO.Client.Spdrs.Holdings/Export/ExportExcel?symbol="
data = {
    'Ticker' : [ 'XLC','XLY','XLP','XLE','XLF','XLV','XLI','XLB','XLRE','XLK','XLU' ],
    'Name' : [ 'Communication Services','Consumer Discretionary','Consumer Staples','Energy','Financials','Health Care','Industrials','Materials','Real Estate','Technology','Utilities' ]
}
spdr_df = pd.DataFrame(data)
print(spdr_df)
for i, row in spdr_df.iterrows():
    url = base_url + row['Ticker']
    df_url = pd.read_excel(url)
    header = df_url.iloc[0]
    holdings_df = df_url[1:]
    holdings_df.set_axis(header, axis='columns', inplace=True)
    holdings_df=holdings_df['Symbol'].replace('.','-')
    a=get_price(holdings_df)
    print(a)
The errors are listed below:
a=get_price(holdings_df)
  File "C:/Users/austi/Desktop/stock&trading/get etf holdings Main Version.py", line 25, in <module>
    df['Adj Close']=web.DataReader(i,'yahoo',sDate,endDate)['Adj Close']
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\util\_decorators.py", line 214, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas_datareader\data.py", line 387, in DataReader
    session=session,
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas_datareader\base.py", line 251, in read
    df = self._read_one_data(self.url, params=self._get_params(self.symbols))
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas_datareader\yahoo\daily.py", line 165, in _read_one_data
    prices["Date"] = to_datetime(to_datetime(prices["Date"], unit="s").dt.date)
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
    indexer = self.columns.get_loc(key)
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\_libs\index.cp36-win32.pyd", line 111, in pandas._libs.index.IndexEngine.get_loc
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\_libs\index.cp36-win32.pyd", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\_libs\hashtable.cp36-win32.pyd", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\_libs\hashtable.cp36-win32.pyd", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
builtins.KeyError: 'Date'

For some tickers, Yahoo returns no 'Date' column. To catch the error and continue, try this code:
def get_price(tickers): #input is a list or Series
    result=pd.DataFrame()
    for i in tickers:
        try:
            df=pd.DataFrame()
            df['Adj Close']=web.DataReader(i,'yahoo',sDate,endDate)['Adj Close']
            df['MA']=df['Adj Close'].rolling(5).mean()
            df.sort_values(ascending=False,inplace=True,by="Date") # sometimes errors
            df['Higher?']=df['Adj Close']>df['MA']
            df['Higher?']=df['Higher?'].astype(int)
            result['{}'.format(i)]=df['Higher?']
        except Exception as ex: # no 'Date' column
            print('Ticker', i, 'ERROR', ex)
            print(df)
    return result
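A side note on the question's symbol handling (an observation, not part of the fix above): Series.replace('.', '-') only replaces cells whose entire value is '.', so holdings such as BRK.B stay unchanged and will also fail on Yahoo, which lists them as BRK-B. A vectorized string replace, used in place of that line, avoids this:

# Sketch: normalize SPDR holding symbols for Yahoo (e.g. 'BRK.B' -> 'BRK-B').
# .str.replace substitutes inside each string; regex=False needs pandas >= 0.23.
symbols = holdings_df['Symbol'].astype(str).str.replace('.', '-', regex=False)
a = get_price(symbols)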

Related

pandas_datareader throwing an error when requesting multiple cryptocurrency datasets in a single request

If I make a call for only one cryptocurrency it works, but for multiple it fails.
import pandas_datareader as pdr
...

# works fine
crypto_df = pdr.DataReader('BTC-USD', data_source='yahoo', start='2015-01-01')

# also works fine
crypto_df = pdr.DataReader('ETH-USD', data_source='yahoo', start='2015-01-01')

# fails
crypto_df = pdr.DataReader(['BTC-USD', 'ETH-USD'], data_source='yahoo', start='2015-01-01')

The last call fails with the following error:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/alex/.local/lib/python3.8/site-packages/pandas/util/_decorators.py", line 199, in wrapper
    return func(*args, **kwargs)
  File "/home/alex/.local/lib/python3.8/site-packages/pandas_datareader/data.py", line 376, in DataReader
    return YahooDailyReader(
  File "/home/alex/.local/lib/python3.8/site-packages/pandas_datareader/base.py", line 258, in read
    df = self._dl_mult_symbols(self.symbols)
  File "/home/alex/.local/lib/python3.8/site-packages/pandas_datareader/base.py", line 285, in _dl_mult_symbols
    result = concat(stocks, sort=True).unstack(level=0)
  File "/home/alex/.local/lib/python3.8/site-packages/pandas/core/frame.py", line 7349, in unstack
    result = unstack(self, level, fill_value)
  File "/home/alex/.local/lib/python3.8/site-packages/pandas/core/reshape/reshape.py", line 417, in unstack
    return _unstack_frame(obj, level, fill_value=fill_value)
  File "/home/alex/.local/lib/python3.8/site-packages/pandas/core/reshape/reshape.py", line 444, in _unstack_frame
    return _Unstacker(
  File "/home/alex/.local/lib/python3.8/site-packages/pandas/core/reshape/reshape.py", line 118, in __init__
    self._make_selectors()
  File "/home/alex/.local/lib/python3.8/site-packages/pandas/core/reshape/reshape.py", line 167, in _make_selectors
    raise ValueError("Index contains duplicate entries, cannot reshape")
ValueError: Index contains duplicate entries, cannot reshape
This works as expected with stocks, but fails with cryptocurrency.
I'm confident this is not an issue on my side, but I am hoping someone can confirm. I will open a ticket with the developers if this is an unknown bug.
You need to fetch each ticker separately and define the column you want to keep:

# Trying to fetch crypto data from Yahoo
import pandas as pd
from pandas_datareader import data as wb

tickers = ['BTC-USD', 'ETH-USD']
crypto_data = pd.DataFrame()
for t in tickers:
    crypto_data[t] = wb.DataReader(t, data_source='yahoo', start='2020-12-01')['Adj Close']

You are missing the ['Adj Close'] selection in your call.
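As a minimal variation on the same idea (a sketch, assuming only the adjusted close is needed), the per-ticker series can also be combined with pd.concat, which sidesteps the duplicate-index unstack path entirely:

import pandas as pd
from pandas_datareader import data as wb

tickers = ['BTC-USD', 'ETH-USD']
# Download each symbol on its own and join the 'Adj Close' series by date;
# the dict keys become the column names.
frames = {t: wb.DataReader(t, data_source='yahoo', start='2020-12-01')['Adj Close']
          for t in tickers}
crypto_data = pd.concat(frames, axis=1)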

pd.Series - "Filename" KeyError

I am having trouble running a script that counts predictions from CSV files in a given directory. The CSV format looks like this:
Sample data
and the code is the following:
import os
from glob import glob
import pandas as pd

def get_count(distribution, keyname):
    try:
        count = distribution[keyname]
    except KeyError:
        count = 0
    return count

main_path = "K:\\...\\folder_name"
folder_paths = glob("%s\\*" % main_path)
data = []
for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
    distribution = results.Predictions.value_counts()
    print(distribution)
    num_of_x = get_count(distribution, "x")
    num_of_y = get_count(distribution, "y")
    num_of_z = get_count(distribution, "z")
    d = {"filename": file_name, "x": num_of_x, "y": num_of_y, "z": num_of_z}
    data.append(d)

df = pd.DataFrame(data=data)
df.to_csv(os.path.join(main_path, "summary_counts.csv"), index=False)
The output error is KeyError: 'Filename', referring to the pd.Series line. Would anyone know how to solve this? I am using Python 3.7.3 and pandas 1.0.5, and I am a beginner in programming.
Many thanks in advance.
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File ".\save_counts.py", line 24, in <module>
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
  File "K:\...\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
    indexer = self.columns.get_loc(key)
  File "K:\...\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Filename'
In here:

for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])

you are creating a pd.Series, but those values exist only inside this for loop. If you want to use the results DataFrame after the loop (e.g. for distribution), you need to use append(): create an empty DataFrame and append each results to it.

final_results = pd.DataFrame()
for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
    final_results = final_results.append(results)
# and from this point you can continue
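Note that DataFrame.append has since been deprecated and was removed in pandas 2.0; on newer versions the same accumulation can be written with pd.concat. A minimal sketch, reusing the names from the question:

import os
from glob import glob
import pandas as pd

main_path = "K:\\...\\folder_name"          # path elided as in the question
folder_paths = glob("%s\\*" % main_path)

frames = []
for path in folder_paths:
    results = pd.read_csv(path, error_bad_lines=False)  # on_bad_lines='skip' on pandas >= 1.3
    results['Label'] = results['Filename'].str.split("\\").str[0]
    frames.append(results)

# Build the combined frame in one call instead of repeated DataFrame.append()
final_results = pd.concat(frames, ignore_index=True)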

Python datetime - Get an interval of dates in a dataframe

I have a dataset in the following format:
date_time,open,close,...
2012-02-01,1307.25,1320.5,...
2012-02-03,1322.5,1339.5,...
....
These data are in a file called Dataset.csv. I read it as follows:
# This is the whole data. I will use it only later
self.data = pd.read_csv('./dataset/Dataset.csv')

# I will get only the indices from the date_time column. This is what I want now
self.sp = pd.read_csv('./dataset/Dataset.csv')
# Set index
self.sp = self.sp.set_index('date_time')
# Save indices
self.sp = self.sp.index
If I print self.sp, here is what I get:
Index(['2012-02-01', '2012-02-02', '2012-02-03', '2012-02-06', '2012-02-07',
'2012-02-08', '2012-02-09', '2012-02-10', '2012-02-13', '2012-02-14',
...
'2019-08-19', '2019-08-20', '2019-08-21', '2019-08-22', '2019-08-23',
'2019-08-26', '2019-08-27', '2019-08-28', '2019-08-29', '2019-08-30'],
dtype='object', name='date_time', length=1960)
I would like to get an interval of values from the date_time column, given a beginning date and a number of days, as follows:
# The initial date
begin = datetime.datetime(2012, 2, 1, 0, 0, 0, 0)

# The total number of days I will get from the dataset is 360,
# starting from the date in the begin variable
trainSize = datetime.timedelta(days=360 * 1).days

# The trainMinLimit will be loaded as the initial date.
# If the initial date cannot be used, add 1 day to it and
# consider that the initial date
trainMinLimit = None
while trainMinLimit is None:
    try:
        trainMinLimit = self.sp.get_loc(begin)
    except:
        begin += datetime.timedelta(1, 0, 0, 0, 0, 0, 0)

# The trainMaxLimit will be loaded as the initial date plus the training size.
# If that date cannot be used, add 1 day to the initial date and try again
trainMaxLimit = None
while trainMaxLimit is None:
    try:
        trainMaxLimit = self.sp.get_loc(begin + trainSize)
    except:
        begin += datetime.timedelta(1, 0, 0, 0, 0, 0, 0)
When I run this code, I have the following error:
trainMinLimit = self.sp.get_loc(begin)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/indexes/base.py", line 2659, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 127, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 153, in pandas._libs.index.IndexEngine._get_loc_duplicates
  File "pandas/_libs/index.pyx", line 170, in pandas._libs.index.IndexEngine._maybe_get_bool_indexer
KeyError: datetime.date(2012, 2, 1)
Here Python does not understand, in get_loc(), how the begin variable (a datetime) can index the dataframe of dates. How can I use a datetime variable to get the position it occupies in a pandas index of dates?
Edit: as suggested, I tried to convert the index to datetime format as follows:
self.sp = pd.read_csv('./dataset/Dataset.csv')
self.sp = self.sp.set_index('date_time')
self.sp = pd.to_datetime(self.sp.index)
And I have the following error:
self.sp = pd.to_datetime(self.sp.index)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/tools/datetimes.py", line 603, in to_datetime
    result = convert_listlike(arg, box, format)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/tools/datetimes.py", line 302, in _convert_listlike_datetimes
    allow_object=True)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/arrays/datetimes.py", line 1866, in objects_to_datetime64ns
    raise e
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/arrays/datetimes.py", line 1857, in objects_to_datetime64ns
    require_iso8601=require_iso8601
  File "pandas/_libs/tslib.pyx", line 460, in pandas._libs.tslib.array_to_datetime
  File "pandas/_libs/tslib.pyx", line 685, in pandas._libs.tslib.array_to_datetime
  File "pandas/_libs/tslib.pyx", line 809, in pandas._libs.tslib.array_to_datetime_object
  File "pandas/_libs/tslib.pyx", line 803, in pandas._libs.tslib.array_to_datetime_object
  File "pandas/_libs/tslibs/parsing.pyx", line 99, in pandas._libs.tslibs.parsing.parse_datetime_string
  File "/usr/local/lib/python3.5/dist-packages/dateutil/parser/_parser.py", line 1374, in parse
    return DEFAULTPARSER.parse(timestr, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/dateutil/parser/_parser.py", line 649, in parse
    raise ParserError("Unknown string format: %s", timestr)
dateutil.parser._parser.ParserError: Unknown string format: date_time
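No accepted fix appears in this thread; the following is only a sketch of one way to do the lookup, assuming date_time holds ISO-formatted dates (the file path and column name are taken from the question):

import datetime
import pandas as pd

# Parse date_time while reading so the index becomes a DatetimeIndex
sp = pd.read_csv('./dataset/Dataset.csv', parse_dates=['date_time'])
idx = sp.set_index('date_time').index

begin = pd.Timestamp(2012, 2, 1)
train_size = datetime.timedelta(days=360)

# get_loc accepts a Timestamp once the index is datetime-typed;
# step forward a day at a time if the exact date is not in the index
# (a real implementation should also guard against running past the last date)
while begin not in idx:
    begin += datetime.timedelta(days=1)
trainMinLimit = idx.get_loc(begin)

end = begin + train_size
while end not in idx:
    end += datetime.timedelta(days=1)
trainMaxLimit = idx.get_loc(end)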

Pandas datareader failure

I want to get all the stocks from the S&P 500 into a folder in CSV format.
While scanning the S&P 500 everything works great, but in some cases the 'Date' index seems to be missing, because the stock doesn't exist or has no data for a specific period. Whatever I tried, changing the start and end dates had no effect. In an earlier post I was told to filter those cases with an exception, but since Python is new territory for me I felt like an alien... Is there someone who can help me?
If this error occurs:
/home/mu351i/PycharmProjects/untitled/venv/bin/python /home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py
Traceback (most recent call last):
  File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2897, in get_loc
    return self._engine.get_loc(key)
  File "pandas/_libs/index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 44, in get_data_from_yahoo
    df = web.DataReader(ticker, 'yahoo', start, end)
  File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/util/_decorators.py", line 208, in wrapper
    return func(*args, **kwargs)
  File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/data.py", line 387, in DataReader
    session=session,
  File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/base.py", line 251, in read
    df = self._read_one_data(self.url, params=self._get_params(self.symbols))
  File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/yahoo/daily.py", line 165, in _read_one_data
    prices["Date"] = to_datetime(to_datetime(prices["Date"], unit="s").dt.date)
  File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 2995, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2899, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 57, in <module>
    get_data_from_yahoo()
  File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 48, in get_data_from_yahoo
    except RemoteDataError:
NameError: name 'RemoteDataError' is not defined

Process finished with exit code 1
How would you avoid this by changing the code below?
import datetime as dt
import os
import pickle
import bs4 as bs
import pandas_datareader.data as web
import requests

def safe_sp500_tickers():
    resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text.strip()
        tickers.append(ticker)
    with open('sp500tickers.pickle', 'wb') as f:
        pickle.dump(tickers, f)
    return tickers

safe_sp500_tickers()

def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = safe_sp500_tickers()
    else:
        with open('sp500tickers.pickle', 'rb') as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')
    start = dt.datetime(1999, 1, 1)
    end = dt.datetime(2019, 12, 19)
    for ticker in tickers:
        try:
            if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
                df = web.DataReader(ticker, 'yahoo', start, end)
                df.to_csv('stock_dfs/{}.csv'.format(ticker))
            else:
                print("Ticker data for {} already available".format(ticker))
        except RemoteDataError:
            print("No information for ticker '%s'" % ticker)
            continue
        except KeyError:
            print("No Date for ticker: " + ticker)
            continue

get_data_from_yahoo()
A commenter asked for a data sample; this is data from TSLA.csv:
Date,High,Low,Open,Close,Volume,Adj Close
2010-06-29,25.0,17.540000915527344,19.0,23.889999389648438,18766300,23.889999389648438
2010-06-30,30.420000076293945,23.299999237060547,25.790000915527344,23.829999923706055,17187100,23.829999923706055
2010-07-01,25.920000076293945,20.270000457763672,25.0,21.959999084472656,8218800,21.959999084472656
2010-07-02,23.100000381469727,18.709999084472656,23.0,19.200000762939453,5139800,19.200000762939453
2010-07-06,20.0,15.829999923706055,20.0,16.110000610351562,6866900,16.110000610351562
2010-07-07,16.6299991607666,14.979999542236328,16.399999618530273,15.800000190734863,6921700,15.800000190734863
2010-07-08,17.520000457763672,15.569999694824219,16.139999389648438,17.459999084472656,7711400,17.459999084472656
2010-07-09,17.899999618530273,16.549999237060547,17.579999923706055,17.399999618530273,4050600,17.399999618530273
2010-07-12,18.06999969482422,17.0,17.950000762939453,17.049999237060547,2202500,17.049999237060547
2010-07-13,18.639999389648438,16.899999618530273,17.389999389648438,18.139999389648438,2680100,18.139999389648438
2010-07-14,20.149999618530273,17.760000228881836,17.940000534057617,19.84000015258789,4195200,19.84000015258789
2010-07-15,21.5,19.0,19.940000534057617,19.889999389648438,3739800,19.889999389648438
2010-07-16,21.299999237060547,20.049999237060547,20.700000762939453,20.639999389648438,2621300,20.639999389648438
Please provide constructive feedback because I'm new here.
Thanks :)
You are missing an import. Add the following import at the top of your script:
from pandas_datareader._utils import RemoteDataError
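For completeness, a minimal sketch of how the import slots into the download loop (only the import line is the actual fix; ticker, start and end here are example values, not from the answer):

import datetime as dt
from pandas_datareader._utils import RemoteDataError
import pandas_datareader.data as web

start = dt.datetime(1999, 1, 1)
end = dt.datetime(2019, 12, 19)
ticker = 'TSLA'  # example ticker

try:
    df = web.DataReader(ticker, 'yahoo', start, end)
except RemoteDataError:
    # RemoteDataError is now a defined name, so this handler can run
    print("No information for ticker '%s'" % ticker)
except KeyError:
    print("No Date for ticker: " + ticker)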
Another suggestion, using pandas to pull the ticker table directly:

import pandas as pd

df = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
sort = pd.DataFrame(df).sort_values(by=['Date first added'])
sort['Date first added'] = pd.to_datetime(sort['Date first added'])

start_date = '1-1-1999'
end_date = '11-12-2019'
mask = (sort['Date first added'] > start_date) & (sort['Date first added'] <= end_date)
sort = sort.loc[mask]

pd.DataFrame(sort).to_csv('result.csv', index=False)
The resulting result.csv lists the S&P 500 constituents whose 'Date first added' falls between the chosen start and end dates.

Python Outer join

The below code is used to calculate statistical values.
import re
from pathlib import Path
import pandas as pd

def prepare_values(df):
    df_columns = ['frame.time_delta_displayed', 'frame.len']
    df_values = []
    for col in df_columns:
        df_values += [
            df[col].max(),
            df[col].min(),
            df[col].std(),
            df[col].quantile(0.25),
            df[col].quantile(0.5),
            df[col].quantile(0.75),
            df[col].mean(),
            df[col].mad(),
            df[col].var(),
            df[col].skew(),
            df[col].kurtosis(),
            df[col].sum(),
        ]
    return df_values

source_dir = Path('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/InOutFiltered')

in_data = []
for file in source_dir.glob('**/*.in.csv'):
    activity = {'activity': file.stem.split('.')[0]}
    df = pd.read_csv(file)
    cols = ['maxTimeIn', 'minTimeIn', 'stdTimeIn', 'q1TimeIn', 'q2TimeIn', 'q3TimeIn', 'meanTimeIn', 'madTimeIn', 'varianceTimeIn', 'skewTimeIn', 'kurtosisTimeIn', 'sumTimeIn', 'maxLenIn', 'minLenIn', 'stdLenIn', 'q1LenIn', 'q2lenIn', 'q3LenIn', 'meanLenIn', 'madLenIn', 'varianceLenIn', 'skewLenIn', 'kurtosisLenIn', 'sumLenIn']
    values = prepare_values(df)
    file_data = {**activity, **dict(zip(cols, values))}
    in_data.append(file_data)

out_data = []
for file in source_dir.glob('**/*.out.csv'):
    activity = {'activity': file.stem.split('.')[0]}
    df = pd.read_csv(file)
    cols = ['maxTimeOut', 'minTimeOut', 'stdTimeOut', 'q1TimeOut', 'q2TimeOut', 'q3TimeOut', 'meanTimeOut', 'madTimeOut', 'varianceTimeOut', 'skewTimeOut', 'kurtosisTimeOut', 'sumTimeOut', 'maxLenOut', 'minLenOut', 'stdLenOut', 'q1LenOut', 'q2LenOut', 'q3LenOut', 'meanLenOut', 'madLenOut', 'varianceLenOut', 'skewLenOut', 'kurtosisLenOut', 'sumLenOut']
    values = prepare_values(df)
    file_data = {**activity, **dict(zip(cols, values))}
    out_data.append(file_data)

in_df = pd.DataFrame(in_data)
out_df = pd.DataFrame(out_data)
all_df = in_df.join(out_df.set_index('activity'), on='activity', how='outer')
all_df.dropna(subset=all_df.columns.tolist()[1:], how='all', inplace=True)
all_df.fillna(0, inplace=True)
all_df['activity'] = all_df['activity'].apply(lambda x: re.sub(r'^([a-zA-Z]+).*', r'\1', x))
all_df.to_csv('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/AllDataNew.csv', index=False)
I am getting an error. Can't figure out what it means.
Traceback (most recent call last):
  File "/root/PycharmProjects/AppAct/StatisticCal.py", line 48, in <module>
    all_df = in_df.join(out_df.set_index('activity'), on='activity', how='outer')
  File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 4178, in set_index
    level = frame[col]._values
  File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'activity'
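The thread shows no accepted answer here; one plausible cause (an inference from the traceback, not confirmed in the thread) is that out_df is empty: if source_dir contains no '*.out.csv' files, out_data stays an empty list, pd.DataFrame(out_data) has no 'activity' column, and set_index('activity') raises exactly this KeyError. A small guard makes that case visible:

# Sketch: guard against an empty out_df before joining (assumed cause, not confirmed)
if out_df.empty:
    print('No *.out.csv files found under', source_dir)
    all_df = in_df.copy()
else:
    all_df = in_df.join(out_df.set_index('activity'), on='activity', how='outer')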
