Pandas To_Excel parsing problem - outputting only 1 file - python

Hello, I have working code like this:
import pandas as pd
from pandas.io.json import json_normalize
import json
import warnings

warnings.filterwarnings('ignore')

with open('yieldfull.json') as file:
    data = json.load(file)

df_json = json_normalize(data)
df_json_stripped = data[0]
platform_dict = df_json_stripped['result']
platform_names = []
for key in platform_dict:
    platform_names.append(key)

for name in platform_names:
    if name == 'Autofarm':
        vault_name_df = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.'+name+'.LPVaults.vaults'].items()]))[0])['name']
        current_token_0 = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.'+name+'.LPVaults.vaults'].items()]))[0])['LPInfo.currentToken0']
        current_token_1 = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.'+name+'.LPVaults.vaults'].items()]))[0])['LPInfo.currentToken1']
        df_json = pd.DataFrame({'Vault_Name': vault_name_df, 'Current_Token_0': current_token_0, 'Current_Token_1': current_token_1})
        df_json.to_excel('Output_'+name+'.xlsx', index=False)
        platform_names.remove(name)
    elif name == 'Acryptos':
        vault_name_df = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.'+name+'.vaults.vaults'].items()]))[0])['name']
        price_USD = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.'+name+'.vaults.vaults'].items()]))[0])['priceInUSDDepositToken']
        current_token_0 = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.'+name+'.vaults.vaults'].items()]))[0])['currentTokens']
        deposited_token = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.'+name+'.vaults.vaults'].items()]))[0])['depositedTokens']
        df_json = pd.DataFrame({'Vault_Name': vault_name_df, 'Price_USD': price_USD, 'Current_Token_0': current_token_0, 'Deposited_Token': deposited_token})
        df_json.to_excel('Output_'+name+'.xlsx', index=False)
    else:
        pass
The problem is: if I leave it like this, it only outputs a file for the first if branch. When I comment out that if section, it successfully outputs the elif branch, but I can't get it to output both files whatever I do. Any ideas?
Error I'm getting for Acryptos:
Traceback (most recent call last):
File "C:\Users\Adam\PycharmProjects\Scrapy_Things\venv\lib\site-packages\pandas\core\indexes\base.py", line 3080, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 4554, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 4562, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'result.Acryptos.vaults.vaults'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/Users/Adam/PycharmProjects/Scrapy_Things/yieldwatch/yieldwatch/spiders/JsonExcel.py", line 27, in <module>
vault_name_df = json_normalize(pd.DataFrame(dict([(k , pd.Series(v)) for k,v in df_json['result.'+name+'.vaults.vaults'].items()]))[0])['name']
File "C:\Users\Adam\PycharmProjects\Scrapy_Things\venv\lib\site-packages\pandas\core\frame.py", line 3024, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\Adam\PycharmProjects\Scrapy_Things\venv\lib\site-packages\pandas\core\indexes\base.py", line 3082, in get_loc
raise KeyError(key) from err
KeyError: 'result.Acryptos.vaults.vaults'
But if I comment out Autofarm and just process the if for Acryptos, it outputs the Excel file just fine.

Please remove the below line from your code:
platform_names.remove(name)
Removing items from a list while you are iterating over it shifts the remaining elements, so the loop skips entries.
Debug code:
platform_names = ['Autofarm', 'Acryptos']
for name in platform_names:
    if name == 'Autofarm':
        print("Autofarm")
        # platform_names.remove(name)  # remove this line
    elif name == "Acryptos":
        print("Acryptos")
You have initially created
df_json = json_normalize(data)
and in the loop you are overwriting it:
df_json = pd.DataFrame({'Vault_Name': vault_name_df, 'Current_Token_0': current_token_0, 'Current_Token_1': current_token_1})
df_json.to_excel('Output_'+name+'.xlsx', index=False)
so the Acryptos branch no longer finds the 'result.Acryptos.vaults.vaults' column. Change the variable name inside the loop and it will be okay.
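Putting both fixes together, here is a minimal sketch of the corrected loop, assuming the same yieldfull.json structure and column names as in the question: the per-platform result goes into a separate output_df so the flattened df_json is never overwritten, nothing is removed from platform_names while iterating, and each platform's vault list is normalized only once.
import json
import pandas as pd
from pandas.io.json import json_normalize

with open('yieldfull.json') as file:
    data = json.load(file)

df_json = json_normalize(data)             # keep the flattened source frame intact
platform_names = list(data[0]['result'])   # plain list of platform keys

for name in platform_names:
    if name == 'Autofarm':
        vaults = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.LPVaults.vaults'].items()]))[0])
        output_df = pd.DataFrame({'Vault_Name': vaults['name'],
                                  'Current_Token_0': vaults['LPInfo.currentToken0'],
                                  'Current_Token_1': vaults['LPInfo.currentToken1']})
    elif name == 'Acryptos':
        vaults = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.vaults.vaults'].items()]))[0])
        output_df = pd.DataFrame({'Vault_Name': vaults['name'],
                                  'Price_USD': vaults['priceInUSDDepositToken'],
                                  'Current_Token_0': vaults['currentTokens'],
                                  'Deposited_Token': vaults['depositedTokens']})
    else:
        continue
    output_df.to_excel('Output_' + name + '.xlsx', index=False)   # one file per platform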

Related

Issues accessing a list in a Pandas dataframe - RESOLVED

I am currently working with lists in a dataframe defined as follows:
class DATA():
    def __init__(self):
        self.BDD_EEC = pd.DataFrame([], columns=['Nom', 'Nmbre Param', 'Param', 'Units', 'Fixe', 'Min Param', 'Max Param', 'Entry', 'Equation'])
        ## R+R/C
        n = 0
        self.BDD_EEC.at[n, 'Nom'] = 'Re+R1/C1'
        self.BDD_EEC.at[n, 'Nombre Param'] = 3
        self.BDD_EEC.at[n, 'Param'] = ['Re', 'C1', 'R1']
        self.BDD_EEC.at[n, 'Units'] = ['Ohm', 'F', 'Ohm']
        self.BDD_EEC.at[n, 'Fixe'] = [0, 0, 0]
        self.BDD_EEC.at[n, 'Min Param'] = [0, 1e-10, 0]
        self.BDD_EEC.at[n, 'Max Param'] = [1000, 1, 1e10]
        self.BDD_EEC.at[n, 'Equation'] = 'Re+(R1)/(1+1j*2*np.pi*f*R1*Q1)'
        #### R+R/Q
        n = 1
        self.BDD_EEC.at[n, 'Nom'] = 'Re+R1/Q1'
        self.BDD_EEC.at[n, 'Nombre Param'] = 4
        self.BDD_EEC.at[n, 'Param'] = ['Re', 'Q1', 'a1', 'R1']
        self.BDD_EEC.at[n, 'Units'] = ['Ohm', 'F.s^(a-1)', '--', 'Ohm']
        self.BDD_EEC.at[n, 'Fixe'] = [0, 0, 0, 0]
        self.BDD_EEC.at[n, 'Min Param'] = [0.001, 1e-10, 0.001, 0.001]
        self.BDD_EEC.at[n, 'Max Param'] = [200, 1, 1, 1e6]
        self.BDD_EEC.at[n, 'Equation'] = 'Re+(R1)/(1+1j*2*np.pi*f*np.power(R1*Q1,a1))'
After creating the object 'mes_datas', I try to access the list in ['Param'] by name; it works well:
mes_datas.BDD_EEC['Param'][mes_datas.BDD_EEC['Nom'] == 'Re+R1/C1'][0]
Out[26]: ['Re', 'C1', 'R1']
but with the other name, I have this error:
mes_datas.BDD_EEC['Param'][mes_datas.BDD_EEC['Nom'] == 'Re+R1/Q1'][0]
Traceback (most recent call last):
File "C:\Users\cboissy\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2895, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1032, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1039, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<ipython-input-27-1bad18a3e796>", line 1, in <module>
mes_datas.BDD_EEC['Param'][mes_datas.BDD_EEC['Nom'] == 'Re+R1/Q1'][0]
File "C:\Users\cboissy\anaconda3\lib\site-packages\pandas\core\series.py", line 882, in __getitem__
return self._get_value(key)
File "C:\Users\cboissy\anaconda3\lib\site-packages\pandas\core\series.py", line 989, in _get_value
loc = self.index.get_loc(label)
File "C:\Users\cboissy\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2897, in get_loc
raise KeyError(key) from err
KeyError: 0
I am quite confused... Can anyone help?
Thanks
You're getting the KeyError because the returned index is 1, not 0: the boolean filter keeps the original row labels, and [0] looks the value up by label. You can use .iat[0] to get the first result by position:
print(mes_datas.BDD_EEC.loc[mes_datas.BDD_EEC["Nom"] == "Re+R1/Q1", "Param"].iat[0])
Prints:
['Re', 'Q1', 'a1', 'R1']
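To see why, here is a tiny self-contained reproduction with made-up data: after the boolean filter the surviving row keeps its original label 1, so plain [0] does a label lookup and raises KeyError, while .iat[0] or .iloc[0] select by position.
import pandas as pd

df = pd.DataFrame({'Nom': ['Re+R1/C1', 'Re+R1/Q1'],
                   'Param': [['Re', 'C1', 'R1'], ['Re', 'Q1', 'a1', 'R1']]})
filtered = df['Param'][df['Nom'] == 'Re+R1/Q1']
print(filtered.index.tolist())   # [1] -- the surviving row keeps label 1
# filtered[0]                    # would raise KeyError: 0 (label-based lookup)
print(filtered.iat[0])           # ['Re', 'Q1', 'a1', 'R1'] (position-based)
print(filtered.iloc[0])          # same result with .iloc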

KeyError(Key) when using append with defaultdict

I am getting the following error when I try to append to a dictionary using defaultdict(list). From my understanding, defaultdict is supposed to prevent a KeyError.
raise KeyError(key) from err
KeyError: 'id'
The following is my code:
weather_data = defaultdict(list)
m = len(_ids)
date = str(date.today())
i = 0
while i < m:
    url = ("https://api.openweathermap.org/data/2.5/weather?id=%s&units=%s&appid=%s"
           % (_ids.loc[i], 'imperial', weather_key))
    payload = r.get(url).json()
    payload_from_json = pd.json_normalize(payload)
    weather_data[date].append(date)
    weather_data['id'].append(payload_from_json['id'])
    weather_data['weather'].append(payload_from_json['weather'])
    weather_data['base'].append(payload_from_json['base'])
    weather_data['visibility'].append(payload_from_json['visibility'])
    weather_data['dt'].append(payload_from_json['dt'])
    weather_data['name'].append(payload_from_json['name'])
    weather_data['cod'].append(payload_from_json['cod'])
    weather_data['coord.lon'].append(payload_from_json['coord.lon'])
    weather_data['coord.lat'].append(payload_from_json['coord.lat'])
    weather_data['main.temp'].append(payload_from_json['main.temp'])
    weather_data['main.feels_like'].append(payload_from_json['main.feels_like'])
    weather_data['main.temp_min'].append(payload_from_json['main.temp_min'])
    weather_data['main.temp_max'].append(payload_from_json['main.temp_max'])
    weather_data['main.pressure'].append(payload_from_json['main.pressure'])
    weather_data['main.humidity'].append(payload_from_json['main.humidity'])
    weather_data['wind.speed'].append(payload_from_json['wind.speed'])
    weather_data['wind.deg'].append(payload_from_json['wind.deg'])
    weather_data['clouds.all'].append(payload_from_json['clouds.all'])
    weather_data['sys.type'].append(payload_from_json['sys.type'])
    weather_data['sys.id'].append(payload_from_json['sys.id'])
    weather_data['sys.country'].append(payload_from_json['sys.country'])
    weather_data['sys.sunrise'].append(payload_from_json['sys.sunrise'])
    weather_data['sys.sunset'].append(payload_from_json['sys.sunset'])
    i = i + 1
print(weather_data)
Here is the traceback error - can someone tell me how to interpret this:
Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 2895, in get_loc
return self._engine.get_loc(casted_key)
File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1675, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1683, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'coord.lon'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "main.py", line 40, in <module>
weather_data['coord.lon'].append(payload_from_json['coord.lon'])
File "/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py", line 2902, in __getitem__
indexer = self.columns.get_loc(key)
File "/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 2897, in get_loc
raise KeyError(key) from err
[EDIT]
Your weather_data is your defaultdict, but payload_from_json is not. So your error was raised by payload_from_json.
You can fix this by using get to access the key:
weather_data['id'].append(payload_from_json.get('id'))
If you don't want to include junk data, you can add some verification before appending:
if payload_from_json.get('id') is not None:
    weather_data['id'].append(payload_from_json.get('id'))
Also, you can add some default value like this:
weather_data['id'].append(payload_from_json.get('id', 'missing'))
or
weather_data['id'].append(payload_from_json.get('id', ''))
or by default:
weather_data['id'].append(payload_from_json.get('id', None))
In your specific problem, this should work:
weather_data = defaultdict(list)
m = len(_ids)
date = str(date.today())
i = 0
while i < m:
    url = ("https://api.openweathermap.org/data/2.5/weather?id=%s&units=%s&appid=%s"
           % (_ids.loc[i], 'imperial', weather_key))
    payload = r.get(url).json()
    payload_from_json = pd.json_normalize(payload)
    weather_data[date].append(date)
    weather_data['id'].append(payload_from_json.get('id'))
    weather_data['weather'].append(payload_from_json.get('weather'))
    weather_data['base'].append(payload_from_json.get('base'))
    weather_data['visibility'].append(payload_from_json.get('visibility'))
    weather_data['dt'].append(payload_from_json.get('dt'))
    weather_data['name'].append(payload_from_json.get('name'))
    weather_data['cod'].append(payload_from_json.get('cod'))
    weather_data['coord.lon'].append(payload_from_json.get('coord.lon'))
    weather_data['coord.lat'].append(payload_from_json.get('coord.lat'))
    weather_data['main.temp'].append(payload_from_json.get('main.temp'))
    weather_data['main.feels_like'].append(payload_from_json.get('main.feels_like'))
    weather_data['main.temp_min'].append(payload_from_json.get('main.temp_min'))
    weather_data['main.temp_max'].append(payload_from_json.get('main.temp_max'))
    weather_data['main.pressure'].append(payload_from_json.get('main.pressure'))
    weather_data['main.humidity'].append(payload_from_json.get('main.humidity'))
    weather_data['wind.speed'].append(payload_from_json.get('wind.speed'))
    weather_data['wind.deg'].append(payload_from_json.get('wind.deg'))
    weather_data['clouds.all'].append(payload_from_json.get('clouds.all'))
    weather_data['sys.type'].append(payload_from_json.get('sys.type'))
    weather_data['sys.id'].append(payload_from_json.get('sys.id'))
    weather_data['sys.country'].append(payload_from_json.get('sys.country'))
    weather_data['sys.sunrise'].append(payload_from_json.get('sys.sunrise'))
    weather_data['sys.sunset'].append(payload_from_json.get('sys.sunset'))
    i += 1
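As a side note, this works because pd.json_normalize returns a DataFrame, and DataFrame.get behaves like dict.get: it returns the default instead of raising when a column is missing. A tiny illustration with made-up payload data:
import pandas as pd

payload_from_json = pd.json_normalize({'coord': {'lat': 51.51}})   # no 'coord.lon' key in this payload
print(payload_from_json.get('coord.lat'))              # the 'coord.lat' column (a Series)
print(payload_from_json.get('coord.lon'))              # None -- no KeyError
print(payload_from_json.get('coord.lon', 'missing'))   # 'missing'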

pd.Series - "Filename" KeyError

I am having trouble running a script that gets counts of predictions from CSV files in a given directory. The format of the CSV looks like this:
Sample data
and the code is the following:
import os
from glob import glob
import pandas as pd

def get_count(distribution, keyname):
    try:
        count = distribution[keyname]
    except KeyError:
        count = 0
    return count

main_path = "K:\\...\\folder_name"
folder_paths = glob("%s\\*" % main_path)
data = []
for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
    distribution = results.Predictions.value_counts()
    print(distribution)
    num_of_x = get_count(distribution, "x")
    num_of_y = get_count(distribution, "y")
    num_of_z = get_count(distribution, "z")
    d = {"filename": file_name, "x": num_of_x, "y": num_of_y, "z": num_of_z}
    data.append(d)
df = pd.DataFrame(data=data)
df.to_csv(os.path.join(main_path, "summary_counts.csv"), index=False)
The output error is KeyError: "Filename", referring to the pd.Series line. Would anyone know how to solve this?
I am using Python 3.7.3 and pandas 1.0.5 and I am a beginner in programming...
Many thanks in advance
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File ".\save_counts.py", line 24, in <module>
results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
File "K:\...\lib\site-packages\pandas\core\frame.py
", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "K:\...\site-packages\pandas\core\indexes\
base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get
_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get
_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.has
htable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.has
htable.PyObjectHashTable.get_item
KeyError: 'Filename'
in here:
for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
you are creating the pd.Series, but those values only exist inside this for loop.
If you want to use the results df in distribution after this loop, you need to use append().
Create an empty df and append each results frame to it:
final_results = pd.DataFrame()
for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
    final_results = final_results.append(results)
    # and from this point you can continue
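Note that DataFrame.append is deprecated in recent pandas releases; on a newer version, the usual pattern is to collect the frames in a list and concatenate once at the end. A sketch under the same assumptions as above (folder_paths and the 'Filename' column come from the question):
import pandas as pd

frames = []
for path in folder_paths:
    results = pd.read_csv(path, on_bad_lines='skip')   # newer replacement for error_bad_lines=False
    results['Label'] = results['Filename'].str.split('\\').str[0]
    frames.append(results)
final_results = pd.concat(frames, ignore_index=True)   # one concat instead of repeated append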

Pandas datareader failure

I want to save all the stocks from the S&P 500 to a folder in CSV format.
While scanning the S&P 500 most tickers work fine, but in some cases the 'Date' index is missing because the stock doesn't exist or has no data for the given period. I tried changing the start and end dates, but it had no effect. In an earlier post I was told to filter those cases with an exception, but since Python is new territory for me I was lost... Is there someone who can help me?
If this error occurs:
/home/mu351i/PycharmProjects/untitled/venv/bin/python /home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py
Traceback (most recent call last):
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2897, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 44, in get_data_from_yahoo
df = web.DataReader (ticker, 'yahoo', start, end)
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/util/_decorators.py", line 208, in wrapper
return func(*args, **kwargs)
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/data.py", line 387, in DataReader
session=session,
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/base.py", line 251, in read
df = self._read_one_data(self.url, params=self._get_params(self.symbols))
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/yahoo/daily.py", line 165, in _read_one_data
prices["Date"] = to_datetime(to_datetime(prices["Date"], unit="s").dt.date)
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 2995, in getitem
indexer = self.columns.get_loc(key)
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2899, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 57, in
get_data_from_yahoo()
File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 48, in get_data_from_yahoo
except RemoteDataError:
NameError: name 'RemoteDataError' is not defined
Process finished with exit code 1
how would you avoid this by changing this code?
import datetime as dt
import os
import pickle
import bs4 as bs
import pandas_datareader.data as web
import requests

def safe_sp500_tickers():
    resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text.strip()
        tickers.append(ticker)
    with open('sp500tickers.pickle', 'wb') as f:
        pickle.dump(tickers, f)
    return tickers

safe_sp500_tickers()

def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = safe_sp500_tickers()
    else:
        with open('sp500tickers.pickle', 'rb') as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')
    start = dt.datetime(1999, 1, 1)
    end = dt.datetime(2019, 12, 19)
    for ticker in tickers:
        try:
            if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
                df = web.DataReader(ticker, 'yahoo', start, end)
                df.to_csv('stock_dfs/{}.csv'.format(ticker))
            else:
                print("Ticker from {} already availablle".format(ticker))
        except RemoteDataError:
            print("No information for ticker '%s'" % i)
            continue
        except KeyError:
            print("no Date for Ticker: " + ticker)
            continue

get_data_from_yahoo()
A commenter asked for a data sample; this is data from TSLA.csv:
Date,High,Low,Open,Close,Volume,Adj Close
2010-06-29,25.0,17.540000915527344,19.0,23.889999389648438,18766300,23.889999389648438
2010-06-30,30.420000076293945,23.299999237060547,25.790000915527344,23.829999923706055,17187100,23.829999923706055
2010-07-01,25.920000076293945,20.270000457763672,25.0,21.959999084472656,8218800,21.959999084472656
2010-07-02,23.100000381469727,18.709999084472656,23.0,19.200000762939453,5139800,19.200000762939453
2010-07-06,20.0,15.829999923706055,20.0,16.110000610351562,6866900,16.110000610351562
2010-07-07,16.6299991607666,14.979999542236328,16.399999618530273,15.800000190734863,6921700,15.800000190734863
2010-07-08,17.520000457763672,15.569999694824219,16.139999389648438,17.459999084472656,7711400,17.459999084472656
2010-07-09,17.899999618530273,16.549999237060547,17.579999923706055,17.399999618530273,4050600,17.399999618530273
2010-07-12,18.06999969482422,17.0,17.950000762939453,17.049999237060547,2202500,17.049999237060547
2010-07-13,18.639999389648438,16.899999618530273,17.389999389648438,18.139999389648438,2680100,18.139999389648438
2010-07-14,20.149999618530273,17.760000228881836,17.940000534057617,19.84000015258789,4195200,19.84000015258789
2010-07-15,21.5,19.0,19.940000534057617,19.889999389648438,3739800,19.889999389648438
2010-07-16,21.299999237060547,20.049999237060547,20.700000762939453,20.639999389648438,2621300,20.639999389648438
Please provide constructive feedback because I'm new here.
Thanks :)
You are missing an import. Add the following import at the top of your script:
from pandas_datareader._utils import RemoteDataError
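With that import in place, the except RemoteDataError block in get_data_from_yahoo can actually catch the failure. A minimal sketch of the relevant part, reusing the loop and variables from the question (note it prints ticker in the message, since the original i is undefined there):
from pandas_datareader._utils import RemoteDataError
import pandas_datareader.data as web

for ticker in tickers:                  # tickers, start and end as defined in the question
    try:
        df = web.DataReader(ticker, 'yahoo', start, end)
        df.to_csv('stock_dfs/{}.csv'.format(ticker))
    except RemoteDataError:
        print("No information for ticker '%s'" % ticker)
    except KeyError:
        print("no Date for Ticker: " + ticker)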
Alternatively, you can filter the S&P 500 table by its 'Date first added' column so that tickers outside your date range are excluded up front:
import pandas as pd

df = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
sort = pd.DataFrame(df).sort_values(by=['Date first added'])
sort['Date first added'] = pd.to_datetime(sort['Date first added'])
start_date = '1-1-1999'
end_date = '11-12-2019'
mask = (sort['Date first added'] > start_date) & (
    sort['Date first added'] <= end_date)
sort = sort.loc[mask]
pd.DataFrame(sort).to_csv('result.csv', index=False)

Python Outer join

The below code is used to calculate statistical values.
import re
from pathlib import Path
import pandas as pd

def prepare_values(df):
    df_columns = ['frame.time_delta_displayed', 'frame.len']
    df_values = []
    for col in df_columns:
        df_values += [
            df[col].max(),
            df[col].min(),
            df[col].std(),
            df[col].quantile(0.25),
            df[col].quantile(0.5),
            df[col].quantile(0.75),
            df[col].mean(),
            df[col].mad(),
            df[col].var(),
            df[col].skew(),
            df[col].kurtosis(),
            df[col].sum(),
        ]
    return df_values

source_dir = Path('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/InOutFiltered')

in_data = []
for file in source_dir.glob('**/*.in.csv'):
    activity = {'activity': file.stem.split('.')[0]}
    df = pd.read_csv(file)
    cols = ['maxTimeIn', 'minTimeIn', 'stdTimeIn', 'q1TimeIn', 'q2TimeIn', 'q3TimeIn', 'meanTimeIn', 'madTimeIn', 'varianceTimeIn', 'skewTimeIn', 'kurtosisTimeIn', 'sumTimeIn', 'maxLenIn', 'minLenIn', 'stdLenIn', 'q1LenIn', 'q2lenIn', 'q3LenIn', 'meanLenIn', 'madLenIn', 'varianceLenIn', 'skewLenIn', 'kurtosisLenIn', 'sumLenIn']
    values = prepare_values(df)
    file_data = {**activity, **dict(zip(cols, values))}
    in_data.append(file_data)

out_data = []
for file in source_dir.glob('**/*.out.csv'):
    activity = {'activity': file.stem.split('.')[0]}
    df = pd.read_csv(file)
    cols = ['maxTimeOut', 'minTimeOut', 'stdTimeOut', 'q1TimeOut', 'q2TimeOut', 'q3TimeOut', 'meanTimeOut', 'madTimeOut', 'varianceTimeOut', 'skewTimeOut', 'kurtosisTimeOut', 'sumTimeOut', 'maxLenOut', 'minLenOut', 'stdLenOut', 'q1LenOut', 'q2LenOut', 'q3LenOut', 'meanLenOut', 'madLenOut', 'varianceLenOut', 'skewLenOut', 'kurtosisLenOut', 'sumLenOut']
    values = prepare_values(df)
    file_data = {**activity, **dict(zip(cols, values))}
    out_data.append(file_data)

in_df = pd.DataFrame(in_data)
out_df = pd.DataFrame(out_data)
all_df = in_df.join(out_df.set_index('activity'), on='activity', how='outer')
all_df.dropna(subset=all_df.columns.tolist()[1:], how='all', inplace=True)
all_df.fillna(0, inplace=True)
all_df['activity'] = all_df['activity'].apply(lambda x: re.sub(r'^([a-zA-Z]+).*', r'\1', x))
all_df.to_csv('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/AllDataNew.csv', index=False)
I am getting an error. Can't figure out what it means.
Traceback (most recent call last):
File "/root/PycharmProjects/AppAct/StatisticCal.py", line 48, in <module>
all_df= in_df.join(out_df.set_index('activity'), on='activity', how='outer')
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 4178, in set_index
level = frame[col]._values
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'activity'
