KeyError on the 2nd loop on pandas df - python

On the first run of the loop everything works fine, but on the second iteration it raises a KeyError on the column names of my df. I don't understand why this is happening, since every iteration triggers the same set of functions.
Part of the code that creates the error:
import requests
import pandas as pd
from datetime import datetime, timedelta

def market_data(crypto, ts_float):
    # Request to Kraken for pricing data
    r = requests.get('https://futures.kraken.com/api/charts/v1/trade/' + crypto + '/15m?from=' + ts_float)
    # Set JSON response to data
    data = r.json()
    # Normalize data into a dataframe
    df = pd.json_normalize(data, record_path=['candles'])
    # Convert unix time back into readable time
    df['time'] = pd.to_datetime(df['time'], unit='ms')
    # Set time as index
    df = df.set_index('time')
    # Convert into integers for calculations
    df['open'] = df['open'].astype(float).astype(int)
    df['high'] = df['high'].astype(float).astype(int)
    df['low'] = df['low'].astype(float).astype(int)
    df['close'] = df['close'].astype(float).astype(int)
    df['volume'] = df['volume'].astype(float).astype(int)
    return df
crypto_pairs = [
    {"crypto": "pf_ethusd", "size": 0.05},
    {"crypto": "pf_btcusd", "size": 0.0003},
    {"crypto": "pf_avaxusd", "size": 3},
    {"crypto": "pf_dotusd", "size": 10},
    {"crypto": "pf_ltcusd", "size": 1.5}
]
# Getting the timestamp to get the data from
ts = (datetime.now() - timedelta(hours=48)).timestamp()
ts_float = str(int(ts))

for cryptos in enumerate(crypto_pairs):
    data = market_data(cryptos[1]['crypto'], ts_float)
KeyError: 'time'
I have a set of functions in my enumerate loop, and market_data, the first of them, raises the error above on the 2nd iteration. The errors always happen on the lines that reference column names such as "time" and "open".

I don't have much experience with requests, but this worked for me. Try the following: in the market_data function, after building the dataframe, add a check, and if len(df) <= 0, return early.
Where the dataframe turns out to be empty, the request still returns 200, that is, everything looks fine. I printed out crypto: the empty dataframe comes back for 'pf_btcusd'. I tried swapping the order of the pairs, and again the empty dataframe turns out to be 'pf_btcusd'. Something is wrong with that symbol.
def market_data(crypto, ts_float):
    # Request to Kraken for pricing data
    r = requests.get('https://futures.kraken.com/api/charts/v1/trade/' + crypto + '/15m?from=' + ts_float)
    #print(r.status_code)
    # Set JSON response to data
    data = r.json()
    # Normalize data into a dataframe
    df = pd.json_normalize(data, record_path=['candles'])
    # Bail out before touching any columns if no candles came back
    if len(df) <= 0:
        print(r.status_code)
        print(crypto)
        return
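To use that check from the calling loop, a minimal sketch (assuming market_data returns None for an empty response, as above) would skip the bad symbol instead of crashing:

for cryptos in enumerate(crypto_pairs):
    df = market_data(cryptos[1]['crypto'], ts_float)
    if df is None:
        # No candles came back for this symbol; skip it rather than
        # passing None into the rest of the per-symbol functions
        continue
    # ... rest of the per-symbol processing ...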

Related

Python script not showing more than 1 page of results of Shopify Orders

I'm having a hard time trying to make this code show more than 1 page of orders.
I already tried different methods, such as loops, and also the one below (which is just a workaround) where I tried to get page 2.
I just need it to bring me all the orders generated on a specific day, but I got completely stuck.
import requests
import pandas as pd
from datetime import datetime, timedelta
# Set the API token for the Shopify API
api_token = 'MYTOKEN'
# Get the current date and subtract one day
today = datetime.now()
yesterday = today - timedelta(days=1)
# Format the date strings for the API request
start_date = yesterday.strftime('%Y-%m-%dT00:00:00Z')
end_date = yesterday.strftime('%Y-%m-%dT23:59:59Z')
# Set the initial limit to 1
limit = 1
page_info = 2
# Set the initial URL for the API endpoint you want to access, including the limit and date range parameters
url = f'https://MYSTORE.myshopify.com/admin/api/2020-04/orders.json?page_info={page_info}&limit={limit}&created_at_min={start_date}&created_at_max={end_date}'
# Set the API token as a header for the request
headers = {'X-Shopify-Access-Token': api_token}
# Make the GET request
response = requests.get(url, headers=headers)
# Check the status code of the response
if response.status_code == 200:
    # Parse the JSON response directly
    orders = response.json()['orders']
    # Flatten the JSON response into a Pandas DataFrame, including the 'name' column (order number) and renaming the 'id' column to 'order_id'
    df = pd.json_normalize(orders, sep='_', record_path='line_items', meta=['name', 'id'], meta_prefix='meta_')
    # Flatten the line_items data into a separate DataFrame
    line_items_df = pd.json_normalize(orders, 'line_items', ['id'], meta_prefix='line_item_')
    # Flatten the 'orders' data into a separate DataFrame | Added in Dec.26-2022
    orders_df = pd.json_normalize(orders, sep='_', record_path='line_items', meta=['created_at', 'id'], meta_prefix='ordersDTbs_')
    # Merge the 'df' and 'orders_df' DataFrames | Added in Dec.26-2022
    df = pd.merge(df, orders_df[['id', 'ordersDTbs_created_at']], on='id')
    # Converting created_at date to DATE only | Added in Dec.26-2022
    df['ordersDTbs_created_at'] = df['ordersDTbs_created_at'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S%z').date())
    # Merge in the line_items data
    df = pd.merge(df, line_items_df[['id', 'sku', 'quantity']], on='id')
    # Calculate the discount amount and add it as a new column in the dataframe
    df['price_set_shop_money_amount'] = pd.to_numeric(df['price_set_shop_money_amount'])
    df['total_discount_set_shop_money_amount'] = pd.to_numeric(df['total_discount_set_shop_money_amount'])
    df = df.assign(paid_afterdiscount=df['price_set_shop_money_amount'] - df['total_discount_set_shop_money_amount'])
    # Print the DataFrame
    print(df[['meta_name','ordersDTbs_created_at','sku_y','title','fulfillable_quantity','quantity_x','quantity_y','paid_afterdiscount']])
# Checking if the API call ran smoothly
else:
    print('Something went wrong.')
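For reference (not part of the original post): since API version 2019-07 the Shopify REST API pages with a page_info cursor returned in the Link response header rather than numbered pages, and requests exposes that parsed header as response.links. A minimal sketch of the pagination loop, reusing the headers, start_date, and end_date from the question:

all_orders = []
url = (f'https://MYSTORE.myshopify.com/admin/api/2020-04/orders.json'
       f'?limit=250&created_at_min={start_date}&created_at_max={end_date}')
while url:
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    all_orders.extend(response.json()['orders'])
    # Shopify puts the next-page cursor URL under rel="next" while more
    # pages remain; it is absent on the last page, which ends the loop
    url = response.links.get('next', {}).get('url')
print(len(all_orders), 'orders fetched')

The cursor URL Shopify returns already carries the limit and date filters, so only the first request needs to build them by hand.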

Merge 1min, 5min and Daily OHLC dataframes WITHOUT upsampling the 1min?

To make this simple, let's say I have 3 datasets: a 1min OHLCV dataset, a 5min, and a Daily. [The 1min, 5min, and Daily tables, and the desired merged result, are shown as images in the original post.] Given these 3 datasets, how can I merge them and turn them into something like that using Pandas/Python 3?
As you can see, every time a new 5 minutes is hit going down the 1min dataframe, the matching time from the 5min chart <= the 1min time gets added to the respective columns. I've left the blanks in there to help visualize what's happening, but for simplicity's sake, I can just forward fill the values. No backfilling, so as not to introduce a lookahead bias. The daily value would be the previous day's OHLCV data, except at 4:00 PM, which would be the current day's data; this is because of how Alpha Vantage structures their dataframes. I also have a 15min and a 60min dataset to go along with this, but I think once the 1min-to-5min merge is done, the same logic can apply.
I have included a reproducible code below to get the exact dataframes I'm working with, however you have to pip install alpha_vantage and get a free API key from here.
SIMPLE DESIRED SOLUTION - What I've outlined above.
ADVANCED DESIRED SOLUTION - Rather than forward filling the 5min data, ideally those empty spaces would consist of the respective running prices. For example, the next empty open price in the 5min_open column would be the same 1min_open at the beginning of that 5 minutes. Think of something like:
conversion = {'Open' : 'first', 'High' : 'max', 'Low' : 'min', 'Close' : 'last', 'Volume' : 'sum'}
That way the empty spaces in the 5min columns are updating as they should, but the simple solution would suffice.
I don't want to just upsample the 1min dataframe, because I want to include all of the previous data from a past daily dataframe; upsampling the 1min to a daily dataset would lose a bunch of past daily data. So I want the solution to NOT include upsampling, but rather merge the different df's by datetime. Thanks!
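Not from the original post, but the matching rule described above (take the most recent 5min row whose time is <= the 1min time) is exactly what pandas.merge_asof with direction='backward' computes, which avoids both upsampling and an iterative loop. A sketch, assuming main_df holds the 1min data and df5 the 5min data, each with a datetime 'time' column:

# Prefix the 5min columns, then as-of merge onto the 1min rows
df5 = df5.rename(columns={c: '5min_' + c for c in ['open', 'high', 'low', 'close', 'volume']})
merged = pd.merge_asof(main_df.sort_values('time'), df5.sort_values('time'),
                       on='time', direction='backward')

merge_asof repeats the latest 5min bar on every 1min row, which is the forward-filled form of the simple solution.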
Windows 10, Python 3:
from alpha_vantage.timeseries import TimeSeries
import pandas as pd
import sys, os
import os.path
from time import sleep

# Make a historical folder if it's not created already
if not os.path.exists("data/historical/"):
    os.makedirs("data/historical/")

api_call_limit_daily = 500
total_api_count = 0
timeframes = ['1min', '5min', '15min', '60min', 'daily']

ts = TimeSeries(key='YOUR_API_KEY_HERE', output_format='csv')
tickers = ['SPY']

for ticker in tickers:
    # Ensure uppercase
    ticker = str(ticker.upper())
    for timeframe in timeframes:
        print("Downloading dataset for", ticker, "-", timeframe, "...")
        if total_api_count > api_call_limit_daily:
            print("Daily API call limit reached. Closing...")
            sys.exit(0)
        if timeframe != 'daily':
            data, meta_data = ts.get_intraday_extended(symbol=ticker, interval=timeframe)
        elif timeframe == 'daily':
            data, meta_data = ts.get_daily(symbol=ticker)
        # Convert the data object to a dataframe, and reverse the order so the oldest date's on top
        csv_list = list(data)
        data = pd.DataFrame.from_records(csv_list[1:], columns=csv_list[0])
        data = data.iloc[::-1]
        if timeframe == 'daily':
            data = data.rename(columns={"timestamp": "time"})
        print(data)
        df = data.set_index('time')
        total_api_count += 1
        df.to_csv("data/historical/" + ticker + "_" + timeframe + ".csv", index=True)
        print("Success...")
        # Sleep if we're not through tickers/timeframes yet re: api limits
        sleep(15)

print("Done!")
UPDATE
I have frankenstein'd an iterative process for the simple solution that works for each dataframe, except for the daily; still trying to work that out. Basically, for any time on the 1min rows, I want to display YESTERDAY's daily data, unless the time is >= 16:00:00, in which case it can be the current day's. I want that so as not to introduce forward peeking. Anyway, here's the code that accomplishes what I'm looking for iteratively, but I'm hoping there's a faster/cleaner way to do this:
import numpy as np

# Merge all timeframes together into one dataframe
for ticker in tickers:
    # Ensure uppercase
    ticker = str(ticker.upper())
    for timeframe in timeframes:
        # Define the 1min timeframe as the main df we'll add other columns to
        if timeframe == "1min":
            main_df = pd.read_csv("./data/historical/" + ticker + "_" + timeframe + ".csv")
            main_df['time'] = pd.to_datetime(main_df['time'])
            continue
        # Now add in some nan's for the next timeframe's columns
        main_df[timeframe + "_open"] = np.nan
        main_df[timeframe + "_high"] = np.nan
        main_df[timeframe + "_low"] = np.nan
        main_df[timeframe + "_close"] = np.nan
        main_df[timeframe + "_volume"] = np.nan
        # Read in the next timeframe's dataset
        df = pd.read_csv("./data/historical/" + ticker + "_" + timeframe + ".csv")
        df['time'] = pd.to_datetime(df['time'])
        # Rather than doing a double for loop to iterate through both datasets, just keep a counter
        # of what row we're at in the second dataframe. Used as a row locator
        curr_df_row = 0
        # Do this for all datasets except the daily one
        if timeframe != 'daily':
            # Iterate through the main_df
            for i in range(len(main_df)):
                # If the time in the main df is >= the current timeframe's row's time, add the values to their columns
                if main_df['time'].iloc[i] >= df['time'].iloc[curr_df_row]:
                    main_df[timeframe + "_open"].iloc[i] = df['open'].iloc[curr_df_row]
                    main_df[timeframe + "_high"].iloc[i] = df['high'].iloc[curr_df_row]
                    main_df[timeframe + "_low"].iloc[i] = df['low'].iloc[curr_df_row]
                    main_df[timeframe + "_close"].iloc[i] = df['close'].iloc[curr_df_row]
                    main_df[timeframe + "_volume"].iloc[i] = df['volume'].iloc[curr_df_row]
                    curr_df_row += 1
        # Daily dataset logic would go here

print(main_df)
main_df.to_csv("./TEST.csv", index=False)
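Not from the original update, but the missing daily logic (show yesterday's daily bar unless the 1min time is >= 16:00:00) can be expressed with the same as-of merge by giving each daily bar an 'available_at' time of 4:00 PM on its own day. A sketch, assuming daily_df has a datetime 'time' column holding the session date:

# A daily bar becomes usable at 16:00 of its own day, so 1min rows before
# 16:00 match the previous day's bar and rows at/after 16:00 match today's
daily = daily_df.rename(columns={c: 'daily_' + c for c in ['open', 'high', 'low', 'close', 'volume']})
daily['available_at'] = daily['time'].dt.normalize() + pd.Timedelta(hours=16)
main_df = pd.merge_asof(main_df.sort_values('time'),
                        daily.drop(columns='time').sort_values('available_at'),
                        left_on='time', right_on='available_at',
                        direction='backward')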

Joining time series by common date in Python (dataframe & series/list question)

Noob here. PLEASE FORGIVE ABYSMAL FORMATTING as I am still learning. I am trying to create a time series (a dataframe, I think?) that consists of three columns. One is a date column, the next is an inventory column, and the last is a price column.
I have pulled two separate series (date & inventory; date & price) and I want to meld the two series so that I can see three columns instead of two sets of two. This is my code.
import json
import numpy as np
import pandas as pd
from urllib.error import URLError, HTTPError
from urllib.request import urlopen

class EIAgov(object):
    def __init__(self, token, series):
        '''
        Purpose:
        Initialise the EIAgov class by requesting:
        - EIA token
        - id code(s) of the series to be downloaded
        Parameters:
        - token: string
        - series: string or list of strings
        '''
        self.token = token
        self.series = series

    def __repr__(self):
        return str(self.series)

    def Raw(self, ser):
        # Construct url
        url = 'http://api.eia.gov/series/?api_key=' + self.token + '&series_id=' + ser.upper()
        try:
            # URL request, URL opener, read content
            response = urlopen(url)
            raw_byte = response.read()
            raw_string = str(raw_byte, 'utf-8-sig')
            jso = json.loads(raw_string)
            return jso
        except HTTPError as e:
            print('HTTP error type.')
            print('Error code: ', e.code)
        except URLError as e:
            print('URL type error.')
            print('Reason: ', e.reason)

    def GetData(self):
        # Deal with the date series
        date_ = self.Raw(self.series[0])
        date_series = date_['series'][0]['data']
        endi = len(date_series)  # or len(date_['series'][0]['data'])
        date = []
        for i in range(endi):
            date.append(date_series[i][0])
        # Create dataframe
        df = pd.DataFrame(data=date)
        df.columns = ['Date']
        # Deal with data
        lenj = len(self.series)
        for j in range(lenj):
            data_ = self.Raw(self.series[j])
            data_series = data_['series'][0]['data']
            data = []
            endk = len(date_series)
            for k in range(endk):
                data.append(data_series[k][1])
            df[self.series[j]] = data
        return df

if __name__ == '__main__':
    tok = 'mytoken'
    # Natural Gas - Weekly Storage
    ngstor = ['NG.NW2_EPG0_SWO_R48_BCF.W']  # w/ several series at a time: ['ELEC.REV.AL-ALL.M', 'ELEC.REV.AK-ALL.M', 'ELEC.REV.CA-ALL.M']
    stordata = EIAgov(tok, ngstor)
    print(stordata.GetData())
    # Natural Gas - Weekly Prices
    ngpx = ['NG.RNGC1.W']  # w/ several series at a time: ['ELEC.REV.AL-ALL.M', 'ELEC.REV.AK-ALL.M', 'ELEC.REV.CA-ALL.M']
    pxdata = EIAgov(tok, ngpx)
    print(pxdata.GetData())
Note that 'mytoken' needs to be replaced by an eia.gov API key. I can get this to successfully create an output of two lists...but then to get the lists merged I tried to add this at the end:
joined_frame = pd.concat([ngstor, ngpx], axis = 1, sort=False)
print(joined_frame.GetData())
But I get an error
("TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid")
because apparently I don't know the difference between a list and a series.
How do I merge these lists by date column? Thanks very much for any help. (Also feel free to advise why I am terrible at formatting code correctly in this post.)
If you want to manipulate them as DataFrames in the rest of your code, you can transform ngstor and ngpx into DataFrames as follows:
import pandas as pd

# I create two lists that look like yours
ngstor = [[1, 2], ["2020-04-03", "2020-05-07"]]
ngpx = [[3, 4], ["2020-04-03", "2020-05-07"]]

# I transform them to DataFrames
ngstor = pd.DataFrame({"value1": ngstor[0],
                       "date_col": ngstor[1]})
ngpx = pd.DataFrame({"value2": ngpx[0],
                     "date_col": ngpx[1]})
Then you can either use pandas.merge or pandas.concat:
# merge option
joined_framed = pd.merge(ngstor, ngpx, on="date_col", how="outer")

# concat option
ngstor = ngstor.set_index("date_col")
ngpx = ngpx.set_index("date_col")
joined_framed = pd.concat([ngstor, ngpx], axis=1, join="outer").reset_index()
The result will be:
     date_col  value1  value2
0  2020-04-03       1       3
1  2020-05-07       2       4
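Applied to the original code, the same merge would go on the frames that GetData returns (each has a 'Date' column plus one column per series id); a sketch using stordata and pxdata as defined in the question:

stor_df = stordata.GetData()   # columns: 'Date', 'NG.NW2_EPG0_SWO_R48_BCF.W'
px_df = pxdata.GetData()       # columns: 'Date', 'NG.RNGC1.W'
joined_frame = pd.merge(stor_df, px_df, on='Date', how='outer')
print(joined_frame)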

KeyError for column that is in Pandas dataframe

I'm having an issue that I can't seem to understand. I've written a function that takes a dataframe as input and performs a number of cleaning steps on it. When I run the function I get the error message KeyError: ('amount', 'occurred at index date'). This doesn't make sense to me, because amount is a column in my dataframe.
Here is some code with a subset of the data created:
import string
import pandas as pd

data = pd.DataFrame.from_dict({"date": ["10/31/2019", "10/27/2019"], "amount": [-13.3, -6421.25], "vendor": ["publix", "verizon"]})

# Create cleaning function for dataframe
def cleaning_func(x):
    # Convert the amounts to positive numbers
    x['amount'] = x['amount'] * -1
    # Convert dates to datetime for subsetting purposes
    x['date'] = pd.to_datetime(x['date'])
    # Begin removing certain strings
    x['vendor'] = x['vendor'].str.replace("PURCHASE AUTHORIZED ON ", "")
    x['vendor'] = x['vendor'].str.replace("[0-9]", "")
    x['vendor'] = x['vendor'].str.replace("PURCHASE WITH CASH BACK $ . AUTHORIZED ON /", "")
    # Build table of punctuation and remove from vendor strings
    table = str.maketrans(dict.fromkeys(string.punctuation))  # OR {key: None for key in string.punctuation}
    x['vendor'] = x['vendor'].str.translate(table)
    return x

clean_data = data.apply(cleaning_func)
If someone could shed some light on why this error appears I would appreciate it.
Don't use apply here; it's slow and basically loops over your dataframe. Just pass your data to the function and let it return a cleaned-up dataframe; this way it will use the vectorized methods over the whole column.
def cleaning_func(df):
    # Convert the amounts to positive numbers
    df['amount'] = df['amount'] * -1
    # Convert dates to datetime for subsetting purposes
    df['date'] = pd.to_datetime(df['date'])
    # Begin removing certain strings
    df['vendor'] = df['vendor'].str.replace("PURCHASE AUTHORIZED ON ", "")
    df['vendor'] = df['vendor'].str.replace("[0-9]", "")
    df['vendor'] = df['vendor'].str.replace("PURCHASE WITH CASH BACK $ . AUTHORIZED ON /", "")
    # Build table of punctuation and remove from vendor strings
    table = str.maketrans(dict.fromkeys(string.punctuation))  # OR {key: None for key in string.punctuation}
    df['vendor'] = df['vendor'].str.translate(table)
    return df

clean_df = cleaning_func(data)
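As for why the original call failed: DataFrame.apply invokes the function once per column by default, so cleaning_func receives each column as a Series rather than the whole dataframe; the first column handed over is 'date', and x['amount'] on that Series raises KeyError: ('amount', 'occurred at index date'). A tiny sketch that makes visible what apply actually passes in:

def show_what_apply_passes(x):
    # x is one column (a Series), not the whole dataframe
    print(type(x).__name__, '-> column name:', x.name)
    return x

data.apply(show_what_apply_passes)
# Series -> column name: date
# Series -> column name: amount
# Series -> column name: vendor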

Using pandas read_html to read and print data from a website

Problem:
I need to make a script that lists off data from an online web data set with pandas pd.read_html, then look up a temperature in that list and display that row of data with a few parameters.
The trouble is the final part: I need to make it loop so that when the user input is out of range or not == to one of the documented temperatures, it prompts a retry with a message saying something like "invalid input".
What I have tried:
I tried running it through a while loop, try/except commands, and if and elif, but I'm sure I did it wrong because it almost always breaks my Spyder program, so I have to close it and try again.
Any recommendations or solutions would be super helpful, because I'm past the point of vague hints that are supposed to lead me to an answer but leave me more confused.
My code:
def get_t_data(t):
t_table = pd.read_html('https://thermo.pressbooks.com/chapter/saturation-properties-temperature-table/', header=0)
t_df = t_table[0]
data_df =t_df.loc[t_df['Temp'] == t]
df_result = data_df[['Pressure', 'Volume ()', 'Energy (kJ/kg)', 'Enthalpy (kJ/kg)', 'Entropy (kJ/kg.K)']]
df_final = df_result.to_string(index=False)
return df_final
user_t = input('Please enter the temp you will like to research: ')
print('\n')
data = get_t_data(user_t)
print('For temperature {}°C your outputs are \n'.format(user_t))
print(data)```
[upd]
something like this:
import pandas as pd

def get_t_data(t):
    t_table = pd.read_html('https://thermo.pressbooks.com/chapter/saturation-properties-temperature-table/', header=0)
    t_df = t_table[0]
    t_df = t_df.iloc[1:, :]  # skip an additional header line
    ind = list(t_df['Temp'].astype(float))  # get all indexes as float, as you have not only integers (0.01 and 373.95)
    if float(t) not in ind:  # check whether 't' is in the index
        return {'exist': False, 'result': 'no such temp'}
    data_df = t_df.loc[t_df['Temp'] == t]
    df_result = data_df[['Pressure', 'Volume ()', 'Energy (kJ/kg)', 'Enthalpy (kJ/kg)', 'Entropy (kJ/kg.K)']]
    df_final = df_result.to_string(index=False)
    return {'exist': True, 'result': df_final}

# Data format for the get_t_data response
data = {'exist': False, 'result': ''}
while data['exist'] == False:
    user_t = input('Please enter the temp you will like to research: ')
    print('\n')
    data = get_t_data(user_t)

print('For temperature {}°C your outputs are \n'.format(user_t))
print(data['result'])
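Two small refinements, not from the original answer: the loop above re-downloads the whole HTML table on every retry, and float(t) will raise on non-numeric input and crash the loop. A sketch that fetches the table once and guards the conversion (same URL and columns as above):

# Fetch and trim the table once, outside the retry loop
t_table = pd.read_html('https://thermo.pressbooks.com/chapter/saturation-properties-temperature-table/', header=0)
t_df = t_table[0].iloc[1:, :]

def get_t_data(t, t_df=t_df):
    try:
        t_val = float(t)
    except ValueError:
        # Non-numeric input would otherwise raise and kill the loop
        return {'exist': False, 'result': 'invalid input'}
    if t_val not in list(t_df['Temp'].astype(float)):
        return {'exist': False, 'result': 'no such temp'}
    data_df = t_df.loc[t_df['Temp'] == t]
    df_result = data_df[['Pressure', 'Volume ()', 'Energy (kJ/kg)', 'Enthalpy (kJ/kg)', 'Entropy (kJ/kg.K)']]
    return {'exist': True, 'result': df_result.to_string(index=False)}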
