How do you make dataframe match datetime dates? - python

So I am writing a code for a Tkinter GUI, and in it, the code pulls data from FRED and uses it to present graphs. There is an option at the start to save the pulled data in a CSV file so you can run it without the internet. But when the code runs to use the CSV, something happens with the scale and it gives me a graph like this. I think it has something to do with the datetime data not being remembered. Current code situation follows:
Imports: from tkinter import *, from tkinter import ttk, pandas_datareader as pdr, pandas as pd, from datetime import datetime
Example of how data is called:
def getBudgetData():
'''
PURPOSE: Get the government budget balance data
INPUTS: None
OUTPUTS: The dataframe of the selected country
'''
global namedCountry
# Reads what country is in the combobox when selected, then gives the index value so the correct
# code is used in the graph
namedCountry = countryCombo.get()
selectedCountry = countryOptions.index(namedCountry)
df = dfBudget[dfBudget.columns[selectedCountry]]
return df
Code for getting/reading the dataframes
def readDataframeCSV():
global dfCPIQuarterly, dfCPIMonthly, dfGDP, dfUnemployment, dfCashRate, dfBudget
dfCPIQuarterly = pd.read_csv('dataframes\dfCPIQuarterly.csv', infer_datetime_format = True)
dfCPIMonthly = pd.read_csv('dataframes\dfCPIMonthly.csv')
dfGDP = pd.read_csv('dataframes\dfGDP.csv')
dfUnemployment = pd.read_csv('dataframes\dfUnemployment.csv')
dfCashRate = pd.read_csv('dataframes\dfCashRate.csv')
dfBudget = pd.read_csv('dataframes\dfBudget.csv')
def LogDiff(x, frequency):
'''
PURPOSE: Transform level data into growth
INPUTS: x (time series), frequency (frequency of time series)
OUTPUTS: x_diff (growth rate of time series)
REFERENCE: Tau, Ran, & Chris Brookes. (2019). Python Guide to accompany
introductary econometrics for finance (4th Edition).
Cambridge University Press.
'''
x_diff = 100*log(x/x.shift(frequency))
x_diff = x_diff.dropna()
return x_diff
def getAllFredData():
'''
PURPOSE: Extract all required data from FRED
INPUTS: None
OUTPUTS: Dataframes of all time series
REFERENCE: https://fred.stlouisfed.org/
'''
global dfCPIQuarterly, dfCPIMonthly, dfGDP, dfUnemployment, dfCashRate, dfBudget
# Country codes
countryCPIQuarterlyCodes = ['AUSCPIALLQINMEI', 'NZLCPIALLQINMEI']
countryCPIMonthlyCodes = ['CPALCY01CAM661N', 'JPNCPIALLMINMEI', 'GBRCPIALLMINMEI', 'CPIAUCSL']
countryGDPCodes = ['AUSGDPRQDSMEI', 'NAEXKP01CAQ189S', 'JPNRGDPEXP',
'NAEXKP01NZQ189S', 'CLVMNACSCAB1GQUK', 'GDPC1']
countryUnemploymentCodes = ['LRUNTTTTAUQ156S', 'LRUNTTTTCAQ156S', 'LRUN64TTJPQ156S',
'LRUNTTTTNZQ156S', 'LRUNTTTTGBQ156S', 'LRUN64TTUSQ156S']
countryCashRateCodes = ['IR3TBB01AUM156N', 'IR3TIB01CAM156N', 'INTDSRJPM193N',
'IR3TBB01NZM156N', 'IR3TIB01GBM156N', 'FEDFUNDS']
countryBudgetCodes = ['GGNLBAAUA188N', 'GGNLBACAA188N', 'GGNLBAJPA188N',
'NZLGGXCNLG01GDPPT', 'GGNLBAGBA188N', 'FYFSGDA188S']
# Inflation
dfCPIQuarterly = pdr.DataReader(countryCPIQuarterlyCodes,
'fred', start, end)
for country in countryCPIQuarterlyCodes:
dfCPIQuarterly[country] = pd.DataFrame({"Inflation rate":LogDiff(dfCPIQuarterly[country], 4)})
dfCPIMonthly = pdr.DataReader(countryCPIMonthlyCodes,
'fred', start, end)
for country in countryCPIMonthlyCodes:
dfCPIMonthly[country] = pd.DataFrame({"Inflation rate":LogDiff(dfCPIMonthly[country], 12)})
# GDP
dfGDP = pdr.DataReader(countryGDPCodes,
'fred', start, end)
for country in countryGDPCodes:
dfGDP[country] = pd.DataFrame({"Economic Growth":LogDiff(dfGDP[country], 4)})
# Unemployment
dfUnemployment = pdr.DataReader(countryUnemploymentCodes,
'fred', start, end)
# Cash Rate
dfCashRate = pdr.DataReader(countryCashRateCodes,
'fred', start, end)
# Budget
dfBudget = pdr.DataReader(countryBudgetCodes,
'fred', start, end)
print('')
saveToCSVLoop = True
while saveToCSVLoop == True:
saveToCSV = input('Would you like to save the dataframes to a CSV file so start-up will be quicker next (y or n): ')
if saveToCSV == 'y':
dfCPIQuarterly.to_csv('dataframes\dfCPIQuarterly.csv', index = True)
dfCPIMonthly.to_csv('dataframes\dfCPIMonthly.csv', index = False)
dfGDP.to_csv('dataframes\dfGDP.csv', index = False)
dfUnemployment.to_csv('dataframes\dfUnemployment.csv', index = False)
dfCashRate.to_csv('dataframes\dfCashRate.csv', index = False)
dfBudget.to_csv('dataframes\dfBudget.csv', index = False)
saveToCSVLoop = False
elif saveToCSV == 'n':
saveToCSVLoop = False
else:
print('\nNot a valid option')
sleep(1)

It's hard to help you without the csv data. It could be that the dates aren't saved properly, or aren't interpreted properly. Maybe you could try parsing the datetime first. It kind of looks like there are no years, or something that is expected to be the year is actually a month?
Since it starts at 1970, I have a feeling that it's interpreting your time as unix epoch, not normal yyyymmdd type dates. Try printing dfCPIQuarterly and see if it looks like a date. Maybe you shouldn't use infer_datetime_format = True when reading it from the csv, but it's hard to tell without more details.

Related

Big O notation : limited input

As an exercise, I am trying to set Monte Carlo Simulation on a chosen ticker symbol.
from numpy.random import randint
from datetime import date
from datetime import timedelta
import pandas as pd
import yfinance as yf
from math import log
# ticker symbol
ticker_input = "AAPL" # change
# start day + endday for Yahoo Finance API, 5 years of data
start_date = date.today()
end_date = start_date - timedelta(days=1826)
# retrieve data from Yahoo Finance
data = yf.download(ticker_input, end_date,start_date)
yf_data = data.reset_index()
# dataframe : define columns
df = pd.DataFrame(columns=['date', "ln_change", 'open_price', 'random_num'])
open_price = []
date_historical = []
for column in yf_data:
open_price = yf_data["Open"].values
date_historical = yf_data["Date"].values
# list order: descending
open_price[:] = open_price[::-1]
date_historical[:] = date_historical[::-1]
# Populate data into dataframe
for i in range(0, len(open_price)-1):
# date
day = date_historical[i]
# ln_change
lnc = log(open_price[i]/open_price[i+1], 2)
# random number
rnd = randint(1, 1258)
# op = (open_price[i]) open price
df.loc[i] = [day, open_price[i], lnc, rnd]
I was wondering how to calculate Big O if you have e.g. nested loops or exponential complexity but have a limited input like one in my example, maximum input size is 1259 instances of float number. Input size is not going to change.
How do you calculate code complexity in that scenario?
It is a matter of points of view. Both ways of seeing it are technically correct. The question is: What information do you wish to convey to the reader?
Consider the following code:
quadraticAlgorithm(n) {
for (i <- 1...n)
for (j <- 1...n)
doSomethingConstant();
}
quadraticAlgorithm(1000);
The function is clearly O(n2). And yet the program will always run in the same, constant time, because it just contains one function call with n=1000. It is still perfectly valid to refer to the function as O(n2). And we can refer to the program as O(1).
But sometimes the boundaries are not that clear. Then it is up to you to choose if you wish to see it as an algorithm with a time complexity as some function of n, or as a piece of constant code that runs in O(1). The importance is to make it clear to the reader how you define things.

Dividend stocks does not appear from function

I'm trying to gather dividend yields from multiple stocks via yfinance. I have a loop which creates a CSV-file for each ticker with historical data.
When I've downloaded dividend data via a function previously, it has worked - basically I created a function with a for-loop and then appended a dataframe with the stocks.
However, now I want to do it the same way but with a boolean expression instead, and it's not working.. I'm not getting any errors but I'm not receiving any ticker symbols (which I know satisfy the condition). I've tried to formulate the boolean loop differently, without success.
What am I doing wrong? Below is my code:
import yfinance as yf
import pandas as pd
import os
df = pd.read_csv(r'C:\\Users\Name\Stocks\Trading\teststocks.csv')
tickers = df["Symbol"].tolist()
i=0
listlength = len(tickers)
for ticker in tickers:
i=i+1
print("Downloading data for",ticker,",",i,"of",listlength)
df = yf.download(ticker, period = "max", interval = "1wk", rounding = True)
df.dropna(inplace=True)
df.to_csv(os.path.join("C:\\Users\Name\Stocks\dataset",ticker + ".csv"))
def dividend(df):
info = yf.Ticker(ticker).info
div = info.get("dividendYield")
if div is None:
pass
elif div > 0.04:
return True
else:
return False
for filename in os.listdir("C:\\Users\Name\Stocks\dataset"):
df = pd.read_csv("C:\\Users\Name\Stocks\dataset\{}".format(filename))
if dividend(df):
print("{}".format(filename))
So this function is looping through the ticker symbols from the dataset folder and getting the dividend data from yfinance, however it's not returning with the ticker that satisfy the condition - which in this case is if the dividend yield is higher than 4%. The first dataframe being read is a CSV file with the ticker symbols in the OMXS30 - so for example HM-B.ST should appear from the dividend function..
Another thing that I want to add is that I'm using the same logic for a function for marketcap, which does work. See below:
def marketcap(df):
info = yf.Ticker(ticker).info
mcap = info.get("marketCap")
if mcap is None:
pass
elif mcap > 10000000000:
return True
else:
return False
for filename in os.listdir("C:\\Users\Name\Stocks\dataset"):
df = pd.read_csv("C:\\Users\Name\Stocks\dataset\{}".format(filename))
if marketcap(df):
print("{}".format(filename))
I do not know why the dividend boolean expression does not work, when the marketcap does work.
Thanks in advance.
Neither the function dividend nor marketcap is working as it should. The reason has to do with the following:
for ticker in tickers:
# do stuff
Here you are taking a list of tickers and doing some stuff for each ticker in this list. This means that by the end of your loop, the variable ticker equals the last item in the list. E.g. suppose tickers = ['HM-B.ST','AAPL'], then ticker will at the end equal AAPL.
Now, let's have a look at your function dividend:
def dividend(df):
info = yf.Ticker(ticker).info
div = info.get("dividendYield")
if div is None:
pass
elif div > 0.04:
return True
else:
return False
This function has one argument (df), but it is not actually using it. Instead you are applying yf.Ticker(...).info to a variable ticker, which is no longer being updated at all. If the function is not returning any True values, this must simply mean that the last ticker (e.g. "AAPL") does not represent a dividend stock. So, to fix this you want to change the input for the function: def dividend(ticker). Write something like:
for filename in os.listdir("C:\\Users\Name\Stocks\dataset"):
df = pd.read_csv("C:\\Users\Name\Stocks\dataset\{}".format(filename))
# e.g. with filename like "HM-B.ST.csv", split at "."
# and select only first part
ticker = filename.split('.')[0]
if dividend(ticker):
print("{}".format(filename))
You need to make the same change for your function marketcap. Again, if this function is currently returning True values, this just means that your last list item references a stock has a higher mcap than the threshold.
Edit: Suggested refactored code
import yfinance as yf
import pandas as pd
tickers = ['ABB.ST','TELIA.ST','ELUX-B.ST','HM-B.ST']
def buy_dividend(ticker):
info = yf.Ticker(ticker).info
# keys we need
keys = ['marketCap','trailingPE','dividendYield']
# store returned vals in a `list`. E.g. for 'HM-B.ST':
# [191261163520, 13.417525, 0.0624], i.e. mcap, PE, divYield
vals = [info.get(key) for key in keys]
# if *any* val == `None`, `all()` will be `False`
if all(vals):
# returns `True` if *all* conditions are met, else `False`
return (vals[0] > 1E10) & (vals[1] < 20) & (vals[2] > 0.04)
return False
for ticker in tickers:
# `progress=False` suppresses the progress print
df = yf.download(ticker, period = "max", interval = "1wk",
rounding = True, progress = False)
df.dropna(inplace=True)
if df.empty:
continue
# df.to_csv(os.path.join("C:\\Users\Name\Stocks\dataset",ticker + ".csv"))
# get last close & mean from column `df.Close`
last_close = df.loc[df.index.max(),'Close']
mean = df.Close.mean()
if last_close < mean:
if buy_dividend(ticker):
print("{} is a good buy".format(ticker))
else:
print("{} is not a good buy".format(ticker))
This will print:
TELIA.ST is not a good buy
ELUX-B.ST is a good buy
HM-B.ST is a good buy
# and will silently pass over 'ABB.ST', since `(last_close < mean) == False` here
New function looks like this:
def buy_dividend(ticker):
if df.empty:
pass
else:
last_close = df[-1:]['Close'].values[0]
mean = df["Close"].mean()
if last_close < mean:
info = yf.Ticker(ticker).info
mcap = info.get("marketCap")
if mcap is None:
pass
elif mcap > 1E10:
PE = info.get('trailingPE')
if PE is None:
pass
elif PE < 20:
div = info.get("dividendYield")
if div is None:
pass
elif div > 0.04:
return True
for filename in os.listdir("C:\\Users\Andreas\Aktieanalys\dataset"):
df = pd.read_csv("C:\\Users\Andreas\Aktieanalys\dataset\{}".format(filename))
if buy_dividend(ticker):
print("{} is a good buy".format(filename))
But somehow the dividend yield are messing things up. If the rows containing "div" are being #, then the function works perfect and correctly. Why is that?

The one-way ANOVA function I'm using keeps spitting out F values that don't make sense

I'm working on a project for college and it's kicking my ass.
I downloaded a data file from https://www.kaggle.com/datasets/majunbajun/himalayan-climbing-expeditions
I'm trying to use an ANOVA to see if there's a statistically significant difference in time taken to summit between the seasons.
The F value I'm getting back doesn't seem to make any sense. Any suggestions?
#import pandas
import pandas as pd
#import expeditions as csv file
exp = pd.read_csv('C:\\filepath\\expeditions.csv')
#extract only the data relating to everest
exp= exp[exp['peak_name'] == 'Everest']
#create a subset of the data only containing
exp_peaks = exp[['peak_name', 'member_deaths', 'termination_reason', 'hired_staff_deaths', 'year', 'season', 'basecamp_date', 'highpoint_date']]
#extract successful attempts
exp_peaks = exp_peaks[(exp_peaks['termination_reason'] == 'Success (main peak)')]
#drop missing values from basecamp_date & highpoint_date
exp_peaks = exp_peaks.dropna(subset=['basecamp_date', 'highpoint_date'])
#convert basecamp date to datetime
exp_peaks['basecamp_date'] = pd.to_datetime(exp_peaks['basecamp_date'])
#convert basecamp date to datetime
exp_peaks['highpoint_date'] = pd.to_datetime(exp_peaks['highpoint_date'])
from datetime import datetime
exp_peaks['time_taken'] = exp_peaks['highpoint_date'] - exp_peaks['basecamp_date']
#convert seasons from strings to ints
exp_peaks['season'] = exp_peaks['season'].replace('Spring', 1)
exp_peaks['season'] = exp_peaks['season'].replace('Autumn', 3)
exp_peaks['season'] = exp_peaks['season'].replace('Winter', 4)
#remove summer and unknown
exp_peaks = exp_peaks[(exp_peaks['season'] != 'Summer')]
exp_peaks = exp_peaks[(exp_peaks['season'] != 'Unknown')]
#subset the data according to the season
exp_peaks_spring = exp_peaks[exp_peaks['season'] == 1]
exp_peaks_autumn = exp_peaks[exp_peaks['season'] == 3]
exp_peaks_winter = exp_peaks[exp_peaks['season'] == 4]
#calculate the average time taken in spring
exp_peaks_spring_duration = exp_peaks_spring['time_taken']
mean_exp_peaks_spring_duration = exp_peaks_spring_duration.mean()
#calculate the average time taken in autumn
exp_peaks_autumn_duration = exp_peaks_autumn['time_taken']
mean_exp_peaks_autumn_duration = exp_peaks_autumn_duration.mean()
#calculate the average time taken in winter
exp_peaks_winter_duration = exp_peaks_winter['time_taken']
mean_exp_peaks_winter_duration = exp_peaks_winter_duration.mean()
# Turn the season column into a categorical
exp_peaks['season'] = exp_peaks['season'].astype('category')
exp_peaks['season'].dtypes
from scipy.stats import f_oneway
# One-way ANOVA
f_value, p_value = f_oneway(exp_peaks['season'], exp_peaks['time_taken'])
print("F-score: " + str(f_value))
print("p value: " + str(p_value))
It seems that f_oneway requires the different samples of continuous data to be arguments, rather than taking a categorical variable argument. You can achieve this using groupby.
f_oneway(*(group for _, group in exp_peaks.groupby("season")["time_taken"]))
Or equivalently, since you have already created series for each season:
f_oneway(exp_peaks_spring_duration, exp_peaks_autumn_duration, exp_peaks_winter_duration)
I would have thought there would be an easier way to perform an ANOVA in this common case but can't find it.

Any way for Pandas to create and output corresponding to user's requests graphs, using input()?

I have a Pandas Dataframe which I got using parsing from IMDb. I want to be able to create and output certain barplots, corresponding to the users request from input().
name decade genre
0 movie1 20s adventure
1 movie2 20s fantasy
...
89 movie35 00s drama
90 movie36 10s sci-fi
For example, the input is '30', then the corresponding graph in range(10, 19) (considering that df starts from the 20's and there're 10 movies in each decade) pops up for the user. The code for the graph is below (oscar_df is the original df, can provide the parsing code):
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
glo = []
from collections import Counter
for st in range(0, 10):
for i in oscar_df['Genre'][st]:
glo.append(i.strip())
os_df = pd.DataFrame(list(Counter(glo).items()), columns = ['G', 'Q'])
os_df.sort_values('G')
os_df
ax = sns.barplot(x='Q',y='G', data=os_df, palette="hls", orient='h');
ax.set(xlabel='Number of Oscars', ylabel = None)
I was thinking of a fstring and a loop, and was searching a lot, but didn't find much. Any way to tackle this?
I'm just posting a few tips for helpfulness.
I would recommend validating responses. You can use the re module (regular expressions) and just look for two digits in the input string. The following snippet also has a while loop that will repeat until a desired response is entered:
import re
number_of_attempts = 3
while number_of_attempts > 0:
dex = input('Enter two last digits of the decade (for instanse: 30): ')
# Does the input contain two digits?
if re.match(r"\d{2}", dex):
# Break our loop by setting our variable to zero
number_of_attempts = 0
else:
# Increment the variable lower by 1
number_of_attempts -= 1
You can use .str.contains() to filter a DataFrame. Then we can skip the 's' on the decade:
df_decade = oscar_df[oscar_df['decade'].str.contains(dex)]
# e.g. - df_decade = oscar_df[oscar_df['decade'].str.contains('30')]
Or, save time and just get the column you want:
df_decade_genre = oscar_df[oscar_df['decade'].str.contains(dex)]['genre']
Instead of importing Counter, you could use Series.value_counts(). Below, we set a category variable to 'genre' and then counted those results:
category = 'genre'
df_counts = df_decade[category].value_counts()
Finally, while not Seaborn, Pandas has a plot feature that you can call directly from a DataFrame or Series. This snippet doesn't use a dataframe, but you can use Pandas .sort_values() function to order your series by ascending or descending order before plotting.
category = 'genre'
df_counts = df_decade[category].value_counts().sort_values(ascending=False)
Get the top n values:
n = 10
df_top_n_count = df_decade[category].value_counts().sort_values(ascending=False)[:n]
And plot the results:
df_counts.plot(kind='bar') # Very rudimentary.
Or, use the dataframe and set x and y to your desired categories:
oscar_df.plot(x='genre', y='decade', kind='bar')
Hope this helps!
Consider no loop or counter collection but simply filter rows by input and then run a groupby count:
dex = str(input('Enter two last digits of the decade (for instanse: 30): ')+'s')
os_df = (oscar_df[oscar_df['Decade'] == dex]
.reindex(['Genre', 'Decade'], axis = 1)
.groupby(['Genre'], as_index = False)
.count()
.rename({'Decade':'Q', 'Genre':'G'})
)
ax = sns.barplot(x='Q',y='G', data=os_df, palette="hls", orient='h')
ax.set(xlabel='Amount of Academy Awards', ylabel = None)
plt.title('Stats for the decade you have chosen',
fontname='Helvetica', fontsize=18)
I figured out the answer myself, maybe not the most sophisticated one, though.
dex = str(input('Enter two last digits of the decade (for instanse: 30): ')+'s')
for i in range(len(oscar_df['Decade'])):
if (set(oscar_df['Decade'][i]) == set(dex)) == True:
for a in oscar_df['Genre'][i]:
glo.append(a.strip())
os_df = pd.DataFrame(list(Counter(glo).items()), columns = ['G', 'Q'])
list.clear(glo)
os_df.sort_values('G')
ax = sns.barplot(x='Q',y='G', data=os_df, palette="hls", orient='h')
ax.set(xlabel='Amount of Academy Awards', ylabel = None);
plt.title('Stats for the decade you have chosen', fontname='Helvetica',
fontsize=18);

Python Loop Addition

No matter what I do I don't seem to be able to add all the base volumes and quote volumes together easily! I want to end up with a total base volume and a total quote volume of all the data in the data frame. Can someone help me on how you can do this easily?
I have tried summing and saving the data in a dictionary first and then adding it but I just don't seem to be able to make this work!
import urllib
import pandas as pd
import json
def call_data(): # Call data from Poloniex
global df
datalink = 'https://poloniex.com/public?command=returnTicker'
df = urllib.request.urlopen(datalink)
df = df.read().decode('utf-8')
df = json.loads(df)
global current_eth_price
for k, v in df.items():
if 'ETH' in k:
if 'USDT_ETH' in k:
current_eth_price = round(float(v['last']),2)
print("Current ETH Price $:",current_eth_price)
def calc_volumes(): # Calculate the base & quote volumes
global volume_totals
for k, v in df.items():
if 'ETH' in k:
basevolume = float(v['baseVolume'])*current_eth_price
quotevolume = float(v['quoteVolume'])*float(v['last'])*current_eth_price
if quotevolume > 0:
percentages = (quotevolume - basevolume) / basevolume * 100
volume_totals = {'key':[k],
'basevolume':[basevolume],
'quotevolume':[quotevolume],
'percentages':[percentages]}
print("volume totals:",volume_totals)
print("#"*8)
call_data()
calc_volumes()
A few notes:
For the next 2 years don't use the keyword globals for anything.
put function documentation under the function in quotes
using the requests library will be much easier than urllib. However ...
pandas can fetch the JSON and parse it all in one step
ok it doesn't have to be as split up as this, I'm just showing you how to properly pass variables around instead of globals.
I could not find "ETH" by itself. In the data they sent they have these 3 ['BTC_ETH', 'USDT_ETH', 'USDC_ETH']. So I used "USDT_ETH" I hope the substitution is ok.
calc_volumes is seeming to do the calculation and being some sort of filter (it's picky as to what it prints). This function needs to be broken up in to it's two separate jobs. printing and calculating. (maybe there was a filter step but I leave that for homework)
.
import pandas as pd
eth_price_url = 'https://poloniex.com/public?command=returnTicker'
def get_data(url=''):
""" Call data from Poloniex and put it in a dataframe"""
data = pd.read_json(url)
return data
def get_current_eth_price(data = None):
""" grab the price out of the dataframe """
current_eth_price = data['USDT_ETH']['last'].round(2)
return current_eth_price
def calc_volumes(data=None, current_eth_price=None):
""" Calculate the base & quote volumes """
data = df[df.columns[df.columns.str.contains('ETH')]].loc[['baseVolume', 'quoteVolume', 'last']]
data = data.transpose()
data[['baseVolume','quoteVolume']]*= current_eth_price
data['quoteVolume']*=data['last']
data['percentages']=(data['quoteVolume'] - data['baseVolume']) / data['quoteVolume'] * 100
return data
df = get_data(url = eth_price_url)
the_price = get_current_eth_price(data = df)
print(f'the current eth price is: {the_price}')
volumes = calc_volumes(data=df, current_eth_price=the_price)
print(volumes)
This code seems kind of odd and inconsistent... for example, you're importing pandas and calling your variable df but you're not actually using dataframes. If you used df = pd.read_json('https://poloniex.com/public?command=returnTicker', 'index')* to get a dataframe, most of your data manipulation here would become much easier, and wouldn't require any loops either.
For example, the first function's code would become as simple as current_eth_price = df.loc['USDT_ETH','last'].
The second function's code would basically be
eth_rows = df[df.index.str.contains('ETH')]
total_base_volume = (eth_rows.baseVolume * current_eth_price).sum()
total_quote_volume = (eth_rows.quoteVolume * eth_rows['last'] * current_eth_price).sum()
(*The 'index' argument tells pandas to read the JSON dictionary indexed by rows, then columns, rather than columns, then rows.)

Categories

Resources