I am trying to split up a JSON file from the Alpha Vantage API into separate files depending on the date. I'm also trying to reformat the file to have blank values in the gaps where dates are missing. The following code is what I have come up with, but it gives me: TypeError: 'list' object is not callable. I'm fairly new to Python and pandas, so I'm sure there is a better way to go about this.
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
from pandas import DataFrame
import json

symbol = "MSFT"
symbol_list = symbol.split(",")

def num_el(list):
    count = 0
    for element in list:
        count += 1
    return count

def csv_make(sy, dar, dat):
    csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
    csv_file.write(dat)
    csv_file.close()

i = 0
x = -1
n = num_el(symbol_list)
while i < n:
    namesym = symbol_list[x]
    ticker = namesym
    api_key = 'APIKEYHERE'
    url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&outputsize=full&interval=1min&apikey={api_key}'
    data = requests.get(url)
    dsf = data.json()
    daf = pd.DataFrame(dsf['Time Series (1min)'])
    dxf: DataFrame = daf.T
    dxf.index.name = 'time'
    dxf.reset_index(inplace=True)
    dxf['time'] = pd.to_datetime(dxf['time'])
    dxf['minute'] = dxf['time'].dt.time
    dxf['day'] = dxf['time'].dt.day
    dxf['date'] = dxf['time'].dt.date
    agg = dxf.groupby([dxf['day']])
    length1 = dxf.groupby([dxf['day']]).size()
    length = pd.DataFrame(length1)
    length.index.name = 'day'
    length.reset_index(inplace=True)
    length_sum = length[0].sum()
    v = 0
    d = length_sum
    b = len(length)
    x2 = length_sum
    while v < b:
        a = length[0][v]
        x2 -= length[0][v]
        xd = agg.get_group(length['day'][v])
        date = xd['date'][x2]
        max_dt = parser.parse(str(max(xd['minute'])))
        min_dt = parser.parse(str(min(xd['minute'])))
        dt_range = []
        while min_dt <= max_dt:
            dt_range.append(min_dt.strftime("%H:%M:%S"))
            min_dt += timedelta(seconds=60)
        complete_df = pd.DataFrame({'minute': dt_range})
        xy = complete_df.astype('str')
        yx = xd.astype('str')
        dasf = xy.merge(yx, how='left', on='minute')
        dasf['ev'] = np.where(dasf['1. open'].notnull(), 'False', 'True')
        time = []
        open = []
        high = []
        low = []
        close = []
        volume = []
        empty_value = []
        for ib in range(len(dasf)):
            time.append(dasf['minute'][ib])
            open.append(dasf['1. open'][ib])
            high.append(dasf['2. high'][ib])
            low.append(dasf['3. low'][ib])
            close.append(dasf['4. close'][ib])
            volume.append(dasf['5. volume'][ib])
            empty_value.append(dasf['ev'][ib])
        time_df = pd.DataFrame(time).rename(columns={0: 'Time'})
        open_df = pd.DataFrame(open).rename(columns={0: 'Open'})
        high_df = pd.DataFrame(high).rename(columns={0: 'High'})
        low_df = pd.DataFrame(low).rename(columns={0: 'Low'})
        close_df = pd.DataFrame(close).rename(columns={0: 'Close'})
        volume_df = pd.DataFrame(volume).rename(columns={0: 'Volume'})
        empty_value_df = pd.DataFrame(empty_value).rename(columns={0: 'Empty Value'})
        frames = [time_df, open_df, high_df, low_df, close_df, volume_df, empty_value_df]
        df = pd.concat(frames, axis=1, join='inner')
        df = df.set_index('Time')
        ad = df.to_csv()
        csv_make(namesym, date, ad)
        v += 1
    i += 1
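For what it's worth, the TypeError almost certainly comes from open = [] inside the inner loop: it rebinds the module-level name open, so the next time csv_make runs, open(...) finds that list instead of the built-in function, and a list is not callable. (The parameter named list in num_el shadows a built-in the same way, though harmlessly here.) A minimal sketch of the tail of the inner loop that avoids the shadowing, assuming the same dasf columns as above:

# select and rename the columns directly instead of building per-column lists
col_map = {'minute': 'Time', '1. open': 'Open', '2. high': 'High',
           '3. low': 'Low', '4. close': 'Close', '5. volume': 'Volume',
           'ev': 'Empty Value'}
out = dasf[list(col_map)].rename(columns=col_map).set_index('Time')
csv_make(namesym, date, out.to_csv())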
import pandas as pd
import time
import yfinance as yf
import money_18
import talib

def backtest(df, us_code, profit_target, stop_loss, macd_diff):
    pos_opened = False
    open_price = 0
    close_price = 0
    pnl = 0
    pnl_list = []
    original_capital = 100000
    temp_capital = original_capital
    num_of_lot = 0
    equity_value = 0
    equity_value_list = []
    dd_dollar = 0
    dd_dollar_list = []
    dd_pct = 0
    dd_pct_list = []
    mdd_dollar = 0
    mdd_pct = 0
    total_profit = 0
    num_of_trade = 0
    for i in range(1, len(df)):
        now_date = df.loc[i, 'Date']
        now_open = df.loc[i, 'Open']
        now_high = df.loc[i, 'High']
        now_low = df.loc[i, 'Low']
        now_close = df.loc[i, 'Close']
        now_rsi = df.loc[i, 'RSI']
        now_upper_band = df.loc[i, 'Upper_Band']
        now_middle_band = df.loc[i, 'Middle_Band']
        now_lower_band = df.loc[i, 'Lower_Band']
        now_macd = df.loc[i, 'MACD']
        now_macd_signal = df.loc[i, 'MACD_Signal']
        now_macd_hist = df.loc[i, 'MACD_Hist']
        ##### equity curve #####
        equity_value = round(temp_capital + (now_open - open_price) * num_of_lot)
        equity_value_list.append(equity_value)
        temp_max_equity = max(equity_value_list)
        dd_dollar = temp_max_equity - equity_value
        dd_dollar_list.append(dd_dollar)
        mdd_dollar = max(dd_dollar_list)
        dd_pct = (temp_max_equity - equity_value) / temp_max_equity
        dd_pct_list.append(dd_pct)
        mdd_pct = max(dd_pct_list)
        ##### open position #####
        if (pos_opened == False) and (i < len(df) - 1) and now_macd_hist > macd_diff:
            pos_opened = True
            open_price = now_close
            num_of_lot = temp_capital // (open_price)
        ##### profit taking and stop loss #####
        if (pos_opened == True) and ((now_open - open_price > profit_target * open_price) or (now_open - open_price < stop_loss * open_price) or (i == len(df) - 1)):
            pos_opened = False
            close_price = now_open
            pnl = (close_price - open_price) * num_of_lot
            pnl_list.append(pnl)
            open_price = 0
            num_of_lot = 0
            temp_capital = temp_capital + pnl
    if len(pnl_list) > 0:
        total_profit = sum(pnl_list)
        num_of_trade = len(pnl_list)
    return us_code, profit_target, stop_loss, total_profit, num_of_trade, mdd_dollar, mdd_pct, macd_diff

if __name__ == '__main__':
    us_code_list = ['TSLA', 'AAPL']
    macd_diff_list = [0, 0.05]
    profit_target_list = [0.03, 0.06]
    stop_loss_list = [-0.01, -0.02, -0.03]
    start_date = '2020-01-01'
    end_date = '2020-12-31'
    df_dict = {}
    for us_code in us_code_list:
        df = yf.Ticker(us_code).history(start=start_date, end=end_date)
        df = df[df['Volume'] > 0]
        df = df[['Open', 'High', 'Low', 'Close']]
        df['RSI'] = talib.RSI(df['Close'], timeperiod=14)
        df['Upper_Band'], df['Middle_Band'], df['Lower_Band'] = talib.BBANDS(df['Close'], 20, 2, 2)
        df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = talib.MACD(df['Close'], fastperiod=12, slowperiod=26,
                                                                    signalperiod=9)
        df = df[df['MACD_Hist'].notna()]
        df = df.reset_index()
        df_dict[us_code] = df
    save_us_code = ''
    save_macd_diff = 0
    save_profit_target = 0
    save_stop_loss = 0
    total_profit = 0
    num_of_trade = 0
    mdd_dollar = 0
    mdd_pct = 0
    save_us_code_list = []
    save_macd_diff_list = []
    save_profit_target_list = []
    save_stop_loss_list = []
    total_profit_list = []
    num_of_trade_list = []
    mdd_dollar_list = []
    mdd_pct_list = []
    result_dict = {}
    for us_code in us_code_list:
        for macd_diff in macd_diff_list:
            for profit_target in profit_target_list:
                for stop_loss in stop_loss_list:
                    print(us_code, macd_diff, profit_target, stop_loss)  ## the problem should be starting from here ##
                    save_us_code, save_profit_target, save_stop_loss, total_profit, num_of_trade, mdd_dollar, mdd_pct, macd_diff = backtest(df, us_code, profit_target, stop_loss, macd_diff)
                    save_us_code_list.append(save_us_code)
                    save_profit_target_list.append(save_profit_target)
                    save_stop_loss_list.append(save_stop_loss)
                    total_profit_list.append(total_profit)
                    num_of_trade_list.append(num_of_trade)
                    mdd_dollar_list.append(mdd_dollar)
                    mdd_pct_list.append(mdd_pct)
                    macd_diff_list.append(macd_diff)
I am working on an algo-trading program, and I created a for loop to feed my parameters into my backtest function. However, the for loop keeps looping non-stop.
I think the problem starts from "for macd_diff in macd_diff_list:", because when I try to print the result below that row, the output is already endless.
Now that you've shown the full code, your problem is obvious. Your original example didn't show the issue because you didn't include all relevant code. Here's your example with the relevant code that's causing the issue:
for us_code in us_code_list:
    for macd_diff in macd_diff_list:
        for profit_target in profit_target_list:
            for stop_loss in stop_loss_list:
                ...  # irrelevant code not shown
                macd_diff_list.append(macd_diff)
The issue is that you're looping through each item in macd_diff_list, but then for each loop iteration, you add an item to that list. So of course the loop will be infinite. You need to be looping through a different list, or adding items to a different list.
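A minimal sketch of a corrected inner loop, assuming the intent is to collect each run's parameters and results in the save_* lists that already exist in your code (note it also passes df_dict[us_code] rather than df, since after the data-preparation loop df only holds the last ticker):

for us_code in us_code_list:
    for macd_diff in macd_diff_list:
        for profit_target in profit_target_list:
            for stop_loss in stop_loss_list:
                (save_us_code, save_profit_target, save_stop_loss, total_profit,
                 num_of_trade, mdd_dollar, mdd_pct, save_macd_diff) = backtest(
                    df_dict[us_code], us_code, profit_target, stop_loss, macd_diff)
                save_us_code_list.append(save_us_code)
                save_macd_diff_list.append(save_macd_diff)  # a result list, not the list being iterated
                save_profit_target_list.append(save_profit_target)
                save_stop_loss_list.append(save_stop_loss)
                total_profit_list.append(total_profit)
                num_of_trade_list.append(num_of_trade)
                mdd_dollar_list.append(mdd_dollar)
                mdd_pct_list.append(mdd_pct)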
This is the output of my Python script so far.

[Excel table screenshot]

The vertical axis of the table contains road names. The horizontal axis contains dates. The values indicate whether a road was under construction at the time and why. I'd like to make a line graph that groups the dates by year (2017, 2018, 2019, etc.) and plots, for each of those years, the longest amount of time a road was under construction along with the average amount for the whole year. I'm a complete novice in Excel and don't know how to leverage its features to achieve my goal, though I suspect there may be built-in functions that do what I want without much difficulty. Any suggestions on how I can achieve my desired output would be much appreciated. EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp

workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})
sheet = workBook.add_worksheet()

def to_raw(string):
    return fr"{string}"

def cvrt(x):
    ans = re.split(r'(\d+)(?!.*\d)', x)
    return int(ans[1])

def indexer(s):
    pattern = re.compile(r'I, [0-9]+, ')
    gm = re.split(pattern, s)
    values = s.rsplit(gm[1])
    gm = gm[1]
    values[1] = gm
    return values

def int2Date(x):
    string = str(x)
    Y = int(string[0:4])
    M = int(string[4:6])
    D = int(string[6:8])
    return DATE(Y, M, D)

def dDelta(x, y):
    string1 = str(x)
    string2 = str(y)
    Y1 = int(string1[0:4])
    M1 = int(string1[4:6])
    D1 = int(string1[6:8])
    Y2 = int(string2[0:4])
    M2 = int(string2[4:6])
    D2 = int(string2[6:8])
    f_date = DATE(Y1, M1, D1)
    l_date = DATE(Y2, M2, D2)
    delta = l_date - f_date
    if isinstance(y, int):
        return float(int((delta.days) / 30.44))
    else:
        return int((delta.days) / 30.44)

def Book(path):
    file = open(path, 'r')
    lines = file.readlines()
    file.close()
    book = IndexedOrderedDict()
    for line in lines:
        if re.match("I", line):
            IDs = indexer(line)[1]
        if re.match(" 0.00,", line):
            rID = line
        # "GM_FINAL_AUTH,0,[1-9]"
        if re.search("GM_FINAL_AUTH,0,[1-9]", line):
            book.update({(rID, line): to_raw(IDs)})
    return sort_book(book)

def dUpdate(dic, key, value):
    return dic.update({(key[0], "GM_FINAL_AUTH,0,0"): value})

def valSplt(s):
    pattern = re.compile(r'(\d+)')
    gm = re.split(pattern, s)
    values = s.rsplit(gm[1])
    gm = gm[1]
    values[1] = gm
    return values

def sort_book(book):
    book = natsorted([value, key] for key, value in book.items())
    book = IndexedOrderedDict((data[1], data[0]) for data in book)
    return book

def alph_order(word1, word2):
    for i in range(min(len(word1), len(word2))):
        if ord(word1[i]) == ord(word2[i]):
            pass
        elif ord(word1[i]) > ord(word2[i]):
            return word2
        else:
            return word1
    return word1

def read(cpdm, date_list):
    sCnt = [0] * len(cpdm)
    lowest_number = 999999999999
    terminationCondition = [True] * len(cpdm)
    saved_results = [0] * len(cpdm)
    current_prefix = None
    cnt = 0
    while any(terminationCondition) is True:
        saved_results = [0] * len(cpdm)
        last_prefix = None
        lowest_number = 999999999999
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                ID = cpdm[dicIdx].values()[dicVal]
                # print(entry)
                current_prefix, road_number = valSplt(ID)
                road_number = int(road_number)
                if last_prefix is None:
                    last_prefix = current_prefix
                higherOrder_prefix = alph_order(last_prefix, current_prefix)
                # print('check:', [higherOrder_prefix, last_prefix, current_prefix])
                if current_prefix == higherOrder_prefix:
                    if current_prefix != last_prefix:
                        lowest_number = road_number
                        last_prefix = current_prefix
                    elif road_number < lowest_number:
                        lowest_number = road_number
                        last_prefix = current_prefix
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                # print(dicIdx, dicVal, len(cpdm[dicIdx]))
                ID = cpdm[dicIdx].values()[dicVal]
                VALUE = cpdm[dicIdx].keys()[dicVal]
                # print(entry)
                road_name, road_number = valSplt(ID)
                road_number = int(road_number)
                if road_name == last_prefix and lowest_number == road_number:
                    saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
                    if dicVal < len(cpdm[dicIdx]):
                        sCnt[dicIdx] += 1
                    else:
                        terminationCondition[dicIdx] = False
            else:
                terminationCondition[dicIdx] = False
        for rst in range(len(saved_results)):
            if saved_results[rst] == 0:
                pass
            else:
                sheet.write(cnt + 1, 0, str(saved_results[rst][0]))
                sheet.write(cnt + 1, rst + 1, cvrt(saved_results[rst][1]))
                # sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
                # sheet.write(cnt+1, 0, saved_results[rst][3])
        cnt += 1

def main():
    # 2018 MAPS
    path1 = "W:\\Scripting\\2018\\DBData_84577881.txt"
    path2 = "W:\\Scripting\\2018\\DBData_84639568.txt"
    path3 = "W:\\Scripting\\2018\\DBData_84652483.txt"
    path4 = "W:\\Scripting\\2018\\DBData_84670490.txt"
    # 2019 MAPS
    path5 = "W:\\Scripting\\2019\\DBData_84706383.txt"
    path6 = "W:\\Scripting\\2019\\DBData_84715201.txt"
    path7 = "W:\\Scripting\\2019\\DBData_84743195.txt"
    path8 = "W:\\Scripting\\2019\\DBData_84777742.txt"
    path9 = "W:\\Scripting\\2019\\DBData_84815446.txt"
    path10 = "W:\\Scripting\\2019\\DBData_84835743.txt"
    # 2020 MAPS
    path11 = "W:\\Scripting\\2020\\DBData_84882849.txt"
    path12 = "W:\\Scripting\\2020\\DBData_84966202.txt"
    path13 = "W:\\Scripting\\2020\\DBData_84988789.txt"
    p_list = [path1, path2, path3, path4, path5, path6, path7,
              path8, path9, path10, path11, path12, path13]
    pool = mp.Pool(mp.cpu_count())
    CPDM = pool.map(Book, p_list)
    pool.close()
    # pool.join()
    date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
                 20190501, 20190628, 20190815, 20190925, 20200207, 20200501, 20200617]
    # CPDM = [b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13]
    for i in CPDM:
        print(len(i))
    # sheet.write("A1", "Lat Long")
    sheet.write("A1", "ID")
    # for i in range(len(CPDM)):
    cn = 0
    for i in date_list:
        # sheet.write(0, 3*i+1, "ID" + str(i+1))
        sheet.write(0, cn + 1, int2Date(i), format1)
        cn += 1
        # sheet.write(0, 2*i+3, "Date" + str(i+1))
    read(CPDM, date_list)
    workBook.close()

if __name__ == "__main__":
    main()
    executionTime = (time.time() - startTime)
    print('Execution time in minutes: ' + str(executionTime / 60))
Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices from your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those 2 dates as closed? Or only until new years?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px
# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")
# Flip the dataframe (dates should be on the index)
df = df.transpose()
# Fill any empty cells with 0
df = df.fillna(0)
# Combine columns with the same name
df = df.groupby(df.columns, axis=1).agg(lambda column: column.max(axis=1))
# Make sure the dates are sorted
df = df.sort_index()
# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
    # Group by consecutive 1's
    groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
    # Every group denotes a period for which the road was under construction
    for _, group in groups:
        # Get the start and finish for each group
        roads.append({
            "road": road_name,
            "start": group.index[0],
            "finish": group.index[-1] + pandas.Timedelta(1, unit="D"),  # add one day, because groups with the same start and finish would not be visible on the plot
        })
# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)
# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.show()
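If you then want the numbers the question asks for, roads_df already contains everything needed. A small sketch (assuming the column headers were parsed as real dates, which the Timedelta arithmetic above already requires, and attributing each period to the year it starts in, which is one of the choices you'll have to make):

# duration of each construction period, in days
roads_df["days"] = (roads_df["finish"] - roads_df["start"]).dt.days
# attribute each period to the year it starts in (a simplifying choice)
roads_df["year"] = roads_df["start"].dt.year
# longest single period and average period length per year
print(roads_df.groupby("year")["days"].agg(longest="max", average="mean"))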
I am trying to write the results from the loop into an Excel file (keys = column names and values = row data). This code generates the file for me, but it only prints one row of data into the file. How can I make it append the other rows too?
import pandas as pd
import requests
import json

# BearerAuth is a custom auth helper, assumed to be defined elsewhere
p = (('BusinessName', 'CustomerNameToSearch'), ('PageSize', '2'), ('CountryCode', 'CA'))
prepare_link = requests.get('https://api.myapiloopuplink?', auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
test = requests.get(prepare_link.url, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
data = json.loads(test.text)
CustomerIdList = []
for customer in data['Data']:
    BusinessID = customer['BusinessId']
    BusinessName = customer['BusinessName']
    CustomerIdList.append(str(customer['BusinessId']))
for i in CustomerIdList:
    links2 = ("https://api.myapiloopuplink/" + i + "/History?count=1")
    test2 = requests.get(links2, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'))
    data2 = json.loads(test2.text)
    start_row = 0
    for extradetails in data2['Data']:
        myDict = {}
        myDict["BusinessId"] = customer['BusinessId']
        myDict["BusinessName"] = customer['BusinessName']
        myDict["Year"] = extradetails['Year']
        myDict["Rate"] = extradetails['Rate']
        print(myDict)
        k = list(myDict.keys())
        v = list(myDict.values())
        # print(k)
        # print(v)
        x = [myDict]
        df = pd.DataFrame(x)
        df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False, startrow=start_row)
        start_row = start_row + len(df) + 1
This is the output I currently get.
This is the output I am trying to get.
In the loop I get the right results when I print (it shows multiple rows):
print(myDict)
I think the problem is here:
for extradetails in data2['Data']:
    myDict = {}
    myDict["BusinessId"] = customer['BusinessId']
    myDict["BusinessName"] = customer['BusinessName']
    myDict["Year"] = extradetails['Year']
    myDict["Rate"] = extradetails['Rate']
    print(myDict)
    k = list(myDict.keys())
    v = list(myDict.values())
    # print(k)
    # print(v)
    x = [myDict]
    df = pd.DataFrame(x)  # problem
    df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False, startrow=start_row)  # problem
    start_row = start_row + len(df) + 1
You are creating an Excel file on every iteration of the loop. How about creating the Excel file once, after the loop completes? Like this:
datas = []
for extradetails in data2['Data']:
    myDict = {}
    myDict["BusinessId"] = customer['BusinessId']
    myDict["BusinessName"] = customer['BusinessName']
    myDict["Year"] = extradetails['Year']
    myDict["Rate"] = extradetails['Rate']
    print(myDict)
    datas.append(myDict)  # append the dict itself, not a list wrapping it
df = pd.DataFrame(datas)
df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False)
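One caveat with the snippet above: datas would still be re-created for every customer in the outer for i in CustomerIdList: loop, so the file would only contain the last customer's rows, and myDict reads from customer, which is just the leftover variable from the earlier loop. A sketch of the full structure, using the same assumed API and field names copied from your code:

datas = []
for i in CustomerIdList:
    links2 = "https://api.myapiloopuplink/" + i + "/History?count=1"
    test2 = requests.get(links2, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'))
    data2 = json.loads(test2.text)
    for extradetails in data2['Data']:
        datas.append({
            "BusinessId": i,  # the id for this iteration, not the stale `customer`
            "Year": extradetails['Year'],
            "Rate": extradetails['Rate'],
        })
df = pd.DataFrame(datas)
df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False)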
I'm new to coding. When I attempt to run this it says:
NameError: name 'data' is not defined.
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import datetime
import json
from bs4 import BeautifulSoup
import requests
import time
def fetchCryptoClose(fsym, tsym):
    # function fetches the close-price time-series from cryptocompare.com
    # it may ignore USDT coin (due to near-zero pricing)
    # daily sampled
    cols = ['date', 'timestamp', fsym]
    lst = ['time', 'open', 'high', 'low', 'close']
    timestamp_today = datetime.today().timestamp()
    curr_timestamp = timestamp_today
    for j in range(2):
        df = pd.DataFrame(columns=cols)
        url = "https://min-api.cryptocompare.com/data/histoday?fsym=" + fsym + \
              "&tsym=" + tsym + "&toTs=" + str(int(curr_timestamp)) + "&limit=3"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        dic = json.loads(soup.prettify())
        for i in range(1, 4):
            tmp = []
            for e in enumerate(lst):
                x = e[0]
                y = dic['Data'][i][e[1]]
                if (x == 0):
                    tmp.append(str(timestamp2date(y)))
                tmp.append(y)
            if (np.sum(tmp[-4::]) > 0):  # remove for USDT
                tmp = np.array(tmp)
                tmp = tmp[[0, 1, 4]]  # filter solely for close prices
                df.loc[len(df)] = np.array(tmp)
        # ensure a correct date format
        df.index = pd.to_datetime(df.date, format="%Y-%m-%d")
        df.drop('date', axis=1, inplace=True)
        curr_timestamp = int(df.ix[0][0])
        if (j == 0):
            df0 = df.copy()
        else:
            data = pd.concat([df, df0], axis=0)
    data.drop("timestamp", axis=1, inplace=True)
    return data  # DataFrame
# N-Cryptocurrency Portfolio (tickers)
fsym = ['BTC', 'ETH', 'XRP', 'LTC', 'DASH', 'XMR', 'ETC', 'MAID', 'XEM', 'REP']
# vs.
tsym = 'USD'
for e in enumerate(fsym):
    print(e[0], e[1])
    if (e[0] == 0):
        try:
            data = fetchCryptoClose(e[1], tsym)
        except:
            pass
    else:
        try:
            data = data.join(fetchCryptoClose(e[1], tsym))
        except:
            pass
# ensure values to be floats
# save portfolio to a file (HDF5 file format)
store = pd.HDFStore('portfolio2.h5')
store['data'] = data
store.close()
# read in your portfolio from a file
df = pd.read_hdf('portfolio2.h5', 'data')
print(df)
Don't use try-except-pass, because it will silence all your exceptions and you might never actually create `data`.
Replace this code:
for e in enumerate(fsym):
    print(e[0], e[1])
    if (e[0] == 0):
        try:
            data = fetchCryptoClose(e[1], tsym)
        except:
            pass
    else:
        try:
            data = data.join(fetchCryptoClose(e[1], tsym))
        except:
            pass
with this:
for e in enumerate(fsym):
    print(e[0], e[1])
    if (e[0] == 0):
        data = fetchCryptoClose(e[1], tsym)
    else:
        data = data.join(fetchCryptoClose(e[1], tsym))
and see where your real exceptions are.
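If some symbols are expected to fail and the loop should keep going, an alternative (a sketch, not part of the original code) is to catch narrowly and report the failure instead of silently passing:

data = None
for sym in fsym:
    try:
        closes = fetchCryptoClose(sym, tsym)
    except Exception as exc:  # still broad, but the failure is visible now
        print("skipping", sym, "->", repr(exc))
        continue
    data = closes if data is None else data.join(closes)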
I am new to Python, but I hope I can explain the issue.
dfrow is a dictionary holding a single regression summary.
results is an empty DataFrame with the same columns as dfrow.
I would like to save the regression results for each observation in the outer loop, using the inner loop to keep the column order consistent. I get a result for the first observation but cannot move further; the error says:
Traceback (most recent call last):
File "<stdin>", line 109, in <module>
TypeError: 'numpy.int64' object is not iterable
when I run this code
import pandas as pd
import numpy as np
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.stats import stattools as st
import statsmodels.api as sm
import collections
import datetime
import warnings
import scipy.stats
df_rent = import_rents()
df_return = import_ee_rets()
mostrecent = df_return.iloc[len(df_return) - 1]
mostrecentYYYY = mostrecent['Year']
mostrecentQ = mostrecent['Quarter']
mostrecentperiod = str(mostrecentYYYY) + "-Q" + str(mostrecentQ)
rentcols = df_rent.columns.values
colnames = []
#loop through the columns in df_rent until the column == the most recent period for which we have ee return data
for colname in rentcols:
    if colname != mostrecentperiod:
        colnames.append(colname)
    else:
        colnames.append(colname)
        break
rentcols = colnames
#subset df_rent to only include columns that also have ee return data
df_rent = df_rent[rentcols]
#change dtype of metro_code / metro columns to string for matching later
df_rent['metro_code'] = df_rent['metro_code'].apply(str)
df_return['Metro'] = df_return['Metro'].apply(str)
df = pd.read_csv('//x/Project/_data/raw_data/rent_change.csv')
metros = list(np.unique(df['metro_code']))
regress_result_names = [
    'metro',
    'num_lag',
    'num_ma',
    'num_AR',
    'beta_x1_retmov',
    'x1_se',
    'x1_tstat',
    'x1_pval',
    'r-squared',
    'reg_fstat',
    'fstat_pvalue',
    'durbin-watson',
    'resid_var']
regress_result_names = pd.Series(regress_result_names)
results = pd.DataFrame(columns=regress_result_names)
row = 0
for metro in metros:
    for nlag in range(0, 5):
        for nma in range(1, 11):
            for AR in range(1, 5):
                y = df_rent[df_rent['metro_code'] == str(metro)]
                y = y.values.tolist()
                y = y[0]
                # delete first two columns of df_rent (they don't contain numeric data)
                y.pop(0)
                y.pop(0)
                # y = rent time series data for specific metro
                y = pd.Series(y)
                # x1 = lagged moving average data for given params
                df_return1 = df_return[df_return['Metro'] == str(metro)]
                df_return1 = df_return1.reset_index(drop=True)
                x1 = lagged_moving_avg(df=df_return1, metro_code=metro, nlag=nlag, nma=nma)
                # y and x1 dataframe
                y_label = 'y_Rent'
                x_lagMA_label = 'x1_LaggedMA'
                df1 = pd.DataFrame()
                df1[y_label] = y
                df1[x_lagMA_label] = x1
                if mostrecentQ == 1:
                    currmonth = "01"
                elif mostrecentQ == 2:
                    currmonth = "04"
                elif mostrecentQ == 3:
                    currmonth = "07"
                else:
                    currmonth = "10"
                # convert index to datetime to run the regressions
                currpd = pd.to_datetime((str(mostrecentYYYY) + currmonth), format='%Y%m')
                df1.index = pd.date_range(*(pd.to_datetime(['1990-01', currpd]) + pd.offsets.QuarterEnd()), freq='Q')
                # drop any rows that have missing observations
                df1 = df1.dropna()
                # df1.to_csv('//Nisfile01/x/Project - Real Estate Database/real_estate/odil/XandY.csv', index=True)
                reg = ARIMA(endog=df1[y_label], order=(AR, 0, 0)).fit(trend='nc', disp=0, tol=1e-20)
                resid_reg = reg.resid
                reg2 = sm.OLS(resid_reg, df1[x_lagMA_label]).fit()
                resid_reg2 = reg2.resid
                dfrow = {
                    'metro': metro,
                    'num_lag': nlag,
                    'num_ma': nma,
                    'num_AR': AR,
                    'beta_x1_retmov': reg2.params[0],
                    'x1_se': reg2.bse[0],
                    'x1_tstat': reg2.tvalues[0],
                    'x1_pval': reg2.pvalues[0],
                    'r-squared': reg2.rsquared,
                    'reg_fstat': reg2.fvalue,
                    'fstat_pvalue': reg2.f_pvalue,
                    'durbin-watson': st.durbin_watson(reg2.resid),
                    'resid_var': resid_reg2.var(),
                }
                # create df for output called results
                for key in dfrow.keys():
                    results.loc[row, key] = list(dfrow[key])
                row = row + 1
Any help is very much appreciated.
P.S. Sorry for the messy code
The offending line is results.loc[row, key] = list(dfrow[key]).
You are trying to convert a single value, in this case a numpy.int64 object, to a list. I assume that what you're trying to do, and correct me if I am wrong, is create a singleton list with the int64 inside it. If that's what you want to do, you should use:
results.loc[row, key] = [dfrow[key]]
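For what it's worth, if each cell should simply hold the scalar, you can also assign it directly without any wrapping, or append the whole summary as one row (a sketch, assuming dfrow contains only scalars):

# assign the scalar directly
results.loc[row, key] = dfrow[key]

# or append the entire summary dict as a single row
results = pd.concat([results, pd.DataFrame([dfrow])], ignore_index=True)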