Python dictionary comprehension Scoping - python

I am running the code below and getting a NameError.
When I run it one line at a time, it works, but when I wrap the lines inside a function, I get NameError: name 'primes_cols' is not defined. Why is the code below producing a NameError when the variables are defined?
import pandas as pd
# Three one-row example frames, defined at module level.
primes = pd.DataFrame(columns = ['A','B','C','D'], data=[[3,5,7,11]])
tens = pd.DataFrame(columns = ['E','F','G','H'], data=[[10,20,30,40]])
evens = pd.DataFrame(columns = ['I','J','K','L'], data=[[4,8,12,16]])
def process():
    """Look up each frame by name and trim it to its ``*_cols`` columns.

    As posted, this raises NameError on the second comprehension (see the
    comment on that line).
    """
    keys = ['primes','tens','evens']
    # My Data
    # eval(key) resolves here because primes/tens/evens are module-level names.
    data = {key:eval(key) for key in keys}
    # Trim Dataframes
    primes_cols = ['A','B']
    tens_cols = ['E','F']
    evens_cols = ['I','J']
    # This is the line that raises NameError: the *_cols lists are locals of
    # process(), and eval() called from inside the comprehension does not see
    # them. NOTE(review): presumably because the comprehension runs in its own
    # scope on the interpreter used here -- confirm for the Python version.
    data = {key:value[eval(key + '_cols')] for (key, value) in data.items()}
    return data
df = process()

The best approach, as suggested by @juanpa.arrivillaga in the comments, is to avoid eval:
import pandas as pd
# Three one-row example frames, defined at module level.
primes = pd.DataFrame(columns=['A', 'B', 'C', 'D'], data=[[3, 5, 7, 11]])
tens = pd.DataFrame(columns=['E', 'F', 'G', 'H'], data=[[10, 20, 30, 40]])
evens = pd.DataFrame(columns=['I', 'J', 'K', 'L'], data=[[4, 8, 12, 16]])


def process():
    """Return the module-level frames trimmed to their columns of interest.

    Returns:
        dict: maps 'primes', 'tens' and 'evens' (in that order) to the
        corresponding DataFrame restricted to its two selected columns.
    """
    # Pair every frame with its column list explicitly: no eval(), and no
    # parallel lists that could drift out of step with each other.
    selections = {
        'primes': (primes, ['A', 'B']),
        'tens': (tens, ['E', 'F']),
        'evens': (evens, ['I', 'J']),
    }
    return {name: frame[cols] for name, (frame, cols) in selections.items()}

Related

Convert DF column values to column (like pivot)

I am scraping api data and totaling counts of different values into a dictionary 'building':'count' for each player (row). I would like to be able to analyze it further. An easy solution would be to pull the different unique 'buildings' (dictionary keys within the row) as dataframe columns and then do the equivalent of an index/match/match on them. The script currently gets the data, and I can extract the unique keys, but I am lost at how to make them into DF columns and then how to do the index/match/match. There may be a better approach from even before running the 'count' part of the script.
You should be able to run the script, no credentials are required to GET against the API. If you see the ranklist DF column with the building counts you will see what I am referencing.
Thank you for any guidance!
import requests
import pandas as pd
from datetime import datetime
from datetime import date
from datetime import timedelta
import operator
from time import sleep
# Step 1: download the ranking pages and keep company/id/rank/value per row.
ranklist = pd.DataFrame()
for i in range(430):
    baserank_url = 'https://www.simcompanies.com/api/v3/encyclopedia/ranking/' + str(i) + '/'
    r = requests.get(baserank_url)
    rank_json = r.json()
    df = pd.DataFrame.from_dict(rank_json)
    df=df.filter(['company','id','rank','value'])
    # NOTE(review): DataFrame.append was removed in pandas 2.0 -- confirm the
    # pandas version this is run with.
    ranklist = ranklist.append(df)
ranklist.to_csv(r'rank_dataframe.csv',index=False)
print('Ranking list started succesfully!')
# Step 2: fetch each ranked player's account and count buildings by name.
levellist=[]
bcolist=[]
today= date.today()
for row in ranklist.itertuples():
    attempt = 0
    # Retry loop: leaves on the first success, or once six attempts have failed.
    while True:
        if attempt == 6:
            break
        try:
            print(str(row.rank + 1) +' ' + str(attempt))
            account_url = 'https://www.simcompanies.com/api/v2/players/' + str(row.id) + '/'
            r = requests.get(account_url)
            account_json = r.json()
            playerid = account_json.get("player").get("id")
            playerlevel = account_json.get("player").get("level")
            # Only the first 10 characters (YYYY-MM-DD) of dateJoined are parsed.
            datestart = datetime.strptime(account_json.get("player").get("dateJoined")[:10],'%Y-%m-%d').date()
            yearsactive = round((today - datestart)/ timedelta(days=365.2425),2)
            buildings = account_json.get("buildings")
            certificates = account_json.get("certificates")
            bnames = [d['name'] for d in buildings]
            # Park, Lake and Castle are all counted under the name 'Recreation'.
            bnames = [n.replace('Park','Recreation').replace('Lake','Recreation').replace('Castle','Recreation') for n in bnames]
            cnames = [d['name'] for d in certificates]
            sptr = 'Yes' if 'Supporter' in cnames else 'No'
            # Tally how many times each building name occurs for this player.
            dictOfElems = dict()
            for elem in bnames:
                if elem in dictOfElems:
                    dictOfElems[elem] += 1
                else:
                    dictOfElems[elem] = 1
            blist = {key:value for key, value in dictOfElems.items()}
            # Order the tally by count, largest first.
            blist = dict(sorted(blist.items(),key=operator.itemgetter(1),reverse=True))
            bcolist.append([blist.keys()])
            levellist.append([playerid, playerlevel,sptr, datestart,yearsactive,blist])
        except:
            # Any exception at all: wait 20 seconds, count the attempt, retry.
            sleep(20)
            attempt +=1
            continue
        break
#get unique building values
bcodf= pd.DataFrame(bcolist,columns=['buildings'])
bcouni = list(set([a for b in bcodf.buildings.tolist() for a in b]))
print(bcouni)
# One row per player; the 'blist' column holds that player's {building: count} dict.
leveldf = pd.DataFrame(levellist,columns=['id','level','sptr','datestart','yearsactive','blist'])
#clist = list(set([a for b in leveldf.cnames.tolist() for a in b]))
#print(leveldf[blist])
#bul = leveldf[blist].keys()
#buniq = list(set([a for b in leveldf.bul.tolist() for a in b]))
#print(bul)
# Attach the per-player details to the ranking rows, matching on 'id'.
ranklist = ranklist.merge(leveldf, on='id', how='left')
ranklist['rank'] +=1
ranklist.to_csv(r'rank_dataframe.csv',index=False)

Python generated Excel file only shows one row of data vs multiple rows

I am trying to write the results from the loop into an Excel file (keys = column names, values = row data). This code generates the file for me, but it only writes one row of data to the file. How can I make it append the other rows to the file?
import pandas as pd
# Query parameters for the customer search request.
p = (('BusinessName', 'CustomerNameToSearch'), ('PageSize', '2'), ('CountryCode', 'CA'))
prepare_link = requests.get('https://api.myapiloopuplink?', auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
test = requests.get(prepare_link.url, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
data = json.loads(test.text)
# Collect the id of every customer returned by the search.
CustomerIdList = []
for customer in data['Data']:
    BusinessID = customer['BusinessId']
    BusinessName = customer['BusinessName']
    CustomerIdList.append(str(customer['BusinessId']))
# Fetch the history for each collected customer id.
for i in CustomerIdList:
    links2 = ("https://api.myapiloopuplink/"+i+"/History?count=1")
    test2 = requests.get(links2, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'))
    data2 = json.loads(test2.text)
    start_row = 0
    for extradetails in data2['Data']:
        myDict = {}
        # NOTE(review): `customer` is whatever the earlier loop left bound,
        # presumably the last customer -- verify the intended nesting.
        myDict["BusinessId"] = customer['BusinessId']
        myDict["BusinessName"] = customer['BusinessName']
        myDict["Year"] = extradetails['Year']
        myDict["Rate"] = extradetails['Rate']
        print(myDict)
        k = list(myDict.keys())
        v = list(myDict.values())
        #print(k)
        #print(v)
        x = [myDict]
        # A one-row frame is built and written to the same file on every
        # iteration, which is why only one row ends up in the workbook.
        df = pd.DataFrame(x)
        df.to_excel ('locationandnameoffile.xlsx', sheet_name = 'sheet1', index = False, startrow=start_row)
        start_row = start_row + len(df) + 1
This is the output i currently get
This is the output i am trying to get
In the loop i get the right results when i print (it shows multiple rows)
print(myDict)
I think the problem is here:
# Quoted from the question; the two lines marked #problem run on every iteration.
for extradetails in data2['Data']:
    myDict = {}
    myDict["BusinessId"] = customer['BusinessId']
    myDict["BusinessName"] = customer['BusinessName']
    myDict["Year"] = extradetails['Year']
    myDict["Rate"] = extradetails['Rate']
    print(myDict)
    k = list(myDict.keys())
    v = list(myDict.values())
    #print(k)
    #print(v)
    x = [myDict]
    df = pd.DataFrame(x) #problem
    df.to_excel ('locationandnameoffile.xlsx', sheet_name = 'sheet1', index = False, startrow=start_row)#problem
    start_row = start_row + len(df) + 1
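You are creating the Excel file on every iteration of the loop, so each write replaces the previous one. Instead, collect the rows inside the loop and create the Excel file once, after the loop completes, like this: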
# Collect one dict per history record, then build the DataFrame and write the
# workbook a single time after the loop. If this loop sits inside an outer
# loop over customers, create `datas` before the outermost loop and write the
# file after it, so that every customer's rows end up in the same sheet.
datas = []
for extradetails in data2['Data']:
    myDict = {
        "BusinessId": customer['BusinessId'],
        "BusinessName": customer['BusinessName'],
        "Year": extradetails['Year'],
        "Rate": extradetails['Rate'],
    }
    print(myDict)
    # Append the dict itself (not a list wrapping it) so that its keys become
    # the column names and each dict becomes one row.
    datas.append(myDict)
df = pd.DataFrame(datas)
# Written once, so no startrow bookkeeping is needed.
df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False)

Create an organized DF from a List of mixed type items (Python)

I have a list of items in a 'variable:value' format, but the same 'variable' can appear multiple times. The only thing I know is that all values that follow the 'ID' category belong to the same 'ID', so I know how many rows I need (3 in this example).
I need to create a dataframe from this list. The problem I am encountering is that I cannot add a string value to my DF ('could not convert str to float'). I am not sure how to proceed.
# Flat 'variable:value' items; every 'ID' item starts a new logical row.
mylist = ['ID:1', 'Date: Oct 2', 'B:88', 'C:noun', 'D:44', 'ID:2', 'B:55', 'C:noun', 'D:45', 'ID:3',
          'Date:Sept 5', 'B:55', 'C:verb']
# Collect the distinct variable names (the part before the colon).
categories = []
for i in mylist:
    var = i.split(":")
    categories.append(var[0])
variables = list(set(categories))
# 3 rows are hard-coded because the example contains three IDs.
df = np.empty((3,len(variables)))
df = pd.DataFrame(df)
counter = -1
for i in mylist:
    item = i.split(":")
    category = item[0]
    value = item[1]
    tracker = -1
    for j in variables:
        tracker = tracker + 1
        if j == category:
            # float() raises ValueError for non-numeric values such as 'noun';
            # its result is discarded, so `value` is still a string below.
            float(value)
            # NOTE(review): df[counter, tracker] indexes the DataFrame with a
            # tuple key, which presumably addresses a column rather than a
            # (row, column) cell -- confirm; .iloc/.loc would be the cell form.
            df[counter, tracker] = value
    if category == "ID":
        counter = counter + 1
        float(value)
        df[counter, 0] = value
In addition, I've tried converting the items in the list to dictionary, but I am not sure if that's the best way to achieve my goal:
# Second attempt: same 3-row frame, this time with the variable names as columns.
df = np.empty((3,len(variables)))
df = pd.DataFrame(df, columns = variables)
mydict = {}
counter = -1
for i in mylist:
    item = i.split(":")
    category = item[0]
    value = item[1]
    # mydict is replaced on every item, so it only ever holds one pair.
    mydict = {category:value}
    # Both branches perform the same assignment; the only difference is that
    # an 'ID' item advances counter first.
    if category == "ID":
        counter = counter + 1
        df[counter] = pd.DataFrame.from_dict(mydict)
    else:
        df[counter] = pd.DataFrame.from_dict(mydict)
Edit:
I solved it. Code below:
# Build one record (dict) per 'ID' item and create the DataFrame in a single
# call. Compared with appending row by row, this
#   * keeps the final item of mylist (the row-by-row version appended the last
#     record before merging the last item into it, so that item was lost),
#   * needs no dummy first row that has to be sliced off afterwards, and
#   * does not rely on DataFrame.append, which newer pandas no longer provides.
# The resulting rows are indexed from 0.
records = []
current = None  # record currently being filled; None until the first 'ID'
for entry in mylist:
    # partition() splits on the first ':' only, so a value that itself
    # contains a colon is kept whole.
    category, _, value = entry.partition(":")
    if category == 'ID':
        # Every 'ID' item starts a new row.
        current = {}
        records.append(current)
    if current is not None:
        # Items that appear before the first 'ID' belong to no row; skip them.
        current[category] = value
# columns=variables gives one column per category; a row that lacks a
# category gets NaN in that column.
df = pd.DataFrame(records, columns=variables)
Perhaps this works
# One row per 'key:value' item; split on the first ':' only so that values
# containing a colon stay intact.
df = pd.DataFrame([e.split(':', 1) for e in mylist],
                  columns=['key', 'value'])
# Number the logical rows: the counter increases at every 'ID' item, so all
# items that follow an ID share that ID's row number.
df['row'] = (df['key'] == 'ID').cumsum()
# Pivot with the row number as index, giving one output row per ID instead of
# one per item. Requires each key to occur at most once within an ID.
df = df.pivot(index='row', columns='key', values='value')

Script in python/pandas works but doesn't work when placed in side a function

I have this script I'm running to try to create a dataframe to summarize some statistics:
# One DataFrame per month (may..sept are defined outside this snippet) and the
# matching month numbers used as column labels.
month = [may,june,july,august,sept]
month_str = [5,6,7,8,9]
avg_age = []
avg_use = []
avg_kwh = []
avg_coll = []
avg_cred = []
for i in month:
    # Each mean excludes rows where the value is 0.
    avg_age.append(i[i['Age']!=0]['Age'].mean())
    avg_use.append(i[i['AverageBilledUsage']!=0]['AverageBilledUsage'].mean())
    avg_kwh.append(i[i['AverageKWH']!=0]['AverageKWH'].mean())
    avg_coll.append(i[i['Total Collected']!=0]['Total Collected'].mean())
    # credit_score additionally excludes the value 99999.
    avg_cred.append(i[(i['credit_score']!=0) & (i['credit_score']!=99999)]['credit_score'].mean())
# Built after the loop, once every list holds one value per month.
pd.DataFrame(data = [avg_age,avg_use,avg_kwh,avg_coll,avg_cred],columns = month_str,index = ['Age','Usage','kwh','collected','creditscore'])
It returns exactly what I want to see. But when I place it inside a function I get the following error:
AssertionError: 5 columns passed, passed data had 1 columns
Here is the code inside the function:
def get_nums():
    """Summarise per-month averages (as posted; raises AssertionError)."""
    months = [may,june,july,august,sept]
    month_str = [5,6,7,8,9]
    avg_age = []
    avg_use = []
    avg_kwh = []
    avg_coll = []
    avg_cred = []
    for i in months:
        avg_age.append(i[i['Age']!=0]['Age'].mean())
        avg_use.append(i[i['AverageBilledUsage']!=0]['AverageBilledUsage'].mean())
        avg_kwh.append(i[i['AverageKWH']!=0]['AverageKWH'].mean())
        avg_coll.append(i[i['Total Collected']!=0]['Total Collected'].mean())
        avg_cred.append(i[(i['credit_score']!=0) & (i['credit_score']!=99999)]['credit_score'].mean())
        # NOTE(review): nesting reconstructed from the reported error -- with
        # this line inside the loop, each list holds a single value on the
        # first pass while five column labels are supplied.
        this_df = pd.DataFrame(data = [avg_age,avg_use,avg_kwh,avg_coll,avg_cred],columns = month_str,index = ['Age','Usage','kwh','collected','creditscore'])
    return this_df
You have a problem with the last line of the for loop in the function. this_df is being defined in every iteration of the loop.
The corrected code is below.
def get_nums():
    """Return a summary frame of per-month averages.

    Reads the month frames may, june, july, august and sept, which are
    defined outside this function.

    Returns:
        DataFrame with one row per statistic ('Age', 'Usage', 'kwh',
        'collected', 'creditscore') and one column per month number (5-9).
    """
    months = [may, june, july, august, sept]
    month_str = [5, 6, 7, 8, 9]
    avg_age = []
    avg_use = []
    avg_kwh = []
    avg_coll = []
    avg_cred = []
    for i in months:
        # Each mean excludes rows where the value is 0.
        avg_age.append(i[i['Age'] != 0]['Age'].mean())
        avg_use.append(i[i['AverageBilledUsage'] != 0]['AverageBilledUsage'].mean())
        avg_kwh.append(i[i['AverageKWH'] != 0]['AverageKWH'].mean())
        avg_coll.append(i[i['Total Collected'] != 0]['Total Collected'].mean())
        # credit_score additionally excludes the value 99999.
        avg_cred.append(i[(i['credit_score'] != 0) & (i['credit_score'] != 99999)]['credit_score'].mean())
    # Build the frame once, AFTER the loop: only then does every list hold one
    # value per month, matching the five column labels.
    this_df = pd.DataFrame(
        data=[avg_age, avg_use, avg_kwh, avg_coll, avg_cred],
        columns=month_str,
        index=['Age', 'Usage', 'kwh', 'collected', 'creditscore'],
    )
    return this_df
Based on my understanding, you do not need the for loop here:
month = [may, june, july, august, sept]
month_str = [5, 6, 7, 8, 9]
# Stack the monthly frames; the month number becomes the outer index level.
df = pd.concat(month, keys=month_str)
# Blank out the values that must not count towards the mean. Each comparison
# needs its own parentheses: `|` binds tighter than `==`, so the unparenthesised
# form `df==0|df==99999` is parsed as `df == (0 | df) == 99999`.
# Note that this masks 0 and 99999 in every column, not only credit_score.
df = df.mask((df == 0) | (df == 99999))
# Mean per month (outer index level), transposed so that months are columns.
df.groupby(level=0).mean().T

Add each new dictionary result in the order of the columns of a dataframe

I am new to Python, but hope to explain the issue.
dfrow - is a dictionary of a single regression summary
results - is an empty dataframe with same columns as in dfrow
I would like to save the regression results for each observation in the outer loop while keeping the column order consistent in the inner loop. I get a result for the first observation but cannot get any further; the error says:
Traceback (most recent call last):
File "<stdin>", line 109, in <module>
TypeError: 'numpy.int64' object is not iterable
when I run this code
import pandas as pd
import numpy as np
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.stats import stattools as st
import statsmodels.api as sm
import collections
import datetime
import warnings
import scipy.stats
df_rent = import_rents()
df_return = import_ee_rets()
# The last row of the return data supplies the most recent year and quarter.
mostrecent = df_return.iloc[len(df_return) - 1]
mostrecentYYYY = mostrecent['Year']
mostrecentQ = mostrecent['Quarter']
mostrecentperiod = str(mostrecentYYYY) + "-Q" + str(mostrecentQ)
rentcols = df_rent.columns.values
colnames = []
#loop through the columns in df_rent until the column == the most recent period for which we have ee return data
for colname in rentcols:
    if colname != mostrecentperiod:
        colnames.append(colname)
    else:
        # Keep the matching column too, then stop.
        colnames.append(colname)
        break
rentcols = colnames
#subset df_rent to only include columns that also have ee return data
df_rent = df_rent[rentcols]
#change dtype of metro_code / metro columns to string for matching later
df_rent['metro_code'] = df_rent['metro_code'].apply(str)
df_return['Metro'] = df_return['Metro'].apply(str)
df = pd.read_csv('//x/Project/_data/raw_data/rent_change.csv')
metros = list(np.unique(df['metro_code']))
regress_result_names = [
    'metro',
    'num_lag',
    'num_ma',
    'num_AR',
    'beta_x1_retmov',
    'x1_se',
    'x1_tstat',
    'x1_pval',
    'r-squared',
    'reg_fstat',
    'fstat_pvalue',
    'durbin-watson',
    'resid_var']
regress_result_names = pd.Series(regress_result_names)
# Empty results frame whose columns are the names listed above.
results = pd.DataFrame(columns=regress_result_names)
row = 0
# One regression per (metro, nlag, nma, AR) combination.
for metro in metros:
    for nlag in range(0, 5):
        for nma in range(1, 11):
            for AR in range(1, 5):
                y = df_rent[df_rent['metro_code'] == str(metro)]
                y = y.values.tolist()
                # Take the first matching row as a plain list.
                y = y[0]
                # delete first two columns of df_rent (they don't contain numeric data)
                y.pop(0)
                y.pop(0)
                #y = rent time series data for specific metro
                y = pd.Series(y)
                #x1 = lagged moving average data for given params
                df_return1 = df_return[df_return['Metro'] == str(metro)]
                df_return1 = df_return1.reset_index(drop = True)
                x1 = lagged_moving_avg(df = df_return1, metro_code = metro, nlag = nlag, nma = nma)
                #y and x1 dataframe
                y_label = 'y_Rent'
                x_lagMA_label = 'x1_LaggedMA'
                df1 = pd.DataFrame()
                df1[y_label] = y
                df1[x_lagMA_label] = x1
                # Map the most recent quarter (1-4) to its first month.
                if mostrecentQ == 1:
                    currmonth = "01"
                elif mostrecentQ == 2:
                    currmonth = "04"
                elif mostrecentQ == 3:
                    currmonth = "07"
                else:
                    currmonth = "10"
                #convert index to datetime to run the regressions
                currpd = pd.to_datetime((str(mostrecentYYYY) + currmonth), format='%Y%m')
                # Quarterly index from 1990-01 to currpd, both shifted by a QuarterEnd offset.
                df1.index = pd.date_range(*(pd.to_datetime(['1990-01', currpd]) + pd.offsets.QuarterEnd()), freq='Q')
                #drop any rows that have missing observations
                df1 = df1.dropna()
                #df1.to_csv('//Nisfile01/x/Project - Real Estate Database/real_estate/odil/XandY.csv', index=True)
                # Stage 1: ARIMA of order (AR, 0, 0) on the rent series.
                reg = ARIMA(endog = df1[y_label], order = (AR, 0,0)).fit(trend = 'nc', disp = 0, tol=1e-20)
                resid_reg = reg.resid
                # Stage 2: OLS of the stage-1 residuals on the lagged moving average.
                reg2 = sm.OLS(resid_reg, df1[x_lagMA_label]).fit()
                resid_reg2 = reg2.resid
                dfrow = {
                    'metro': metro,
                    'num_lag': nlag,
                    'num_ma': nma,
                    'num_AR': AR,
                    'beta_x1_retmov': reg2.params[0],
                    'x1_se': reg2.bse[0],
                    'x1_tstat': reg2.tvalues[0],
                    'x1_pval': reg2.pvalues[0],
                    'r-squared': reg2.rsquared,
                    'reg_fstat':reg2.fvalue,
                    'fstat_pvalue': reg2.f_pvalue,
                    'durbin-watson': st.durbin_watson(reg2.resid),
                    'resid_var': resid_reg2.var(),
                }
                #create df for output called results
                for key in dfrow.keys():
                    # This is the line that raises the reported TypeError:
                    # list() is applied to a single (non-iterable) value.
                    results.loc[row, key] = list(dfrow[key])
                row = row + 1
Any help is very much appreciated.
P.S. Sorry for the messy code
The offending line is results.loc[row, key] = list(dfrow[key]).
You are trying to convert a single value, in this case a numpy.int64 object, to a list. I assume that what you're trying to do, and correct me if I am wrong, is create a singleton list with the int64 inside it. If that's what you want to do, you should use:
# Wrap the single value in a one-element list literal instead of calling
# list() on it (list() requires an iterable argument).
results.loc[row, key] = [dfrow[key]]

Categories

Resources