Key Error: None of the [Index[Columns] are in the Columns - python

import pandas as pd
customer_master_loc = r"D:\LOC_RUN\LOC_2021\2021_SP_INPUT_FILES\loc_cust_master_new_rules_09072021.txt"
customer_master_df = pd.read_csv(customer_master_loc,sep = '|',error_bad_lines=False,dtype = str)
customer_master_df = customer_master_df[['CUSTOMER_NUM','CUSTOMER_NAME','CUSTOMER_NAME2','CUSTOMER_ADDR_LN1', \
'CUSTOMER_ADDR_LN2','CUSTOMER_ADDR_LN3','CUSTOMER_ADDR_LN4', \
'CUSTOMER_CITY','STATE_CD','CUSTOMER_ZIP_BASE','COUNTY','SALESBLOCK_CD', \
'ORGANIZATION_CD','LOSER_FLAG','SHARED_CUSTOMER_NUM','SHARED_CUSTOMER_FLAG','SHARED_CUSTOMER_ORGANIZATION_CD','WINNER_CUSTOMER_NUM','WINNER_CUSTOMER_FLAG','WINNER_CUSTOMER_ORGANIZATION_CD']]
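This KeyError usually means at least one requested column name does not match the file header exactly (stray whitespace, different casing, or a delimiter other than '|'). A quick hedged check, run right after the read_csv call and before the column selection above:

wanted = ['CUSTOMER_NUM','CUSTOMER_NAME','CUSTOMER_NAME2','CUSTOMER_ADDR_LN1',
          'CUSTOMER_ADDR_LN2','CUSTOMER_ADDR_LN3','CUSTOMER_ADDR_LN4',
          'CUSTOMER_CITY','STATE_CD','CUSTOMER_ZIP_BASE','COUNTY','SALESBLOCK_CD',
          'ORGANIZATION_CD','LOSER_FLAG','SHARED_CUSTOMER_NUM','SHARED_CUSTOMER_FLAG',
          'SHARED_CUSTOMER_ORGANIZATION_CD','WINNER_CUSTOMER_NUM','WINNER_CUSTOMER_FLAG',
          'WINNER_CUSTOMER_ORGANIZATION_CD']
# what pandas actually parsed from the pipe-delimited file
print(list(customer_master_df.columns))
# requested names that are not present -- these are what trigger the KeyError
print([c for c in wanted if c not in customer_master_df.columns])
# if the only difference is surrounding whitespace, stripping the header often fixes it:
# customer_master_df.columns = customer_master_df.columns.str.strip()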

Related

Extracting Nested List-Dictionaries to Pandas Series in a DataFrame

I have a pandas DataFrame that I extracted from a JSON file for breweries I'm interested in. Most of these columns are nested lists of dictionaries. However, two columns, 'hours' and 'memberships', are being problematic.
I'd like to extract the 'hours' column into 7 columns: 'Mon_Hours', 'Tue_Hours', ..., 'Sun_Hours'.
I have tried and tried to figure this out, but these two columns are proving challenging.
Here is a link to the initial data: https://www.coloradobrewerylist.com/wp-json/cbl_api/v1/locations/?location-type%5Bnin%5D=404,405&page_size=1000&page_token=1
and here is my code:
import requests
import re
import pandas as pd
import numpy as np
import csv
import json
from datetime import datetime
### get the data from the Colorado Brewery list
url = "https://www.coloradobrewerylist.com/wp-json/cbl_api/v1/locations/?location-type%5Bnin%5D=404,405&page_size=1000&page_token=1"
payload={}
headers = {}
response = requests.request("GET", url, headers=headers, data=payload)
data=response.json()
### convert results to table
pd.set_option('display.max_columns', None)
brewdf = pd.DataFrame.from_dict(data['results'])
#brewdf
############################################
#### CLEAN UP NESTED LIST-DICT COLUMNS #####
############################################
## cleanup dogs column
dogs = pd.json_normalize(brewdf['dogs'])
dogs2 = dogs.squeeze()
dogsdf = pd.json_normalize(dogs2)
dogsdf = dogsdf.drop(columns =['id','slug'])
dogsdf = dogsdf.rename(columns={'name':'dogs_allowed'})
#dogsdf
## cleanup parking column
parking = pd.json_normalize(brewdf['parking'])
parking = parking.rename(columns = {0:'Parking1',1:'Parking2',2:'Parking3'})
a = pd.json_normalize(parking['Parking1'])
b = pd.json_normalize(parking['Parking2'])
c = pd.json_normalize(parking['Parking3'])
parkcombo = pd.concat([a,b,c],ignore_index=True, axis=1)
parkcombo = parkcombo.rename(columns = {2:'P1',5:'P2',8:'P3'})
parkcombo['parking_type'] = parkcombo['P1'].map(str) + ',' + parkcombo['P2'].map(str) + ',' + parkcombo['P3'].map(str)
parkcombo['parking_type'] = parkcombo['parking_type'].str.replace(",nan",'')
parkdf = parkcombo['parking_type'].to_frame()
#parkdf
## cleanup food type column
food = pd.json_normalize(brewdf['food_type'])
food
food = food.rename(columns = {0:'Food1',1:'Food2',2:'Food3',3:'Food4',4:'Food5',5:'Food6'})
a = pd.json_normalize(food['Food1'])
b = pd.json_normalize(food['Food2'])
c = pd.json_normalize(food['Food3'])
d = pd.json_normalize(food['Food4'])
e = pd.json_normalize(food['Food5'])
f = pd.json_normalize(food['Food6'])
foodcombo = pd.concat([a,b,c,d,e,f],ignore_index=True, axis =1)
foodcombo
foodcombo = foodcombo.rename(columns = {2:'F1',5:'F2',8:'F3',11:'F4',14:'F5',17:'F6'})
foodcombo['food_type'] = foodcombo['F1'].map(str) + ',' + foodcombo['F2'].map(str) + ',' + foodcombo['F3'].map(str) + ',' + foodcombo['F4'].map(str)+ ',' + foodcombo['F5'].map(str) + ',' + foodcombo['F6'].map(str)
foodcombo['food_type'] = foodcombo['food_type'].str.replace(",nan",'')
fooddf = foodcombo['food_type'].to_frame()
#fooddf
## cleanup patio column
patio = pd.json_normalize(brewdf['patio'])
patio = patio.rename(columns = {0:'P1',1:'P2',2:'P3'})
a = pd.json_normalize(patio['P1'])
b = pd.json_normalize(patio['P2'])
c = pd.json_normalize(patio['P3'])
patiocombo = pd.concat([a,b,c],ignore_index=True, axis =1)
patiocombo
patiocombo = patiocombo.rename(columns = {2:'P1',5:'P2',8:'P3'})
patiocombo['patio_type'] = patiocombo['P1'].map(str) + ',' + patiocombo['P2'].map(str) + ',' + patiocombo['P3'].map(str)
patiocombo['patio_type'] = patiocombo['patio_type'].str.replace(",nan",'')
patiodf = patiocombo['patio_type'].to_frame()
#patiodf
## clean visitor type column
visitor = pd.json_normalize(brewdf['visitors'])
visitor
visitor = visitor.rename(columns = {0:'V1',1:'V2',2:'V3'})
a = pd.json_normalize(visitor['V1'])
b = pd.json_normalize(visitor['V2'])
c = pd.json_normalize(visitor['V3'])
visitorcombo = pd.concat([a,b,c],ignore_index=True, axis =1)
visitorcombo
visitorcombo = visitorcombo.rename(columns = {2:'V1',5:'V2',8:'V3'})
visitorcombo['visitor_type'] = visitorcombo['V1'].map(str) + ',' + visitorcombo['V2'].map(str) + ',' + visitorcombo['V3'].map(str)
visitorcombo['visitor_type'] = visitorcombo['visitor_type'].str.replace(",nan",'')
visitordf = visitorcombo['visitor_type'].to_frame()
#visitordf
## clean tour type column
tour = pd.json_normalize(brewdf['tour_type'])
tour
tour = tour.rename(columns = {0:'T1',1:'T2',2:'T3',3:'T4'})
a = pd.json_normalize(tour['T1'])
b = pd.json_normalize(tour['T2'])
c = pd.json_normalize(tour['T3'])
d = pd.json_normalize(tour['T4'])
tourcombo = pd.concat([a,b,c,d],ignore_index=True, axis =1)
tourcombo
tourcombo = tourcombo.rename(columns = {2:'T1',5:'T2',8:'T3',11:'T4'})
tourcombo['tour_type'] = tourcombo['T1'].map(str) + ',' + tourcombo['T2'].map(str) + ',' + tourcombo['T3'].map(str) + ','+ tourcombo['T4'].map(str)
tourcombo['tour_type'] = tourcombo['tour_type'].str.replace(",nan",'')
tourdf = tourcombo['tour_type'].to_frame()
#tourdf
## clean other drinks column
odrink = pd.json_normalize(brewdf['otherdrinks_type'])
odrink
odrink = odrink.rename(columns = {0:'O1',1:'O2',2:'O3',3:'O4',4:'O5',5:'O6',6:'O7',7:'O8',8:'O9'})
a = pd.json_normalize(odrink['O1'])
b = pd.json_normalize(odrink['O2'])
c = pd.json_normalize(odrink['O3'])
d = pd.json_normalize(odrink['O4'])
e = pd.json_normalize(odrink['O5'])
f = pd.json_normalize(odrink['O6'])
g = pd.json_normalize(odrink['O7'])
h = pd.json_normalize(odrink['O8'])
i = pd.json_normalize(odrink['O9'])
odrinkcombo = pd.concat([a,b,c,d,e,f,g,h,i],ignore_index=True, axis =1)
odrinkcombo
odrinkcombo = odrinkcombo.rename(columns = {2:'O1',5:'O2',8:'O3',11:'O4',14:'O5',17:'O6',20:'O7',23:'O8',26:'O9'})
odrinkcombo['odrink_type'] = odrinkcombo['O1'].map(str) + ',' + odrinkcombo['O2'].map(str) + ',' + odrinkcombo['O3'].map(str) + ','+ odrinkcombo['O4'].map(str) + ','+ odrinkcombo['O5'].map(str)+ ','+ odrinkcombo['O6'].map(str)+ ','+ odrinkcombo['O7'].map(str)+','+ odrinkcombo['O8'].map(str)+','+ odrinkcombo['O9'].map(str)
odrinkcombo['odrink_type'] = odrinkcombo['odrink_type'].str.replace(",nan",'')
odrinkdf = odrinkcombo['odrink_type'].to_frame()
#odrinkdf
## clean to-go column
togo = pd.json_normalize(brewdf['togo_type'])
togo
togo = togo.rename(columns = {0:'TG1',1:'TG2',2:'TG3',3:'TG4',4:'TG5'})
a = pd.json_normalize(togo['TG1'])
b = pd.json_normalize(togo['TG2'])
c = pd.json_normalize(togo['TG3'])
d = pd.json_normalize(togo['TG4'])
e = pd.json_normalize(togo['TG5'])
togocombo = pd.concat([a,b,c,d,e],ignore_index=True, axis =1)
togocombo
togocombo = togocombo.rename(columns = {2:'TG1',5:'TG2',8:'TG3',11:'TG4',14:'TG5'})
togocombo['togo_type'] = togocombo['TG1'].map(str) + ',' + togocombo['TG2'].map(str) + ',' + togocombo['TG3'].map(str) + ','+ togocombo['TG4'].map(str) + ','+ togocombo['TG5'].map(str)
togocombo['togo_type'] = togocombo['togo_type'].str.replace(",nan",'')
togodf = togocombo['togo_type'].to_frame()
#togodf
## clean merch column
merch = pd.json_normalize(brewdf['merch_type'])
merch
merch = merch.rename(columns = {0:'M1',1:'M2',2:'M3',3:'M4',4:'M5',5:'M6',6:'M7',7:'M8',8:'M9',9:'M10',10:'M11',11:'M12'})
a = pd.json_normalize(merch['M1'])
b = pd.json_normalize(merch['M2'])
c = pd.json_normalize(merch['M3'])
d = pd.json_normalize(merch['M4'])
e = pd.json_normalize(merch['M5'])
f = pd.json_normalize(merch['M6'])
g = pd.json_normalize(merch['M7'])
h = pd.json_normalize(merch['M8'])
i = pd.json_normalize(merch['M9'])
j = pd.json_normalize(merch['M10'])
k = pd.json_normalize(merch['M11'])
l = pd.json_normalize(merch['M12'])
merchcombo = pd.concat([a,b,c,d,e,f,g,h,i,j,k,l],ignore_index=True, axis =1)
merchcombo
merchcombo = merchcombo.rename(columns = {2:'M1',5:'M2',8:'M3',11:'M4',14:'M5',17:'M6',20:'M7',23:'M8',26:'M9',29:'M10',32:'M11',35:'M12'})
merchcombo['merch_type'] = (merchcombo['M1'].map(str) + ',' + merchcombo['M2'].map(str) + ',' + merchcombo['M3'].map(str) + ','+ merchcombo['M4'].map(str) + ','
+ merchcombo['M5'].map(str) + ',' + merchcombo['M6'].map(str)+ ',' + merchcombo['M7'].map(str) + ',' + merchcombo['M8'].map(str)
+ ',' + merchcombo['M9'].map(str)+ ',' + merchcombo['M10'].map(str)+ ',' + merchcombo['M11'].map(str)+ ',' + merchcombo['M12'].map(str))
merchcombo['merch_type'] = merchcombo['merch_type'].str.replace(",nan",'')
merchdf = merchcombo['merch_type'].to_frame()
#merchdf
### clean description column
brewdf['description'] = brewdf['description'].str.replace(r'<[^<>]*>', '', regex=True)
#brewdf
### replace nan with null
brewdf = brewdf.replace('nan',np.nan)
brewdf = brewdf.replace('None',np.nan)
brewdf
cleanedbrewdf = brewdf.drop(columns = {'food_type','tour_type','otherdrinks_type','articles','merch_type','togo_type','patio','visitors','parking','dogs'})
mergedbrewdf = pd.concat([cleanedbrewdf,dogsdf,parkdf,fooddf,patiodf,
visitordf,tourdf,odrinkdf,togodf,merchdf,],ignore_index=False,axis=1)
mergedbrewdf
### remove non-existing
finalbrewdf = mergedbrewdf.loc[(mergedbrewdf['lon'].notnull())].copy()
finalbrewdf['lon'] = finalbrewdf['lon'].astype(float)
finalbrewdf['lat'] = finalbrewdf['lat'].astype(float)
finalbrewdf
Can someone please point me in the right direction for the hours and memberships columns? Also, is there a more efficient way to work through these different columns? They have different nested list-dict lengths, which I thought might prevent me from writing a single function.
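One hedged possibility for the 'hours' column, assuming each cell holds a single dict keyed by day (the real key names come from the API, so the 'mon'/'tue' keys below are placeholders to adjust; if the cells are one-element lists, squeeze them first as done for 'dogs'):

hours_df = pd.json_normalize(brewdf['hours'])
day_map = {'mon': 'Mon_Hours', 'tue': 'Tue_Hours', 'wed': 'Wed_Hours', 'thu': 'Thu_Hours',
           'fri': 'Fri_Hours', 'sat': 'Sat_Hours', 'sun': 'Sun_Hours'}
hours_df = hours_df.rename(columns=day_map)

For the repetitive list-of-dict columns, one generic helper could replace the per-column blocks above, assuming each cell is a list of dicts carrying a 'name' key as the dogs/parking handling suggests (a sketch, not tested against the live API):

def join_names(cell, key='name'):
    # flatten one cell: list of dicts -> comma-separated names
    if isinstance(cell, list):
        return ','.join(str(d.get(key)) for d in cell if isinstance(d, dict))
    return np.nan

for col in ['parking', 'food_type', 'patio', 'visitors', 'tour_type',
            'otherdrinks_type', 'togo_type', 'merch_type']:
    brewdf[col + '_flat'] = brewdf[col].apply(join_names)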

Split json file into multiple csv files depending on date?

I am trying to split up a JSON file from Alpha Vantage's API into separate files depending on the date. I'm also trying to reformat the file to have blank values in the gaps where dates are missing. The following code is what I have come up with, but it gives me "TypeError: 'list' object is not callable". I'm fairly new to Python and pandas, so I'm sure there is a better way to go about this.
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
from pandas import DataFrame
import json
symbol = "MSFT"
symbol_list = symbol.split(",")
def num_el(list):
    count = 0
    for element in list:
        count += 1
    return count

def csv_make(sy, dar, dat):
    csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
    csv_file.write(dat)
    csv_file.close()

i = 0
x = -1
n = num_el(symbol_list)
while i < n:
    namesym = symbol_list[x]
    ticker = namesym
    api_key = 'APIKEYHERE'
    url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&outputsize=full&interval=1min&apikey={api_key}'
    data = requests.get(url)
    dsf = data.json()
    daf = pd.DataFrame(dsf['Time Series (1min)'])
    dxf: DataFrame = daf.T
    dxf.index.name = 'time'
    dxf.reset_index(inplace=True)
    dxf['time'] = pd.to_datetime(dxf['time'])
    dxf['minute'] = dxf['time'].dt.time
    dxf['day'] = dxf['time'].dt.day
    dxf['date'] = dxf['time'].dt.date
    agg = dxf.groupby([dxf['day']])
    length1 = dxf.groupby([dxf['day']]).size()
    length = pd.DataFrame(length1)
    length.index.name = 'day'
    length.reset_index(inplace=True)
    length_sum = length[0].sum()
    v = 0
    d = length_sum
    b = len(length)
    x2 = length_sum
    while v < b:
        a = length[0][v]
        x2 -= length[0][v]
        xd = agg.get_group(length['day'][v])
        date = xd['date'][x2]
        max_dt = parser.parse(str(max(xd['minute'])))
        min_dt = parser.parse(str(min(xd['minute'])))
        dt_range = []
        while min_dt <= max_dt:
            dt_range.append(min_dt.strftime("%H:%M:%S"))
            min_dt += timedelta(seconds=60)
        complete_df = pd.DataFrame({'minute': dt_range})
        xy = complete_df.astype('str')
        yx = xd.astype('str')
        dasf = xy.merge(yx, how='left', on='minute')
        dasf['ev'] = np.where(dasf['1. open'].notnull(), 'False', 'True')
        time = []
        open = []
        high = []
        low = []
        close = []
        volume = []
        empty_value = []
        for ib in range(len(dasf)):
            time.append(dasf['minute'][ib])
            open.append(dasf['1. open'][ib])
            high.append(dasf['2. high'][ib])
            low.append(dasf['3. low'][ib])
            close.append(dasf['4. close'][ib])
            volume.append(dasf['5. volume'][ib])
            empty_value.append(dasf['ev'][ib])
        time_df = pd.DataFrame(time).rename(columns={0: 'Time'})
        open_df = pd.DataFrame(open).rename(columns={0: 'Open'})
        high_df = pd.DataFrame(high).rename(columns={0: 'High'})
        low_df = pd.DataFrame(low).rename(columns={0: 'Low'})
        close_df = pd.DataFrame(close).rename(columns={0: 'Close'})
        volume_df = pd.DataFrame(volume).rename(columns={0: 'Volume'})
        empty_value_df = pd.DataFrame(empty_value).rename(columns={0: 'Empty Value'})
        frames = [time_df, open_df, high_df, low_df, close_df, volume_df, empty_value_df]
        df = pd.concat(frames, axis=1, join='inner')
        df = df.set_index('Time')
        ad = df.to_csv()
        csv_make(namesym, date, ad)
        v += 1
    i += 1
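For what it's worth, the "TypeError: 'list' object is not callable" is most likely coming from the line open = [] inside the loop: it rebinds the built-in open() that csv_make() relies on, so the next csv_make() call ends up trying to call a list. A minimal sketch of the fix is simply to rename that list (the names below are arbitrary, not from the original), applied to the slice of the inner loop where the lists are built:

open_prices = []                      # was: open = [], which shadowed the built-in open()
for ib in range(len(dasf)):
    open_prices.append(dasf['1. open'][ib])
open_df = pd.DataFrame(open_prices).rename(columns={0: 'Open'})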

Python method not recognized

I am learning Python, so I have worked on this one thing for a long time. I still can't find the answer.
The interpreter says there is no method called _set_icon().
Code:
import pyodbc as db
import pandas as pd
import Globals
class BatchNodeData(object):
    """support batch node of the tree. Contains what it needs to do that"""

    def __init__(self):
        pass

    def _set_icon():
        sql_conn = db.connect(Globals.SQL_CONN_STRING)
        b_query = " \
            SELECT top 1 * \
            FROM dbo.ETLBatchRun a \
            Where b.BatchID = " + str(batchid) + \
            "Order by a.StatusDT desc"
        df_icon = pd.read_sql(b_query, sql_conn)
        if not df_icon.empty:
            self.last_status = df_icon['StatusID'].iloc[0]

    def _get_icon_index():
        switcher = {
            1: 2,
            2: 2,
            3: 3,
            4: 4
        }
        switcher_selected = {
            1: 7,
            2: 7,
            3: 8,
            4: 8
        }
        if selected:
            return switcher_selected.get(statusid, 0)  # default 0 (yellow bar)
        else:
            return switcher.get(statusid, 0)  # default 0 (yellow bar)

    def __init__(self, batchid):
        self.batch_id = None
        self.batch_name = None
        self.critical = None
        self.node_icon_index = None
        self.last_status = None
        self.selected = False
        self.running = False
        sql_conn = db.connect(Globals.SQL_CONN_STRING)
        b_query = " \
            select b.BatchID \
            , b.BatchName \
            , c.AttributeValue as Critical \
            , noRun.AttributeValue as noRun \
            from dbo.ETLBatch b (nolock) \
            left join dbo.etlbatchattribute (nolock) c \
            on c.batchid = b.batchid \
            and c.AttributeName = 'Critical' \
            and c.AttributeValue = '1' \
            left join dbo.etlbatchattribute (nolock) noRun \
            on noRun.batchid = b.batchid \
            and noRun.AttributeName = 'NotRunnableInETLMonitor' \
            and noRun.AttributeValue = '1' \
            Where b.BatchID = " + str(batchid)
        df_batch = pd.read_sql(b_query, sql_conn)
        for index, row in df_batch.iterrows():
            batch_id = row['BatchID']
            batch_name = row['BatchName']
            critical = row['Critical']
        _set_icon()
        self.node_icon_index = _get_icon_index()
Since you've declared _set_icon() as a method bound to the class, you should be able to call it as:
BatchNodeData._set_icon()
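A hedged alternative, assuming the intent is for _set_icon() to update the instance being constructed: define it with self (and pass in the batch id) and call it through the instance. This is a sketch, not the original author's code; note the table alias in the WHERE clause is also guessed to be a rather than b:

    def _set_icon(self, batchid):
        sql_conn = db.connect(Globals.SQL_CONN_STRING)
        b_query = ("SELECT top 1 * FROM dbo.ETLBatchRun a "
                   "WHERE a.BatchID = " + str(batchid) +
                   " ORDER BY a.StatusDT desc")
        df_icon = pd.read_sql(b_query, sql_conn)
        if not df_icon.empty:
            self.last_status = df_icon['StatusID'].iloc[0]

    # and at the end of __init__(self, batchid):
    #     self._set_icon(batchid)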

code showing empty dataframe

I wrote the code below for my project, but the DataFrame df shows empty records. I want to know where I am going wrong in the code:
import urllib
from urllib2 import *
import pandas as pd
def urlmake(req):
    requests = [req]
    for parms in requests:
        url = 'http://localhost:8983/solr/data/select?indent=on&' + urllib.urlencode(parms)
        connection = urlopen(url)
        response = eval(connection.read())
        t = response['response']['numFound']
        req2 = req['q'][13:17]
        print(req2)
        if(req2 == 'AXIS'):
            print('true')
            for i in range(0,t):
                t1 = float((response['response']['docs'][i]['message']).split(" ")[1])
                #print(t1)
                t2 = response['response']['docs'][i]['customer_id']
                #print(t2)
                df = df.append(pd.DataFrame(t2,t1))

ba_query = [{'q':'sender_name:*AXIS* AND message:*Avbl Lmt*','start':0,'rows':211,'wt':'json'}]
for i in range(0,len(ba_query)):
    urlmake(ba_query[i])
I am getting the error:
UnboundLocalError: local variable 'df' referenced before assignment
import urllib
from urllib2 import *
import pandas as pd
df = pd.DataFrame(columns=['Customer_id','Spent'])
def urlmake(req):
    requests = [req]
    for parms in requests:
        url = 'http://localhost:8983/solr/data/select?indent=on&' + urllib.urlencode(parms)
        connection = urlopen(url)
        response = eval(connection.read())
        t = response['response']['numFound']
        req2 = req['q'][13:17]
        print(req2)
        if(req2 == 'AXIS'):
            print('true')
            for i in range(0,t):
                t1 = float((response['response']['docs'][i]['message']).split(" ")[1])
                #print(t1)
                t2 = response['response']['docs'][i]['customer_id']
                #print(t2)
                df = df.append({'Customer_id':t2, 'Spent':t1}, ignore_index=True) # HERE
See the comment in the code.
Here's an MCVE of how your code should look:
import pandas as pd
import numpy as np
df = pd.DataFrame()
for iteration in range(0, 5):
    dummy_data = np.random.rand(3, 3)
    df = df.append(pd.DataFrame(dummy_data))
df.columns = ['a', 'b', 'c']
New MCVE:
import pandas as pd
import numpy as np
def myfunc():
    df = pd.DataFrame()
    for iteration in range(0, 5):
        dummy_data = np.random.rand(3, 3)
        df = df.append(pd.DataFrame(dummy_data))
    df.columns = ['a', 'b', 'c']
    return df
df2 = myfunc()
print(df2)
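As a side note, DataFrame.append was deprecated and later removed in recent pandas releases, so a version of the same MCVE that collects the pieces in a list and concatenates once is likely to age better (a sketch, not from the original answer):

import pandas as pd
import numpy as np

def myfunc():
    pieces = []
    for iteration in range(0, 5):
        # build each chunk, then combine them with a single concat call
        pieces.append(pd.DataFrame(np.random.rand(3, 3)))
    df = pd.concat(pieces, ignore_index=True)
    df.columns = ['a', 'b', 'c']
    return df

df2 = myfunc()
print(df2)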

Python Pandas NameError: name 'data' is not defined

I'm new to coding. When I attempt to run this it says:
NameError: name 'data' is not defined.
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import datetime
import json
from bs4 import BeautifulSoup
import requests
import time
def fetchCryptoClose(fsym, tsym):
    # function fetches the close-price time-series from cryptocompare.com
    # it may ignore USDT coin (due to near-zero pricing)
    # daily sampled
    cols = ['date', 'timestamp', fsym]
    lst = ['time', 'open', 'high', 'low', 'close']
    timestamp_today = datetime.today().timestamp()
    curr_timestamp = timestamp_today
    for j in range(2):
        df = pd.DataFrame(columns=cols)
        url = "https://min-api.cryptocompare.com/data/histoday?fsym=" + fsym + \
              "&tsym=" + tsym + "&toTs=" + str(int(curr_timestamp)) + "&limit=3"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        dic = json.loads(soup.prettify())
        for i in range(1, 4):
            tmp = []
            for e in enumerate(lst):
                x = e[0]
                y = dic['Data'][i][e[1]]
                if(x == 0):
                    tmp.append(str(timestamp2date(y)))
                tmp.append(y)
            if(np.sum(tmp[-4::]) > 0):  # remove for USDT
                tmp = np.array(tmp)
                tmp = tmp[[0,1,4]]  # filter solely for close prices
                df.loc[len(df)] = np.array(tmp)
        # ensure a correct date format
        df.index = pd.to_datetime(df.date, format="%Y-%m-%d")
        df.drop('date', axis=1, inplace=True)
        curr_timestamp = int(df.ix[0][0])
        if(j == 0):
            df0 = df.copy()
        else:
            data = pd.concat([df, df0], axis=0)
            data.drop("timestamp", axis=1, inplace=True)
    return data  # DataFrame

# N-Cryptocurrency Portfolio (tickers)
fsym = ['BTC', 'ETH', 'XRP', 'LTC', 'DASH', 'XMR', 'ETC', 'MAID', 'XEM', 'REP']
# vs.
tsym = 'USD'

for e in enumerate(fsym):
    print(e[0], e[1])
    if(e[0] == 0):
        try:
            data = fetchCryptoClose(e[1], tsym)
        except:
            pass
    else:
        try:
            data = data.join(fetchCryptoClose(e[1], tsym))
        except:
            pass

# ensure values to be floats
# save portfolio to a file (HDF5 file format)
store = pd.HDFStore('portfolio2.h5')
store['data'] = data
store.close()

# read in your portfolio from a file
df = pd.read_hdf('portfolio2.h5', 'data')
print(df)
Don't use try-except-pass, because it will silence all your exceptions and you might never actually create `data`.
Replace this code:
for e in enumerate(fsym):
    print(e[0], e[1])
    if(e[0] == 0):
        try:
            data = fetchCryptoClose(e[1], tsym)
        except:
            pass
    else:
        try:
            data = data.join(fetchCryptoClose(e[1], tsym))
        except:
            pass
with this:
for e in enumerate(fsym):
    print(e[0], e[1])
    if(e[0] == 0):
        data = fetchCryptoClose(e[1], tsym)
    else:
        data = data.join(fetchCryptoClose(e[1], tsym))
and see where your real exceptions are.
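If some protection around the network calls is still wanted, catching the exception and printing it (instead of a bare except: pass) keeps the failure visible while the loop moves on; a sketch, not part of the original answer:

for e in enumerate(fsym):
    print(e[0], e[1])
    try:
        if(e[0] == 0):
            data = fetchCryptoClose(e[1], tsym)
        else:
            data = data.join(fetchCryptoClose(e[1], tsym))
    except Exception as exc:
        # report which symbol failed and why, rather than silently passing
        print('failed to fetch', e[1], '->', exc)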
