For loop to add column in multiple pandas dataframes - python

Good evening,
I will start by saying I am very early in my coding journey. Currently using a number of excel sheets from government data for a pandas project. Each of these sheets represents a year. I am attempting to add a column to each dataframe before I concat the entire list so I know which year the data came from at each point. Currently, the code looks like this:
'''
df10 = pd.read_excel(r'C:\Market research\national_m2010_dl.xls')
df11 = pd.read_excel(r'C:\Market research\national_m2011_dl.xls')
df12 = pd.read_excel(r'C:\Market research\national_m2012_dl.xls')
df13 = pd.read_excel(r'C:\Market research\national_m2013_dl.xls')
df14 = pd.read_excel(r'C:\Market research\national_m2014_dl.xlsx')
df15 = pd.read_excel(r'C:\Market research\national_m2015_dl.xlsx')
df16 = pd.read_excel(r'C:\Market research\national_m2016_dl.xlsx')
df17 = pd.read_excel(r'C:\Market research\national_m2017_dl.xlsx')
df18 = pd.read_excel(r'C:\Market research\national_m2018_dl.xlsx')
df19 = pd.read_excel(r'C:\Market research\national_m2019_dl.xlsx')
df10['Year'] = '2010'
df11['Year'] = '2011'
df12['Year'] = '2012'
df13['Year'] = '2013'
df14['Year'] = '2014'
df15['Year'] = '2015'
df16['Year'] = '2016'
df17['Year'] = '2017'
df18['Year'] = '2018'
df19['Year'] = '2019'
'''
However, I am sure there is a cleaner way to do this and was wondering if there might be a better way. I originally attempted a For loop similar to this:
'''
for num in range(10,20):
df+str(num)['Year'] = '20'+str(num)
'''
but I had no luck. Thoughts?

Try this:
df_dic = dict()
for n in range(10,20): #remember, the second number in range is exclusive
year = f"20{n}"
df = pd.read_excel(f'C:\Market research\national_m{year}_dl.xls')
df["Year"] = year
df_dic[year] = df

instead of using df10, df11,... you can use df[10], df[11],... which is make the code very easy.
for num in range(10,20):
df[num] = pd.read_excel(r'C:\Market research\national_m20'+str(num)+'_dl.xlsx')
df[num]['Year'] = '20'+str(num)

Related

Is there a way to store output dataframes and appending them to the last output in the same dataframe

I am trying to fetch data from API for 50 parcels. I want them to be in a single data frame. While running this loop the data frame is storing only the last parcel which is satisfying the loop condition. Is there any way to store all the previous outputs also in the same dataframe.
For e.g upon running this code it only returns the data frame for foreign id=50, I want the dataframe for all 1-50.
import requests
import pandas as pd
foreign=1
while (foreign <=50):
s1_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg'
s2_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg'
position = 101
foreign_n=str(foreign)
s1_time_series_url_p6 = s1_time_series_url_p6[:position] + foreign_n + s1_time_series_url_p6[position+1:]
s2_time_series_url_p6 = s2_time_series_url_p6[:position] + foreign_n + s2_time_series_url_p6[position+1:]
r_s1_time_series_p6 = requests.get(s1_time_series_url_p6)
r_s2_time_series_p6 = requests.get(s2_time_series_url_p6)
json_s1_time_series_p6 = r_s1_time_series_p6.json()
json_s2_time_series_p6 = r_s2_time_series_p6.json()
df_s1_time_series_p6 = pd.DataFrame(json_s1_time_series_p6['s1_time_series'])
df_s2_time_series_p6 = pd.DataFrame(json_s2_time_series_p6['s2_time_series'])
df_s2_time_series_p6.s2product_start_time=df_s2_time_series_p6.s2product_start_time.str[0:11]
df_s1_time_series_p6.s1product_end_time=df_s1_time_series_p6.s1product_end_time.str[0:11]
dfinal_p6 = df_s1_time_series_p6.merge(df_s2_time_series_p6, how='inner', left_on='s1product_end_time', right_on='s2product_start_time')
cols_p6 = ['parcel_foreign_id_x', 's1product_ron','parcel_foreign_id_y','s2product_ron']
dfinal_p6[cols_p6] = dfinal_p6[cols_p6].apply(pd.to_numeric, errors='coerce', axis=1)
dfinal_p6
The issue is resolved by first creating an empty data frame and then appending the outputs in the dataframe within the loop.
The updated code is as follows:
column_names = ["parcel_foreign_id_x", "s1product_end_time", "s1product_ron","cohvh_avg", "cohvv_avg", "vhvv_avg","parcel_foreign_id_y", "s2product_start_time", "s2product_ron", "ndvi_avg" ]
df = pd.DataFrame(columns = column_names)
foreign=1
while (foreign <=50):
s1_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg'
s2_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg'
position = 101
foreign_n=str(foreign)
s1_time_series_url_p6 = s1_time_series_url_p6[:position] + foreign_n + s1_time_series_url_p6[position+1:]
s2_time_series_url_p6 = s2_time_series_url_p6[:position] + foreign_n + s2_time_series_url_p6[position+1:]
r_s1_time_series_p6 = requests.get(s1_time_series_url_p6)
r_s2_time_series_p6 = requests.get(s2_time_series_url_p6)
json_s1_time_series_p6 = r_s1_time_series_p6.json()
json_s2_time_series_p6 = r_s2_time_series_p6.json()
df_s1_time_series_p6 = pd.DataFrame(json_s1_time_series_p6['s1_time_series'])
df_s2_time_series_p6 = pd.DataFrame(json_s2_time_series_p6['s2_time_series'])
df_s2_time_series_p6.s2product_start_time=df_s2_time_series_p6.s2product_start_time.str[0:11]
df_s1_time_series_p6.s1product_end_time=df_s1_time_series_p6.s1product_end_time.str[0:11]
dfinal_p6 = df_s1_time_series_p6.merge(df_s2_time_series_p6, how='inner', left_on='s1product_end_time', right_on='s2product_start_time')
cols_p6 = ['parcel_foreign_id_x', 's1product_ron','parcel_foreign_id_y','s2product_ron']
dfinal_p6[cols_p6] = dfinal_p6[cols_p6].apply(pd.to_numeric, errors='coerce', axis=1)
df = pd.concat([dfinal_p6,df],ignore_index = True)
foreign = foreign+1

Keyerror none of are in the [index]

im trying to write a Collaborative Filter for my movie recommendation engine. I need a mapping dictionary for fuzzy matching. I tried to write a movieId-title matrix but i got an exception like that:
Keyerror none of are in the [index] in enumerate line.
I checked matrices, id's etc. But i couldn't find solution. (I am using The Movies Dataset from kaggle).
movies_metadata = pd.DataFrame(pd.read_csv('movies_metadata.csv'))
ratings = pd.DataFrame(pd.read_csv('ratings_small.csv'))
movies = movies_metadata[['id','title']]
movies['movieId'] = movies['id']
movies = movies.drop('id',axis = 1)
ratings = ratings.drop('timestamp', axis=1)
user_nums = len(ratings.userId.unique())
item_nums = len(movies.movieId.unique())
total_count = user_nums * item_nums
rating_zero = total_count - ratings.shape[0]
ratings_count_temp = pd.DataFrame(ratings.groupby('rating').size(), columns = ['count'])
ratings_count = ratings_count_temp.append(pd.DataFrame({'count': rating_zero}, index=[0.0]),verify_integrity=True,).sort_index()
ratings_count['log_count'] = np.log(ratings_count['count'])
movies_count = pd.DataFrame(ratings.groupby('movieId').size(), columns = ['count'])
movies_count.head()
movies_count['count'].quantile(np.arange(1, 0.6, -0.05))
popular_movies = list(set(movies_count.query('count >= 50').index))
popular_ratings = ratings[ratings.movieId.isin(popular_movies)]
users_count = pd.DataFrame(popular_ratings.groupby('userId').size(), columns=['count'])
users_count['count'].quantile(np.arange(1, 0.5, -0.05))
active_users = list(set(users_count.query('count >= 50').index))
popular_active_ratings = popular_ratings[popular_ratings.userId.isin(active_users)]
movie_user_mat = popular_active_ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
movie_to_idx = {
movies: i for i, movies in
enumerate(list(movies.set_index('movieId').loc[movie_user_mat.index].title))
}

Python 'list.insert()' only saves the last result of calculation loop

I was making my automatic stock strategy yield calculation program with Python. Here's my code:
import FinanceDataReader as fdr
import numpy as np
# ...(more modules for python)
pd.options.display.float_format = '{:.5f}'.format
file_list = os.listdir('/home/sejahui/projects/stock_data_excel')
for i in range(20):
os.chdir('/home/sejahui/projects/stock_data_excel')
odd = file_list[i]
data = pd.read_excel('/home/sejahui/projects/stock_data_excel/'+str(odd))
def calMACD(data, short=5, long=25, signal=9):
data.sort_index()
data['MVA_25']=data['Close'].ewm(span=long, adjust=False).mean()
data['MVA_5']=data['Close'].ewm(span=short, adjust=False).mean()
data['MACD']=data['Close'].ewm(span=short, adjust=False).mean() - data['Close'].ewm(span=long, adjust=False).mean()
data['Signal']=data['MACD'].ewm(span=signal, adjust=False).mean( )
#data['Buy_sign']=(data['MACD']-data['Signal']) >=600
data['Buy_sign']=np.where(data['MACD']-data['Signal'] >=451, 'Buy' , 'Sell' )
#data['Target_1']=(data['Close']-data['Close'].shift(1))/data['Close'].shift(1)*100
#data['Target_1']=np.where(data['Buy_sign']=='Buy', (data['Change'])+1,1)
#data['Target_2']=np.where(data['Buy_sign']=='Sell', (data['Change'])+1,1)
#data['Real_world']= 1000000*data['Target_1']
#data['Real_world_2']= 1000000*data['Target_2']
#data['Condition'] = np.where(data['Real_world']<1000000, data['Real_world']-data['Real_world'].shift(-2),1)
##data['Condition_2'] = np.where(data['Real_world']<1000000, data['Target_1'].shift(-2),1)
#data['Moneyflow'] =
#plt.plot(data['Date'], data['Real_world'])
#data[data.Buy_sign !='Sell']
'''
data['Target_1']=np.where(data['Buy_sign']=='Buy', data['Change'],1)
data['Target_2']=np.where(data['Buy_sign']=='Sell', data ['Change'],1)
data['Yield']=np.where(data['Buy_sign']=='Sell', data['Target_1']/data['Target_2'],1 )
'''
'''
data['Result']=data['Target_1'].cumprod()
data['Result_2']=data['Target_2'].cumprod()
data['??????'] = data['Result'] - data['Result_2']
'''
return data
Adjusted = calMACD(data)
Adjusted.drop(['Change'], axis=1, inplace = True)
Filtered = Adjusted[Adjusted.Buy_sign!='Sell'].copy()
#print(Filtered)
#Filtered = (Adjusted.Buy_sign =='Buy') #(Adjusted.Condition = 1.0)
#Master = Adjusted.loc[Adjusted,['Date','Buy_sign','Target_1','Real_world',]]
#print(Adjusted)
def backtester(Filtered):
Filtered['Change'] = ((Filtered['Close'] - Filtered['Close'].shift(1)) / Filtered['Close'].shift(1))+1
#data['Target_1']=np.where(data['Buy_sign']=='Buy', (data['Change'])+1,1)
Filtered['Real_world'] = 1000000*Filtered['Change']
#Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Real_world'].shift(-2)-Filtered['Real_world'],1)
Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Change'].shift(-2),1)
#Filtered['Target_1'] = np.where(Filtered['Buy_sign']=='Buy', (Filtered['Change'])+1,1)
#Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Real_world'].shift(-2)-Filtered['Real_world'],1)
return Filtered
s = backtester(Filtered)
e = s[s.Condition!=1.00000]
x = e.dropna()
y = x['Condition']
list_1 = []
write_wb = Workbook()
write_ws = write_wb.create_sheet('MACD&Signal gap data sheet')
write_ws = write_wb.active
write_ws['A1'] = 'Name'
write_ws['B1'] = 'Profit'
try:
print(geometric_mean(y)*1000000*12)
except StatisticsError as e:
print ('Sell is empty':',odd)
else:
d = (geometric_mean(y)*1000000*12)
print(d,odd)
list_1.insert(i,d)
Print(list_1)
Here's the part where I'm troubling with:
s = backtester(Filtered)
e = s[s.Condition!=1.00000]
x = e.dropna()
y = x['Condition']
list_1 = []
try:
print(geometric_mean(y)*1000000*12)
except StatisticsError as e:
print ('Sell is empty':',odd)
else:
d = (geometric_mean(y)*1000000*12)
print(d)
list_1.insert(d)
print(list_1)
When I initiate the code where I am having problems, list only saves the last result of 'try, except, else' function. My intention was saving all the results. What change should I give to save all the results?
Here's the output of the list:
[11772769.197974786]
Your problem is that you are using insert instead of append and the main difference that insert takes a second argument for the position that you want to insert your element at and when none is provided it is 0 by default so you are consistently inserting at the same index resulting in a list with only the last element at the first position.
To fix that simply use append instead.
else:
d = (geometric_mean(y)*1000000*12)
print(d)
list_1.append(d)
You want to use append, not insert. see Python Data Structures
Change list_1.insert(d) to list_1.append(d)
The insert is defaulting to index 0 and just updating it each time.
Edit: Just noticed your answer is in the question title.

facing issue to add Data from dictionary into datafram

Facing trouble creating a function to store the response in columns,
Like passing the city name and as the response getting details of max_temp, min_temp, Pressure. which I want to store that in the new column.
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps
api_key = {key from openweather(free)}
mgr = owm.weather_manager()
data =[]
def get_weather(city):
observation = mgr.weather_at_place(city)
l = observation.weather
Wind_Speed = l.wind()['speed']
Temp = l.temperature('celsius')['temp']
Max_temp = l.temperature('celsius')['temp_max']
Min_temp = l.temperature('celsius')['temp_min']
#Heat_index = l.heat_index
Humidity = l.humidity
Pressure = l.pressure['press']
weather = {"City": city,"Wind_Speed" : Wind_Speed, "Temp":
Temp,"Max_temp":Max_temp, "Min_temp":Min_temp, "Humidity":Humidity,
"Pressure":Pressure}
return weather
for city in df2['City']:
get_weather(city)
df = df.append(data, True)
Want to add each weather details as column based on city name
Want to create one function which stores all the details in columns,
don't want to create separate functions.
Data Frame is like:
You can return a dictionary from your fucntion.
def get_weather(city):
observation = mgr.weather_at_place(city)
l = observation.weather
Wind_Speed = l.wind()['speed']
Temp = l.temperature('celsius')['temp']
Max_temp = l.temperature('celsius')['temp_max']
resp=dict()
resp['Wind_Speed ']=Wind_Speed
resp['Temp']=Temp
resp['Max_temp']=Max_temp
return resp
df["Wind_speed"] = df["city"].apply(lambda x: get_weather(x)['Wind_Speed'])
df["Temp"] = df["city"].apply(lambda x: get_weather(x)['Temp'])
df["Max_temp"] = df["city"].apply(lambda x: get_weather(x)['Max_temp'])

Pandas Dataframe Only Returning first Row of JSON Data

I'm working on a web scraping project, and have all the right code that returns me the json data in the format that I want if I used the #print command below, but when I got to run the same code except through Pandas Dataframe it only returns the first row of Data that I'm looking for. Just running the print, it returns the expected 17 rows of data I'm looking for. Dataframe to CSV gives me the first row only. Totally stumped! So grateful for anyone's help!
for item in response['body']:
DepartureDate = item['legs'][0][0]['departDate']
ReturnDate = item['legs'][1][0]['departDate']
Airline = item['legs'][0][0]['airline']['code']
Origin = item['legs'][0][0]['depart']
Destination = item['legs'][0][0]['destination']
OD = (Origin + Destination)
TrueBaseFare = item['breakdown']['baseFareAmount']
YQYR = item['breakdown']['fuelSurcharge']
TAX = item['breakdown']['totalTax']
TTL = item['breakdown']['totalFareAmount']
MARKEDUPTTL = item['breakdown']['totalCalculatedFareAmount']
MARKUP = ((MARKEDUPTTL - TTL) / (TTL)*100)
FBC = item['fareBasisCode']
#print(DepartureDate,ReturnDate,Airline,OD,TrueBaseFare,YQYR,TAX,TTL,MARKEDUPTTL,MARKUP,FBC)
MI = pd.DataFrame(
{'Dept': [DepartureDate],
'Ret': [ReturnDate],
'AirlineCode': [Airline],
'Routing': [OD],
'RealFare': [TrueBaseFare],
'Fuel': [YQYR],
'Taxes': [TAX],
'RealTotal': [TTL],
'AgencyTotal': [MARKEDUPTTL],
'Margin': [MARKUP],
'FareBasis': [FBC],
})
df = pd.DataFrame(MI)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
df.to_csv('MITest7.csv')
When you print all your values after the cycle, you will see that you get only the last values. To resolve this problem you need to create lists and put there your values.
Try this:
DepartureDate = []
ReturnDate = []
Airline = []
Origin = []
Destination = []
OD = []
TrueBaseFare = []
YQYR = []
TAX = []
TTL = []
MARKEDUPTTL = []
MARKUP = []
FBC = []
for item in response['body']:
DepartureDate.append(item['legs'][0][0]['departDate'])
ReturnDate.append(item['legs'][1][0]['departDate'])
Airline.append(item['legs'][0][0]['airline']['code'])
Origin.append(item['legs'][0][0]['depart'])
Destination.append(item['legs'][0][0]['destination'])
OD.append((Origin[-1] + Destination[-1]))
TrueBaseFare.append(item['breakdown']['baseFareAmount'])
YQYR.append(item['breakdown']['fuelSurcharge'])
TAX.append(item['breakdown']['totalTax'])
TTL.append(item['breakdown']['totalFareAmount'])
MARKEDUPTTL.append(item['breakdown']['totalCalculatedFareAmount'])
MARKUP.append(((MARKEDUPTTL[-1] - TTL[-1]) / (TTL[-1])*100))
FBC.append(item['fareBasisCode'])

Categories

Resources