I'm trying to write a collaborative filter for my movie recommendation engine, and I need a mapping dictionary for fuzzy matching. I tried to build a movieId-title matrix, but I got an exception like this:
KeyError: None of [...] are in the [index], raised at the enumerate line.
I checked the matrices, the ids, etc., but I couldn't find a solution. (I am using The Movies Dataset from Kaggle.)
import pandas as pd
import numpy as np

movies_metadata = pd.read_csv('movies_metadata.csv')
ratings = pd.read_csv('ratings_small.csv')
movies = movies_metadata[['id', 'title']]
movies = movies.rename(columns={'id': 'movieId'})
ratings = ratings.drop('timestamp', axis=1)
user_nums = len(ratings.userId.unique())
item_nums = len(movies.movieId.unique())
total_count = user_nums * item_nums
rating_zero = total_count - ratings.shape[0]
ratings_count_temp = pd.DataFrame(ratings.groupby('rating').size(), columns=['count'])
ratings_count = ratings_count_temp.append(pd.DataFrame({'count': rating_zero}, index=[0.0]), verify_integrity=True).sort_index()
ratings_count['log_count'] = np.log(ratings_count['count'])
movies_count = pd.DataFrame(ratings.groupby('movieId').size(), columns = ['count'])
movies_count.head()
movies_count['count'].quantile(np.arange(1, 0.6, -0.05))
popular_movies = list(set(movies_count.query('count >= 50').index))
popular_ratings = ratings[ratings.movieId.isin(popular_movies)]
users_count = pd.DataFrame(popular_ratings.groupby('userId').size(), columns=['count'])
users_count['count'].quantile(np.arange(1, 0.5, -0.05))
active_users = list(set(users_count.query('count >= 50').index))
popular_active_ratings = popular_ratings[popular_ratings.userId.isin(active_users)]
movie_user_mat = popular_active_ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
movie_to_idx = {
    title: i for i, title in
    enumerate(list(movies.set_index('movieId').loc[movie_user_mat.index].title))
}
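That KeyError from .loc means none of the movieId values in movie_user_mat.index were found in the movies index. A quick diagnostic sketch, under one assumption worth checking with this dataset: movies_metadata reads id in as strings (the column contains a few malformed rows), while ratings.movieId is numeric, so the lookups can never match until the dtypes agree:

# How many of the matrix's movieIds actually exist in movies?
# pd.to_numeric coerces the string ids (malformed rows become NaN).
found = movie_user_mat.index.isin(pd.to_numeric(movies['movieId'], errors='coerce'))
print(f'{found.sum()} of {len(found)} matrix ids present in movies')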
I am trying to fetch data from an API for 50 parcels and want the results in a single data frame. While running this loop, the data frame keeps only the last parcel that satisfies the loop condition. Is there any way to keep all the previous outputs in the same dataframe as well?
For example, running this code only returns the data frame for foreign id = 50; I want the dataframe for all ids 1-50.
import requests
import pandas as pd
foreign = 1
while foreign <= 50:
    s1_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg'
    s2_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg'
    position = 101
    foreign_n = str(foreign)
    s1_time_series_url_p6 = s1_time_series_url_p6[:position] + foreign_n + s1_time_series_url_p6[position+1:]
    s2_time_series_url_p6 = s2_time_series_url_p6[:position] + foreign_n + s2_time_series_url_p6[position+1:]
    r_s1_time_series_p6 = requests.get(s1_time_series_url_p6)
    r_s2_time_series_p6 = requests.get(s2_time_series_url_p6)
    json_s1_time_series_p6 = r_s1_time_series_p6.json()
    json_s2_time_series_p6 = r_s2_time_series_p6.json()
    df_s1_time_series_p6 = pd.DataFrame(json_s1_time_series_p6['s1_time_series'])
    df_s2_time_series_p6 = pd.DataFrame(json_s2_time_series_p6['s2_time_series'])
    df_s2_time_series_p6.s2product_start_time = df_s2_time_series_p6.s2product_start_time.str[0:11]
    df_s1_time_series_p6.s1product_end_time = df_s1_time_series_p6.s1product_end_time.str[0:11]
    dfinal_p6 = df_s1_time_series_p6.merge(df_s2_time_series_p6, how='inner', left_on='s1product_end_time', right_on='s2product_start_time')
    cols_p6 = ['parcel_foreign_id_x', 's1product_ron', 'parcel_foreign_id_y', 's2product_ron']
    dfinal_p6[cols_p6] = dfinal_p6[cols_p6].apply(pd.to_numeric, errors='coerce', axis=1)
    foreign = foreign + 1
dfinal_p6
The issue is resolved by first creating an empty data frame and then appending each iteration's output to it within the loop.
The updated code is as follows:
column_names = ["parcel_foreign_id_x", "s1product_end_time", "s1product_ron","cohvh_avg", "cohvv_avg", "vhvv_avg","parcel_foreign_id_y", "s2product_start_time", "s2product_ron", "ndvi_avg" ]
df = pd.DataFrame(columns = column_names)
foreign = 1
while foreign <= 50:
    s1_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg'
    s2_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg'
    position = 101
    foreign_n = str(foreign)
    s1_time_series_url_p6 = s1_time_series_url_p6[:position] + foreign_n + s1_time_series_url_p6[position+1:]
    s2_time_series_url_p6 = s2_time_series_url_p6[:position] + foreign_n + s2_time_series_url_p6[position+1:]
    r_s1_time_series_p6 = requests.get(s1_time_series_url_p6)
    r_s2_time_series_p6 = requests.get(s2_time_series_url_p6)
    json_s1_time_series_p6 = r_s1_time_series_p6.json()
    json_s2_time_series_p6 = r_s2_time_series_p6.json()
    df_s1_time_series_p6 = pd.DataFrame(json_s1_time_series_p6['s1_time_series'])
    df_s2_time_series_p6 = pd.DataFrame(json_s2_time_series_p6['s2_time_series'])
    df_s2_time_series_p6.s2product_start_time = df_s2_time_series_p6.s2product_start_time.str[0:11]
    df_s1_time_series_p6.s1product_end_time = df_s1_time_series_p6.s1product_end_time.str[0:11]
    dfinal_p6 = df_s1_time_series_p6.merge(df_s2_time_series_p6, how='inner', left_on='s1product_end_time', right_on='s2product_start_time')
    cols_p6 = ['parcel_foreign_id_x', 's1product_ron', 'parcel_foreign_id_y', 's2product_ron']
    dfinal_p6[cols_p6] = dfinal_p6[cols_p6].apply(pd.to_numeric, errors='coerce', axis=1)
    df = pd.concat([dfinal_p6, df], ignore_index=True)
    foreign = foreign + 1
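As a side note, repeatedly concatenating into df copies the whole frame on every pass. A slightly cheaper pattern (a sketch, not the poster's code) is to collect the per-parcel frames in a list and concatenate once after the loop:

frames = []
for foreign in range(1, 51):
    # ... build dfinal_p6 for this parcel exactly as in the loop above ...
    frames.append(dfinal_p6)
df = pd.concat(frames, ignore_index=True)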
I'm working on a web scraping project and have code that returns the JSON data in the format I want if I use the #print command below, but when I run the same code through a pandas DataFrame it only returns the first row of the data I'm looking for. Just running the print returns the expected 17 rows. DataFrame to CSV gives me the first row only. Totally stumped! So grateful for anyone's help!
for item in response['body']:
    DepartureDate = item['legs'][0][0]['departDate']
    ReturnDate = item['legs'][1][0]['departDate']
    Airline = item['legs'][0][0]['airline']['code']
    Origin = item['legs'][0][0]['depart']
    Destination = item['legs'][0][0]['destination']
    OD = (Origin + Destination)
    TrueBaseFare = item['breakdown']['baseFareAmount']
    YQYR = item['breakdown']['fuelSurcharge']
    TAX = item['breakdown']['totalTax']
    TTL = item['breakdown']['totalFareAmount']
    MARKEDUPTTL = item['breakdown']['totalCalculatedFareAmount']
    MARKUP = ((MARKEDUPTTL - TTL) / (TTL) * 100)
    FBC = item['fareBasisCode']
    #print(DepartureDate,ReturnDate,Airline,OD,TrueBaseFare,YQYR,TAX,TTL,MARKEDUPTTL,MARKUP,FBC)
    MI = pd.DataFrame(
        {'Dept': [DepartureDate],
         'Ret': [ReturnDate],
         'AirlineCode': [Airline],
         'Routing': [OD],
         'RealFare': [TrueBaseFare],
         'Fuel': [YQYR],
         'Taxes': [TAX],
         'RealTotal': [TTL],
         'AgencyTotal': [MARKEDUPTTL],
         'Margin': [MARKUP],
         'FareBasis': [FBC],
         })
df = pd.DataFrame(MI)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
df.to_csv('MITest7.csv')
When you print all your values after the loop, you will see that you only get the last ones. To solve this, create lists and append your values to them.
Try this:
DepartureDate = []
ReturnDate = []
Airline = []
Origin = []
Destination = []
OD = []
TrueBaseFare = []
YQYR = []
TAX = []
TTL = []
MARKEDUPTTL = []
MARKUP = []
FBC = []
for item in response['body']:
    DepartureDate.append(item['legs'][0][0]['departDate'])
    ReturnDate.append(item['legs'][1][0]['departDate'])
    Airline.append(item['legs'][0][0]['airline']['code'])
    Origin.append(item['legs'][0][0]['depart'])
    Destination.append(item['legs'][0][0]['destination'])
    OD.append((Origin[-1] + Destination[-1]))
    TrueBaseFare.append(item['breakdown']['baseFareAmount'])
    YQYR.append(item['breakdown']['fuelSurcharge'])
    TAX.append(item['breakdown']['totalTax'])
    TTL.append(item['breakdown']['totalFareAmount'])
    MARKEDUPTTL.append(item['breakdown']['totalCalculatedFareAmount'])
    MARKUP.append(((MARKEDUPTTL[-1] - TTL[-1]) / (TTL[-1])*100))
    FBC.append(item['fareBasisCode'])
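The answer stops at filling the lists; the frame is then presumably built once after the loop ends, with one row per item. A sketch, reusing the column labels from the question:

df = pd.DataFrame({
    'Dept': DepartureDate,
    'Ret': ReturnDate,
    'AirlineCode': Airline,
    'Routing': OD,
    'RealFare': TrueBaseFare,
    'Fuel': YQYR,
    'Taxes': TAX,
    'RealTotal': TTL,
    'AgencyTotal': MARKEDUPTTL,
    'Margin': MARKUP,
    'FareBasis': FBC,
})
df.to_csv('MITest7.csv')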
Good evening,
I will start by saying I am very early in my coding journey. Currently using a number of excel sheets from government data for a pandas project. Each of these sheets represents a year. I am attempting to add a column to each dataframe before I concat the entire list so I know which year the data came from at each point. Currently, the code looks like this:
'''
df10 = pd.read_excel(r'C:\Market research\national_m2010_dl.xls')
df11 = pd.read_excel(r'C:\Market research\national_m2011_dl.xls')
df12 = pd.read_excel(r'C:\Market research\national_m2012_dl.xls')
df13 = pd.read_excel(r'C:\Market research\national_m2013_dl.xls')
df14 = pd.read_excel(r'C:\Market research\national_m2014_dl.xlsx')
df15 = pd.read_excel(r'C:\Market research\national_m2015_dl.xlsx')
df16 = pd.read_excel(r'C:\Market research\national_m2016_dl.xlsx')
df17 = pd.read_excel(r'C:\Market research\national_m2017_dl.xlsx')
df18 = pd.read_excel(r'C:\Market research\national_m2018_dl.xlsx')
df19 = pd.read_excel(r'C:\Market research\national_m2019_dl.xlsx')
df10['Year'] = '2010'
df11['Year'] = '2011'
df12['Year'] = '2012'
df13['Year'] = '2013'
df14['Year'] = '2014'
df15['Year'] = '2015'
df16['Year'] = '2016'
df17['Year'] = '2017'
df18['Year'] = '2018'
df19['Year'] = '2019'
'''
However, I am sure there is a cleaner way to do this. I originally attempted a for loop similar to this:
'''
for num in range(10,20):
    df+str(num)['Year'] = '20'+str(num)
'''
but I had no luck. Thoughts?
Try this:
df_dic = dict()
for n in range(10, 20):  # remember, the second number in range is exclusive
    year = f"20{n}"
    ext = 'xls' if n <= 13 else 'xlsx'  # the 2010-2013 files are .xls, the later ones .xlsx
    # raw f-string: without the r prefix, the \n in the path would be read as a newline escape
    df = pd.read_excel(rf'C:\Market research\national_m{year}_dl.{ext}')
    df["Year"] = year
    df_dic[year] = df
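With the frames keyed by year, the concat step the question mentions becomes a one-liner (a usage sketch):

all_years = pd.concat(df_dic.values(), ignore_index=True)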
Instead of using df10, df11, ... you can use df[10], df[11], ..., which makes the code much simpler.
df = {}  # a dict of dataframes keyed by the two-digit year
for num in range(10, 20):
    ext = 'xls' if num <= 13 else 'xlsx'  # match the extensions used in the question
    df[num] = pd.read_excel(r'C:\Market research\national_m20' + str(num) + '_dl.' + ext)
    df[num]['Year'] = '20' + str(num)
I have written the following code, in which the function model_data performs a particular set of tasks. I pass in the list of badges and the category type, 1 or 2, along with an empty dataframe data.
But while running the code I am getting an error. I searched SO for answers, but this type of question was not found.
CODE
#Model Function
def model_data(badge_list, data):
    for key, value in badge_list.items():
        #Check for Post Type
        if (value == 1):
            badge_type = posts.loc[posts.PostTypeId == '1']
        elif (value == 2):
            badge_type = posts.loc[posts.PostTypeId == '2']
        #Obtain required fields from Badge Data
        badge_type = badge_type[['OwnerUserId', 'Id', 'Score', 'CreationDate']]
        badge_type.columns = ['UserId', 'Id', 'Score', 'CreationDate']
        Badge = key
        #Obtain time when user first obtained Badge
        badge_data = user_badge_dt(Badge)
        #Find the number of posts made before and after 1 week of Badge Attainment
        post_data = post_details(df1 = badge_data, df2 = badge_type)
        post_data.date = pd.to_datetime(post_data.date)
        #Calculate APR
        post_data = APR(post_data)
        #Calculate Score
        post_data = score(df = post_data, post_type = badge_type)
        #Generate Final Dataframe with Badge Count
        data1 = badge_number(post_data)
        data1 = data1[['1','2','3','date','Score','APR']]
        #Append Dataframe
        data = data.append(data1)
    return data
#Function Call
questionBadge_list = {'Good Question':1, 'Explainer':2}
data = pd.DataFrame()
badge1_data = model_data(badge_list = questionBadge_list, data = data)
ERROR
IndexError: Too many levels: Index has only 1 level, not 2
ERROR LINE
The line badge_data = user_badge_dt(Badge) raises this error, so I am adding the complete function:
#Function to obtain UserId with the date-time of obtaining given badge for the first time
def user_badge_dt(badge):
    #Creating DataFrame to obtain all UserId and date-Time of given badge
    df = badges[['UserId','Date']].loc[badges.Name == badge]
    #Obtaining the first date-time of badge attainment
    v = df.groupby("UserId", group_keys=False)['Date'].nsmallest(1)
    v.index = v.index.droplevel(1)
    df['date'] = df['UserId'].map(v)
    df.drop(columns='Date', inplace=True)
    #Removing all duplicate values of Users
    df.drop_duplicates(subset='UserId', inplace=True)
    return df
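For what it's worth, the traceback points at v.index.droplevel(1): with group_keys=False, nsmallest(1) keeps the original single-level row index, so there is no second level to drop. A minimal sketch that sidesteps the MultiIndex entirely, assuming the intent is the earliest Date per user:

# Earliest badge date per user, indexed by UserId; no droplevel needed.
v = df.groupby('UserId')['Date'].min()
df['date'] = df['UserId'].map(v)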
I have built an inverted-index dictionary for a text collection and need to compress the dictionary using blocked compression with k = 8; in the postings file, the gaps between docIds should be gamma-encoded.
def createCompressedIndex(dictionary_uncomp_v1):
    for term in dictionary_uncomp_v1.keys():
        entry = dictionary_uncomp_v1.get(term)
        postingList = []
        prevId = 0
        ptemp = ''
        for pEntry in entry.postingList:
            docId = getGammaCode(pEntry.docId - prevId)  # gamma-encode the gap, not the raw docId
            frequency = getGammaCode(pEntry.termFreq)
            newPEntry = PostingEntry(docId, frequency, 0, 0)
            postingList.append(newPEntry)  # append the entry itself; extend() expects an iterable
            prevId = pEntry.docId
            ptemp += docId + frequency  # accumulate the codes for every posting
        docFrequency = getGammaCode(entry.docFreq)
        entrytemp = ptemp + docFrequency
        totalTermFreq = getGammaCode(entry.totTermFreq)
        compressedEntry = DictEntry(term, docFrequency, totalTermFreq, postingList)
        dictionary_comp_v1[term] = bytearray(entrytemp, 'utf-8')  # bytearray of a str needs an encoding
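For reference, a minimal sketch of Elias gamma encoding as a bit string, which is what getGammaCode above is assumed to return (the string return type is an assumption based on how the codes are concatenated):

def getGammaCode(n):
    # Elias gamma: a unary prefix of len(binary(n)) - 1 zeros, followed by
    # the binary representation of n itself. Requires n >= 1, which holds
    # for docId gaps and term frequencies.
    b = bin(n)[2:]                   # e.g. n = 9 -> '1001'
    return '0' * (len(b) - 1) + b    # -> '0001001'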