Column value combinations with multiple constraints based on column header - python

I have two columns, such as:
Name: Alex, Mohan, Rex
City: Delhi, Chennai, Mumbai, Kolkata
Constraint:
(if Name = Mohan then City = Chennai) or (if Name = Rex then City = Mumbai)
Output:
[(Alex, Delhi), (Alex, Chennai), (Alex, Mumbai), (Alex, Kolkata), (Mohan, Chennai), (Rex, Mumbai)]
I am able to generate the normal combinations, which would be 12, but I am not able to apply the constraint. Please suggest your solutions.

You could do a cross join and then filter out the rows manually.
Assuming your dataframe looks like this:
    name   city
0   Alex   Delhi
1   Mohan  Chennai
2   Rex    Mumbai
3   NaN    Kolkata
import pandas as pd

df2 = pd.merge(
    df[['city']].assign(key='key'),
    df[['name']].dropna().assign(key='key'),
    on='key', how='outer'
).drop('key', axis=1)
mohan = df2[(df2['name'] == 'Mohan') & (df2['city'] == 'Chennai')].index
rex = df2[(df2['name'] == 'Rex') & (df2['city'] == 'Mumbai')].index
df3 = pd.concat([df2.loc[mohan.union(rex)], df2[~df2['name'].isin(['Mohan', 'Rex'])]])
print(list(df3.itertuples(index=None, name=None)))
[('Chennai', 'Mohan'),
('Mumbai', 'Rex'),
('Delhi', 'Alex'),
('Chennai', 'Alex'),
('Mumbai', 'Alex'),
('Kolkata', 'Alex')]
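As a side note, on pandas 1.2+ the dummy key column isn't needed: how='cross' produces the same cross join. A short sketch, assuming the same df as above:
df2 = pd.merge(df[['city']], df[['name']].dropna(), how='cross')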

names = ["Alex", "Mohan", "Rex"]
cities = ["Delhi", "Chennai", "Mumbai", "Kolkata"]
constraint = {"Mohan":"Chennai","Rex":"Mumbai"}
result = []
for name in names:
if name in constraint:
result.append((name,constraint[name]))
continue
else:
for city in cities:
result.append((name,city))
print(result)
output:
[('Alex', 'Delhi'), ('Alex', 'Chennai'), ('Alex', 'Mumbai'), ('Alex', 'Kolkata'), ('Mohan', 'Chennai'), ('Rex', 'Mumbai')]
other scenario:
cities = ["Delhi", "Chennai", "Mumbai", "Kolkata"]
cities_map = [1, 2, 3, 4]
dict_city = dict(zip(cities_map, cities))
result2 = []
for city_ix, city in dict_city.items():
    if city_ix >= 2:
        result2.append(("Alex", city))
    else:
        for name in names:
            if name != "Alex":  # compare strings with != rather than "is not"
                result2.append((name, city))
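For reference, a generic sketch that filters the full Cartesian product with a predicate; it reuses the names, cities and constraint dictionary defined in the first snippet above:
from itertools import product

def constrained_pairs(names, cities, constraint):
    # keep (name, city) unless the name is constrained to a different city
    return [(n, c) for n, c in product(names, cities)
            if n not in constraint or constraint[n] == c]

print(constrained_pairs(names, cities, constraint))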

Related

Extract data from nested JSON | Pandas

I'm dealing with a nested JSON in order to extract data about transactions from my database using pandas.
My JSON can have one of these contents:
{"Data":{"Parties":[{"ID":"JackyID","Role":12}],"NbIDs":1}} #One party identified
{"Data":{"Parties":[{"ID":"JackyID","Role":12},{"ID":"SamNumber","Role":10}],"NbIDs":2}} #Two Parties identified
{"Data":{"Parties":[],"NbIDs":0}} #No parties identified
{"Data": None} #No data
When looking to extract the values of ID (the party's ID, a string) and Role (an int: 12 refers to buyers and 10 to sellers) and write them to a pandas dataframe, I'm using the following code:
for i, row in df.iterrows():
    json_data = json.dumps(row['Data'])
    data = pd_json.loads(json_data)
    data_json = json.loads(data)
    df['ID'] = pd.json_normalize(data_json, ['Data', 'Parties'])['ID']
    df['Role'] = pd.json_normalize(data_json, ['Data', 'Parties'])['Role']
Now, when trying to check its values and give every Role its corresponding ID:
for i, row in df.iterrows():
    if row['Role'] == 12:
        df.at[i, 'Buyer'] = df.at[i, 'ID']
    elif row['Role'] == 10:
        df.at[i, 'Seller'] = df.at[i, 'ID']
df = df[['Buyer', 'Seller']]
The expected df result for the given scenario should be as below :
{"Data":{"Parties":[{"ID":"JackyID","Role":12}],"NbIDs":1}} #Transaction 1
{"Data":{"Parties":[{"ID":"JackyID","Role":12},{"ID":"SamNumber","Role":10}],"NbIDs":2}} #Transaction 2
{"Data":{"Parties":[],"NbIDs":0}} #Transaction 3
{"Data": None} #Transaction 4
>>> print(df)
Buyer | Seller
------------------
JackyID| #Transaction 1 we have info about the buyer
JackyID| SamNumber #Transaction 2 we have infos about the buyer and the seller
| #Transaction 3 we don't have any infos about the parties
| #Transaction 4 we don't have any infos about the parties
What is the correct way to do so ?
You can treat case 4, where there is no Data, the same as an empty Parties list:
df = pd.DataFrame(data['Data']['Parties'] if data['Data'] else [], columns=['ID', 'Role'])
df['Role'] = df['Role'].map({10: 'Seller', 12: 'Buyer'})
Then add possible missing values for Role
df = df.set_index('Role').reindex(['Seller', 'Buyer'], fill_value=pd.NA).T
print(df)
# Case 1
Role Seller Buyer
ID <NA> JackyID
# Case 2
Role Seller Buyer
ID SamNumber JackyID
# Case 3
Role Seller Buyer
ID <NA> <NA>
# Case 4
Role Seller Buyer
ID <NA> <NA>
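To produce the Buyer/Seller frame for all four transactions at once, a minimal sketch, assuming each record's Data entry is an already-parsed dict (or None) rather than a JSON string:
import pandas as pd

records = [
    {"Data": {"Parties": [{"ID": "JackyID", "Role": 12}], "NbIDs": 1}},
    {"Data": {"Parties": [{"ID": "JackyID", "Role": 12}, {"ID": "SamNumber", "Role": 10}], "NbIDs": 2}},
    {"Data": {"Parties": [], "NbIDs": 0}},
    {"Data": None},
]

def parties_to_row(data):
    # treat a missing Data block the same as an empty Parties list
    parties = (data or {}).get("Parties") or []
    row = {"Buyer": pd.NA, "Seller": pd.NA}
    for party in parties:
        if party.get("Role") == 12:
            row["Buyer"] = party.get("ID")
        elif party.get("Role") == 10:
            row["Seller"] = party.get("ID")
    return row

df = pd.DataFrame([parties_to_row(r["Data"]) for r in records])
print(df)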

I need to find the n best students overall, and the m best students of each country will be grouped together to form their national group

I managed to get the 32 best points. Now I am trying to get the index of 32 best students so that I can show who they are.
The link to my json file is here:
https://drive.google.com/file/d/1OOkX1hAD6Ot-I3h_DUM2gRqdSl5Hy2Pl/view
And the code is below:
import json

file_path = "C:/Users/User/Desktop/leksion 10/testim/u2/olympiad.json"
with open(file_path, 'r') as j:
    contents = json.loads(j.read())
print(contents)
print("\n================================================")

class Competitor:
    def __init__(self, first_name, last_name, country, point):
        self.first_name = first_name
        self.last_name = last_name
        self.country = country
        self.point = int(point)

    def __repr__(self):
        return f'{self.first_name} {self.last_name} {self.country} {self.point}'

olimpiade = []
for i in contents:
    olimpiade.append(Competitor(i.get('first_name'),
                                i.get('last_name'),
                                i.get('country'),
                                i.get('point')))
print(olimpiade)
print("\n================================================")
# The 32 best students will advance to the second phase. Build a function that returns the second-phase competitors.
print("\n================================================")
print(type(olimpiade))
print(type(contents))
print(type(Competitor))
for i in contents:
    print(i)
print("\n================================================")
# This is the attempt that does not work: Competitor objects are not iterable and list.sort() returns None
for i in olimpiade:
    for j in i:
        L = olimpiade.sort(key=lambda x: x.point)
print(L)
I have tried this, for example:
pike = []
for value in contents:
    pike.append(value['point'])
print(pike)
n = 32
pike.sort()
print(pike[-n:])
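Sorting the Competitor objects themselves, rather than only the points, keeps track of who the top 32 are; a minimal sketch, reusing the olimpiade list built above:
top32 = sorted(olimpiade, key=lambda s: s.point, reverse=True)[:32]
print(top32)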
Using the data from your link and downloading to file 'olympiad.json'
Code
import json

def best_students(lst, n=1):
    '''
    Top n students
    '''
    return sorted(lst,
                  key=lambda d: d['point'],  # sort based upon points
                  reverse=True)[:n]          # take the top n students

def best_students_by_country(lst, m=1):
    '''
    Top m students in each country
    '''
    # Sort by country
    by_country = sorted(lst, key=lambda d: d['country'])
    groups = []
    for d in by_country:
        if not groups:
            groups.append([])
        elif groups[-1][-1]['country'] != d['country']:
            groups.append([])  # start a new country group
        # Append student to the current country group
        groups[-1].append(d)
    # List comprehension for the best m students in each group
    return [best_students(g, m) for g in groups]
Usage
# Deserialize the json file
with open('olympiad.json', 'r') as f:
    data = json.load(f)
# Top two students overall
print(best_students(data, 2))
# Top two students by country
print(best_students_by_country(data, 2))
Outputs
[{'first_name': 'Harvey',
'last_name': 'Massey',
'country': 'Bolivia',
'point': 9999},
{'first_name': 'Barbra',
'last_name': 'Knight',
'country': 'Equatorial Guinea',
'point': 9998}]
[[{'first_name': 'Wade',
'last_name': 'Dyer',
'country': 'Afghanistan',
'point': 9822},
{'first_name': 'Terrell',
'last_name': 'Martin',
'country': 'Afghanistan',
'point': 8875}],
[{'first_name': 'Delaney',
'last_name': 'Buck',
'country': 'Albania',
'point': 9729},
{'first_name': 'Melton',
'last_name': 'Ford',
'country': 'Albania',
'point': 9359}],
...
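The manual grouping loop in best_students_by_country can also be expressed with itertools.groupby over the country-sorted list; a sketch under the same assumptions, reusing best_students from above:
from itertools import groupby
from operator import itemgetter

def best_students_by_country(lst, m=1):
    # groupby needs the list pre-sorted by the grouping key
    by_country = sorted(lst, key=itemgetter('country'))
    return [best_students(list(group), m)
            for _, group in groupby(by_country, key=itemgetter('country'))]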
Here is how to make a useful dictionary out of your data.
First, I am assuming all your values are in a list called texts and that each value is a string.
We can get a list of countries from an external source:
pip install country-list
from country_list import countries_for_language
countries = dict(countries_for_language('en'))
countries = list(countries.values())
Initialise an empty dictionary, scores_dict = {}, then:
for i in texts:
    for j in countries:
        if j in i:
            country = j
            score = [int(s) for s in i.split() if s.isdigit()]
            try:
                scores_dict[country].extend(score)
            except KeyError:
                scores_dict[country] = score
This will give you a dictionary that looks like this
{'Albania': [5287],
'Bolivia': [1666],
'Croatia': [1201],
'Cyprus': [8508]}
From here, you can just iterate through each country to get top 5 students overall and top 5 students for each country.
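For example, the top m scores per country can then be read straight out of that dictionary (m is a placeholder here):
m = 5
top_by_country = {country: sorted(points, reverse=True)[:m]
                  for country, points in scores_dict.items()}
print(top_by_country)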
From your file I created a dataframe in pandas.
The general sorting is 'sorted_all'; 'ascending=False' means the highest points come first.
As an example of a national group, the best 7 students from Mexico are selected.
head() shows five rows by default.
import pandas as pd
df = pd.read_json('olympiad.json')
sorted_all = df.sort_values(by='point', ascending=False)
sorted_national = df.sort_values(['country','point'], ascending=[True, False])
print(sorted_all.head())
print(sorted_national.loc[sorted_national['country'] == 'Mexico'].head(7))
Output all
first_name last_name country point
1453 Harvey Massey Bolivia 9999
3666 Barbra Knight Equatorial Guinea 9998
5228 Rebecca Navarro Tunisia 9994
338 Jolene Pratt Mexico 9993
5322 Barnett Herrera Comoros 9986
Output national Mexico
first_name last_name country point
338 Jolene Pratt Mexico 9993
5118 Doyle Goodman Mexico 9980
2967 Mindy Watson Mexico 9510
6074 Riley Hall Mexico 9426
5357 Leah Collins Mexico 8798
5596 Luz Bartlett Mexico 8592
3684 Annette Perry Mexico 8457
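A sketch that generalizes the Mexico example to the top m students of every country at once (m is a placeholder value):
m = 7
top_per_country = (df.sort_values('point', ascending=False)
                     .groupby('country')
                     .head(m)
                     .sort_values(['country', 'point'], ascending=[True, False]))
print(top_per_country)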
There should be a grade range and a grade for each student; that is what will help you filter the best students.

How can I use a value in a dataframe to look up an attribute

Say I have the two dataframes below: one with a list of students and test scores, and one with student sessions made up of those students. Say I want to add a new column, "Sum", to df with the sum of the scores for each session, and a new column, "Years Elapsed", with the number of years passed since the most recent year that either student took the test. What is the best way to accomplish this? I can make the students a class and make each student an object, but then I am stuck on how to link the object to their name in the dataframe.
data1 = {'Student': ['John', 'Kim', 'Adam', 'Sonia'],
         'Score': [92, 100, 76, 82],
         'Year': [2015, 2013, 2016, 2018]}
df_students = pd.DataFrame(data1, columns=['Student', 'Score', 'Year'])
data2 = {'Session': [1, 2, 3, 4],
         'Student1': ['Sonia', 'Kim', 'John', 'Adam'],
         'Student2': ['Adam', 'Sonia', 'Kim', 'John']}
df = pd.DataFrame(data2, columns=['Session', 'Student1', 'Student2'])
The desired outcome:
outcome = {'Session': [1, 2, 3, 4],
           'Student1': ['Sonia', 'Kim', 'John', 'Adam'],
           'Student2': ['Adam', 'Sonia', 'Kim', 'John'],
           'Sum': [158, 182, 192, 168],
           'Years Elapsed': [4, 4, 7, 6]}
df_outcome = pd.DataFrame(outcome, columns=['Session', 'Student1', 'Student2', 'Sum', 'Years Elapsed'])
I have made a class called Student and made each student an object, but this is where I am stuck.
df_students.columns = df_students.columns.str.lower()

class Student:
    def __init__(self, s, sc, yr):
        self.student = s
        self.score = sc
        self.year = yr

students = [Student(row.student, row.score, row.year) for index, row in df_students.iterrows()]
# check to see if the list of objects was created correctly
s1 = students[1]
s1.__dict__
Thanks in advance!
Using the apply method:
import pandas as pd

data1 = {'Student': ['John', 'Kim', 'Adam', 'Sonia'],
         'Score': [92, 100, 76, 82],
         'Year': [2015, 2013, 2016, 2018]}
df_students = pd.DataFrame(data1, columns=['Student', 'Score', 'Year'])
data2 = {'Session': [1, 2, 3, 4],
         'Student1': ['Sonia', 'Kim', 'John', 'Adam'],
         'Student2': ['Adam', 'Sonia', 'Kim', 'John']}
df = pd.DataFrame(data2, columns=['Session', 'Student1', 'Student2'])

# SOLUTION
def sum_scores(student1, student2):
    _score_s1 = df_students.loc[df_students['Student'] == student1, 'Score'].values[0]
    _score_s2 = df_students.loc[df_students['Student'] == student2, 'Score'].values[0]
    return _score_s1 + _score_s2

def years_elapsed(student1, student2):
    _year = pd.to_datetime("today").year
    _year_s1 = df_students.loc[df_students['Student'] == student1, 'Year'].values[0]
    _year_s2 = df_students.loc[df_students['Student'] == student2, 'Year'].values[0]
    return _year - max(_year_s1, _year_s2)

df['sum_score'] = df.apply(lambda row: sum_scores(row['Student1'], row['Student2']), axis=1)
df['years_elapsed'] = df.apply(lambda row: years_elapsed(row['Student1'], row['Student2']), axis=1)
df
You can try this:
df2 = pd.merge(df, df_students, left_on="Student1", right_on="Student")
df3 = pd.merge(df2, df_students, left_on="Student2", right_on="Student")
df3['Sum'] = df3[['Score_x', 'Score_y']].sum(axis=1)
# 2022 was the current year at the time of writing; use pd.Timestamp('today').year to keep it current
df3['Years Elapsed'] = 2022 - df3[['Year_x', 'Year_y']].max(axis=1)
df3 = df3[['Session', 'Student1', 'Student2', 'Sum', 'Years Elapsed']]
print(df3)
It gives:
Session Student1 Student2 Sum Years Elapsed
0 1 Sonia Adam 158 4
1 2 Kim Sonia 182 4
2 3 John Kim 192 7
3 4 Adam John 168 6
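A merge-free variant is also possible: build Series lookups from df_students and map each student column through them (a sketch; pd.Timestamp('today') replaces the hard-coded year):
scores = df_students.set_index('Student')['Score']
years = df_students.set_index('Student')['Year']

df['Sum'] = df['Student1'].map(scores) + df['Student2'].map(scores)
latest = pd.concat([df['Student1'].map(years), df['Student2'].map(years)], axis=1).max(axis=1)
df['Years Elapsed'] = pd.Timestamp('today').year - latest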

Getting KeyError when trying to assign data from a dictionary into class and object in python

The file data used:
Austin = null|Stone Cold Austin|996003892|987045321|Ireland
keller = null|Mathew Keller|02/05/2002|0199999999|0203140819|019607892|9801 2828 5596 0889
The Nested Dictionary
data = {'Austin': {'Full Name': 'Stone Cold Steve Austin', 'Contact Details': '996003892', 'Emergency Contact Number': '987045321', 'Country': 'Ireland'}}
The class and object that I want to use to hold the dict data:
class member2:
    def __init__(self, realname, phone, emergencyContact, country):
        self.realname = realname
        self.phone = phone
        self.emergencyContact = emergencyContact
        self.country = country
Assigning the text file data to a nested dictionary:
with open("something.txt", 'r') as f:
    for line in f:
        key, values = line.strip().split(" = ")  # note the space around =, to avoid a trailing space in key
        values = values.split('|')
        data2 = {key: dict(zip(keys, values[1:]))}
        # To assign data to the class (NOT WORKING)
        member2.realname = data2[values[2]]
        print(member2)
        if key == username:
            data2 = {key: dict(zip(keys, values[1:]))}
Output
member2.realname = data2[values[2]]
KeyError: 'Stone Cold Steve Austin'
You are referring to a non-existent key, 'Stone Cold Steve Austin'.
Maybe you want to access something like data2[key][keys[0]]:
keys = ["Full Name", "Contact Details", "Emergency Contact Number", "Country"]
with open("we.txt", 'r') as f:
    for line in f:
        key, values = line.strip().split(" = ")  # note the space around =, to avoid a trailing space in key
        values = values.split('|')
        data2 = {key: dict(zip(keys, values[1:]))}
        print(data2[key][keys[0]])
Output:
Stone Cold Austin
Mathew Keller
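To go one step further and actually build member2 objects from the parsed values, a minimal sketch, assuming lines laid out like the Austin record (null|full name|phone|emergency contact|country) and the member2 class defined in the question:
members = {}
with open("we.txt", 'r') as f:
    for line in f:
        key, values = line.strip().split(" = ")
        fields = values.split('|')
        # fields[0] is the 'null' placeholder; the next four map onto the constructor arguments
        members[key] = member2(*fields[1:5])

print(members["Austin"].realname)  # Stone Cold Austin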

pandas add row using lookup value

I have a dataframe with a model id and associated values. The columns are date, client_id, model_id, category1, category2, color, and price. I have a simple Flask app where the user can select a model id and add it to their "purchase" history. Based on the model id, I would like to add a row to the dataframe and bring in the associated values of category1, category2, color, and price. What is the best way to do this using pandas? I know in Excel I'd use a VLOOKUP, but I am unsure how to go about it using Python. Assume category1, category2, color, and price are unique to each model id.
client_id = input("ENTER Client ID: ")
model_id = input("ENTER Model ID: ")
def update_history(df, client_id, model_id):
    today = pd.to_datetime('today')
    # putting in 'tmp' for now, but I just need to "lookup" these values from the original dataframe somehow
    df.loc[len(df)] = [today, client_id, model_id, today, 'tmp', 'tmp', 'tmp', 'tmp']
    return df
Code below adds a new row with new values to an existing dataframe. The list of new values could be passed in to the function.
Import libraries
import pandas as pd
import numpy as np
import datetime
Create sample dataframe
model_id = ['M1', 'M2', 'M3']
today = ['2018-01-01', '2018-01-02', '2018-01-01']
client_id = ['C1', 'C2', 'C3']
category1 = ['orange', 'apple', 'beans']
category2 = ['fruit', 'fruit', 'grains']
df = pd.DataFrame({'today': today, 'model_id': model_id, 'client_id': client_id,
                   'category1': category1, 'category2': category2})
df['today'] = pd.to_datetime(df['today'])
df
Function
def update_history(df, client_id, model_id, category1, category2):
    today = pd.to_datetime('today')
    # Create a temp dataframe with the new values.
    # Column names in this dataframe should match the existing dataframe
    temp = pd.DataFrame({'today': [today], 'model_id': [model_id], 'client_id': [client_id],
                         'category1': [category1], 'category2': [category2]})
    df = df.append(temp)
    return df
Call function to append a row with new values to existing dataframe
update_history(df, client_id='C4', model_id='M4', category1='apple', category2='fruit')
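Note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; on current pandas the same step inside update_history can be written with pd.concat, for example:
df = pd.concat([df, temp], ignore_index=True)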
You could try this. If you are appending more than one row at a time, appending dictionaries to a list and then building a dataframe from them at once is faster.
modelid = ['MOD1', 'MOD2', 'MOD3']
today = ['2018-07-15', '2018-07-18', '2018-07-20']
clients = ['CLA', 'CLA', 'CLB']
cat_1 = ['CAT1', 'CAT2', 'CAT3']
cat_2 = ['CAT11', 'CAT12', 'CAT13']
mdf = pd.DataFrame({"model_id": modelid, "today": today, "client_id": clients, "cat_1": cat_1, "cat_2": cat_2})
def update_history(df, client_id, model_id):
    today = pd.to_datetime('today')
    row = df[df.model_id == model_id].iloc[0]
    rows_list = []
    row_dict = {"today": today, "client_id": client_id,
                "model_id": model_id, "cat_1": row["cat_1"],
                "cat_2": row["cat_2"]}  # avoid naming the variable 'dict', which shadows the builtin
    rows_list.append(row_dict)
    df2 = pd.DataFrame(rows_list)
    df = df.append(df2)
    return df
mdf = update_history(mdf, "CLC", "MOD1")
This is what I ended up doing. I still think there is a more elegant solution, so please let me know!
# create dataframe
modelid = ['MOD1', 'MOD2', 'MOD3']
today = ['2018-07-15', '2018-07-18', '2018-07-20']
clients = ['CLA', 'CLA', 'CLB']
cat_1 = ['CAT1', 'CAT2', 'CAT3']
cat_2 = ['CAT11', 'CAT12', 'CAT13']
mdf = pd.DataFrame({"model_id": modelid, "today": today, "client_id": clients, "cat_1": cat_1, "cat_2": cat_2})
# reorder columns
mdf = mdf[['cat_1', 'cat_2', 'model_id', 'client_id', 'today']]
# create lookup table (copy to avoid SettingWithCopyWarning when dropping duplicates)
lookup = mdf[['cat_1', 'cat_2', 'model_id']].copy()
lookup.drop_duplicates(inplace=True)
# get values
client_id = input("ENTER Client ID: ")
model_id = input("ENTER Model ID: ")
# append model id to a list
model_id_lst = []
model_id_lst.append(model_id)
today = pd.to_datetime('today')
# grab the associated cat_1 and cat_2 from the lookup table
temp = lookup[lookup['model_id'].isin(model_id_lst)]
out = temp.values.tolist()
out[0].extend([client_id, today])
# add this as a row to the df
mdf.loc[len(mdf)] = out[0]
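A somewhat more compact variant (a sketch, matching the reordered mdf columns above): look up the attribute row once, then write a single new row with .loc.
def update_history(df, client_id, model_id):
    attrs = df.loc[df['model_id'] == model_id, ['cat_1', 'cat_2']].iloc[0]
    # column order: cat_1, cat_2, model_id, client_id, today
    df.loc[len(df)] = [attrs['cat_1'], attrs['cat_2'], model_id, client_id, pd.to_datetime('today')]
    return df

mdf = update_history(mdf, 'CLC', 'MOD1')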
