Extract data from nested JSON | Pandas - python

I'm dealing with nested JSON in order to extract transaction data from my database using pandas.
My JSON can have one of these contents:
{"Data":{"Parties":[{"ID":"JackyID","Role":12}],"NbIDs":1}} #One party identified
{"Data":{"Parties":[{"ID":"JackyID","Role":12},{"ID":"SamNumber","Role":10}],"NbIDs":2}} #Two Parties identified
{"Data":{"Parties":[],"NbIDs":0}} #No parties identified
{"Data": None} #No data
When trying to extract the values of ID (the party's ID, a string) and Role (an int: Role=12 means buyer, Role=10 means seller) and write them into a pandas dataframe, I'm using the following code:
for i, row in df.iterrows():
    json_data = json.dumps(row['Data'])
    data = pd_json.loads(json_data)
    data_json = json.loads(data)
    df['ID'] = pd.json_normalize(data_json, ['Data', 'Parties'])['ID']
    df['Role'] = pd.json_normalize(data_json, ['Data', 'Parties'])['Role']
Now, when trying to check its values and give every Role its corresponding ID:
for i, row in df.iterrows():
    if row['Role'] == 12:
        df.at[i, 'Buyer'] = df.at[i, 'ID']
    elif row['Role'] == 10:
        df.at[i, 'Seller'] = df.at[i, 'ID']
df = df[['Buyer', 'Seller']]
The expected df result for the given scenarios should be as below:
{"Data":{"Parties":[{"ID":"JackyID","Role":12}],"NbIDs":1}} #Transaction 1
{"Data":{"Parties":[{"ID":"JackyID","Role":12},{"ID":"SamNumber","Role":10}],"NbIDs":2}} #Transaction 2
{"Data":{"Parties":[],"NbIDs":0}} #Transaction 3
{"Data": None} #Transaction 4
>>> print(df)
Buyer   | Seller
------------------
JackyID |            #Transaction 1: we have info about the buyer
JackyID | SamNumber  #Transaction 2: we have info about the buyer and the seller
        |            #Transaction 3: we don't have any info about the parties
        |            #Transaction 4: we don't have any info about the parties
What is the correct way to do so?

You can treat case 4, where there is no Data, as a special case equivalent to empty Parties:
df = pd.DataFrame(data['Data']['Parties'] if data['Data'] else [], columns=['ID', 'Role'])
df['Role'] = df['Role'].map({10: 'Seller', 12: 'Buyer'})
Then reindex to add possibly missing values for Role, and transpose:
df = df.set_index('Role').reindex(['Seller', 'Buyer'], fill_value=pd.NA).T
print(df)
# Case 1
Role Seller Buyer
ID <NA> JackyID
# Case 2
Role Seller Buyer
ID SamNumber JackyID
# Case 3
Role Seller Buyer
ID <NA> <NA>
# Case 4
Role Seller Buyer
ID <NA> <NA>
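If you need one Buyer/Seller row per transaction in the original DataFrame, rather than one small frame per record, a minimal per-row sketch (assuming df['Data'] holds the already-parsed dict, or None as in case 4) could be:

import pandas as pd

def extract_parties(data):
    # data is one row's 'Data' dict, or None when no data was recorded
    parties = data.get('Parties', []) if isinstance(data, dict) else []
    roles = {p['Role']: p['ID'] for p in parties}
    # Role 12 = buyer, Role 10 = seller, per the question
    return pd.Series({'Buyer': roles.get(12, pd.NA), 'Seller': roles.get(10, pd.NA)})

df[['Buyer', 'Seller']] = df['Data'].apply(extract_parties)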

Related

Is it possible to join a table with itself twice? (Using SQLAlchemy or SQL)

I have the following model in SQLAlchemy:
class SomeEvent(db.Model):
    id = db.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    created_on = db.Column(db.DateTime())
    type = db.Column(db.String(20))
    event_target = db.Column(db.String(10))
    group = db.Column(db.String(20))
Is there any way to produce a query that returns all records of one type, plus records of two other types combined into a single row when they share the same event_target (otherwise as single rows), all of the same group and ordered by created_on date?
What I've managed to come up with:
Single_type_event = aliased(SomeEvent)
Paired_type_event_1 = aliased(SomeEvent)
Paired_type_event_2 = aliased(SomeEvent)

columns = [
    Single_type_event.id.label('single_type_event_id'),
    Single_type_event.type.label('single_type_event_type'),
    Single_type_event.event_target.label('single_type_event_target'),
    Paired_type_event_1.id.label('paired_type_event_1_id'),
    Paired_type_event_1.type.label('paired_type_event_1_type'),
    Paired_type_event_1.event_target.label('paired_type_event_1_target'),
    Paired_type_event_2.id.label('paired_type_event_2_id'),
    Paired_type_event_2.type.label('paired_type_event_2_type'),
    Paired_type_event_2.event_target.label('paired_type_event_2_target'),
]

query = (db.session.query(*columns)
         .outerjoin(SomeEvent, (
             (SomeEvent.group == 'some_group') &
             (SomeEvent.id == Single_type_event.id) |
             (SomeEvent.id == Paired_type_event_1.id) |
             (SomeEvent.id == Paired_type_event_2.id)
         ))
         .outerjoin(Single_type_event, (
             (SomeEvent.id == Single_type_event.id) &
             (SomeEvent.event_target == Single_type_event.event_target) &
             (SomeEvent.type == 'type_1')
         ))
         .outerjoin(Paired_type_event_1, (
             (SomeEvent.id == Paired_type_event_1.id) &
             (SomeEvent.event_target == Paired_type_event_1.event_target) &
             (SomeEvent.type == 'type_2')
         ))
         .outerjoin(Paired_type_event_2, (
             (SomeEvent.id == Paired_type_event_2.id) &
             (SomeEvent.event_target == Paired_type_event_2.event_target) &
             (SomeEvent.type == 'type_3')
         ))
         .order_by(SomeEvent.created_on.asc())
)
But of course it doesn't work, because I'm lacking the understanding of how to do this right, if it's even possible =(
Example of the query results that I would like to achieve (._asdict() applied):
If a record with a given event_target has type == 'type_1':
{'single_type_event_id': 'some_id',
'single_type_event_type': 'type_1',
'single_type_event_target': 'target_1',
'paired_type_event_1_id': None,
'paired_type_event_1_type': None,
'paired_type_event_1_target': None,
'paired_type_event_2_id': None,
'paired_type_event_2_type': None,
'paired_type_event_2_target': None}
If there is only a record with type == 'type_2' for a given event_target (still it could have a type_1 record earlier but it should be in a distinct row like the one before):
{'single_type_event_id': None,
'single_type_event_type': None,
'single_type_event_target': None,
'paired_type_event_1_id': 'some_id',
'paired_type_event_1_type': 'type_2',
'paired_type_event_1_target': 'target_1',
'paired_type_event_2_id': None,
'paired_type_event_2_type': None,
'paired_type_event_2_target': None}
And, finally, if there are records of both event types for a given event_target (there shouldn't be distinct rows for each type - only this combined one):
{'single_type_event_id': None,
'single_type_event_type': None,
'single_type_event_target': None,
'paired_type_event_1_id': 'some_id_1',
'paired_type_event_1_type': 'type_2',
'paired_type_event_1_target': 'target_1',
'paired_type_event_2_id': 'some_id_2',
'paired_type_event_2_type': 'type_3',
'paired_type_event_2_target': 'target_1'}
I would like all these results to have the same group and be ordered by created_on (for the last example it should be ordered by the paired_type_event_1 date).
An abstract example:
A person_1 (event_target) lives in a town_a (group). He has a record of when he was born (type == 'birth'), and this is his single_type_event (first case of the results above). This person_1 also has a record that he started school (type == 'enrollment') but no record of graduation; this is his paired_type_event_1 (second case of the results). If this person had a graduation record (type == 'graduation'), it would be presented in the same row as the enrollment record. Assuming this table is actually a paper record book for town_a, the records should be organized in the following order: born (a single row) -> enrolled (another row); for a person who graduated: born (a single row) -> enrolled+graduated (another single row).
I know that it looks like a mess and I have some solid doubts about this, but if it's possible to achieve in SQLAlchemy or using raw SQL I would be very thankful for the guidance!
Not sure I completely followed the requirement, but assuming that 'event types' are unique within an 'event group' (that is, the 'event group' binds the single event and the paired events) ...
The idea is to start with a list of all event groups and targets (allGroups), then match to it when data of the various types is available (left joins)
;with allGroups as
(
    select distinct eventGroup, eventTarget
    from SomeEvent
    where ???
)
select ag.eventGroup, ag.eventTarget,
       type1 = e1.eventType, type2 = e2.eventType, type3 = e3.eventType,
       createDate = case when e1.createDate is not null then e1.createDate
                         when e2.createDate is not null then e2.createDate
                         when e3.createDate is not null then e3.createDate
                         else null
                    end
from allGroups ag
left join SomeEvent e1 on ag.eventTarget = e1.eventTarget and ag.eventGroup = e1.eventGroup and e1.eventType = 'type_1'
left join SomeEvent e2 on ag.eventTarget = e2.eventTarget and ag.eventGroup = e2.eventGroup and e2.eventType = 'type_2'
left join SomeEvent e3 on ag.eventTarget = e3.eventTarget and ag.eventGroup = e3.eventGroup and e3.eventType = 'type_3'
order by createDate
If you can't do CTEs (Common Table Expressions) on your platform, you can do the initial select into a temp table, table variable, etc.
Hopefully this will get you thinking.
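Since the question also asks about SQLAlchemy, a rough translation of the same CTE-plus-left-joins idea might look like the sketch below (untested; it assumes the Flask-SQLAlchemy model above, and uses coalesce in place of the CASE expression for the ordering date):

from sqlalchemy import func
from sqlalchemy.orm import aliased

# CTE: all distinct (group, event_target) pairs for the group of interest
all_groups = (
    db.session.query(SomeEvent.group, SomeEvent.event_target)
    .filter(SomeEvent.group == 'some_group')
    .distinct()
    .cte('all_groups')
)

e1, e2, e3 = aliased(SomeEvent), aliased(SomeEvent), aliased(SomeEvent)

query = (
    db.session.query(
        all_groups.c.event_target,
        e1.type.label('type1'),
        e2.type.label('type2'),
        e3.type.label('type3'),
    )
    .outerjoin(e1, (e1.event_target == all_groups.c.event_target)
                 & (e1.group == all_groups.c.group)
                 & (e1.type == 'type_1'))
    .outerjoin(e2, (e2.event_target == all_groups.c.event_target)
                 & (e2.group == all_groups.c.group)
                 & (e2.type == 'type_2'))
    .outerjoin(e3, (e3.event_target == all_groups.c.event_target)
                 & (e3.group == all_groups.c.group)
                 & (e3.type == 'type_3'))
    # order by the first non-null date, mirroring the CASE in the SQL answer
    .order_by(func.coalesce(e1.created_on, e2.created_on, e3.created_on))
)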

Pandas Dataframe iteration loop keeps loading indefinitely

I have the following dataframe consisting of UserId and the Name of the badge earned by that person on Stackoverflow. Now, each badge belongs to a particular category such as Question, Answer, Participation, Moderation and Tag. I want to create a column called Category to store the category of each badge.
The code that I have written works well when the data has fewer than 1M users; for more data it just keeps loading. How do I fix this?
Dataframe (badges)
UserId | Name
1 | Altruist
2 | Autobiographer
3 | Enlightened
4 | Citizen Patrol
5 | python
Code
def category(df):
    questionCategory = ['Altruist', 'Benefactor', 'Curious', 'Inquisitive', 'Socratic', 'Favorite Question', 'Stellar Question', 'Investor', 'Nice Question', 'Good Question', 'Great Question', 'Popular Question', 'Notable Question', 'Famous Question', 'Promoter', 'Scholar', 'Student']
    answerCategory = ['Enlightened', 'Explainer', 'Refiner', 'Illuminator', 'Generalist', 'Guru', 'Lifejacket', 'Lifeboat', 'Nice Answer', 'Good Answer', 'Great Answer', 'Populist', 'Revival', 'Necromancer', 'Self-Learner', 'Teacher', 'Tenacious', 'Unsung Hero']
    participationCategory = ['Autobiographer', 'Caucus', 'Constituent', 'Commentator', 'Pundit', 'Enthusiast', 'Fanatic', 'Mortarboard', 'Epic', 'Legendary', 'Precognitive', 'Beta', 'Quorum', 'Convention', 'Talkative', 'Outspoken', 'Yearling']
    moderationCategory = ['Citizen Patrol', 'Deputy', 'Marshal', 'Civic Duty', 'Cleanup', 'Constable', 'Sheriff', 'Critic', 'Custodian', 'Reviewer', 'Steward', 'Disciplined', 'Editor', 'Strunk & White', 'Copy Editor', 'Electorate', 'Excavator', 'Archaelogist', 'Organizer', 'Peer Pressure', 'Proofreader', 'Sportsmanship', 'Suffrage', 'Supporter', 'Synonymizer', 'Tag Editor', 'Research Assistant', 'Taxonomist', 'Vox Populi']
    # Tag Category will be represented as 0
    df['Category'] = 0
    for i in range(len(df)):
        if df.loc[i, "Name"] in questionCategory:
            df.loc[i, 'Category'] = 1
        elif df.loc[i, "Name"] in answerCategory:
            df.loc[i, 'Category'] = 2
        elif df.loc[i, "Name"] in participationCategory:
            df.loc[i, 'Category'] = 3
        elif df.loc[i, "Name"] in moderationCategory:
            df.loc[i, 'Category'] = 4
    return df
category(stackoverflow_badges)
Expected Output
UserId | Name | Category
1 | Altruist | 1
2 | Autobiographer | 3
3 | Enlightened | 2
4 | Citizen Patrol | 4
5 | python | 0
If you want to update a dataframe with more than 1M rows, then you definitely want to avoid for loops whenever possible. There is an easier way to update your 'Category' column: convert your 4 lists of badge names into a single dictionary mapping each badge name to its numerical category, like:
category_dict = {
    **{key: 1 for key in questionCategory},
    **{key: 2 for key in answerCategory},
    **{key: 3 for key in participationCategory},
    **{key: 4 for key in moderationCategory},
}
And then you can replace the whole for loop with this one command:
df['Category'] = df['Name'].map(category_dict).fillna(0)
This may not solve your whole issue, but at least it will save some time.
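One detail worth noting: map() leaves unmatched names as NaN, so after fillna(0) the column ends up as float. If you want integer categories like in the expected output, an explicit cast restores them:

df['Category'] = df['Name'].map(category_dict).fillna(0).astype(int)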

Column value combinations with multiple constraints based on column header

I have two columns such as:
Name : Alex, Mohan, Rex
City : Delhi, Chennai, Mumbai, Kolkata
Constraint:
(if Name = Mohan then City = Chennai) or (if Name = Rex then City = Mumbai)
Output:
[(Alex,Delhi),(Alex,Chennai),(Alex,Mumbai),(Alex,Kolkata),(Mohan,Chennai),(Rex, Mumbai)]
I am able to generate the normal combinations, which would be 12, but after applying the constraint I am not. Please suggest your solutions.
You could do a cross join then filter out your rows manually.
Assuming your dataframe looks like this:
name city
0 Alex Delhi
1 Mohan Chennai
2 Rex Mumbai
3 NaN Kolkata
df2 = pd.merge(
    df[['city']].assign(key='key'),
    df[['name']].dropna().assign(key='key'),
    on='key', how='outer'
).drop('key', axis=1)

mohan = df2[(df2['name'] == 'Mohan') & (df2['city'] == 'Chennai')].index
rex = df2[(df2['name'] == 'Rex') & (df2['city'] == 'Mumbai')].index

df3 = pd.concat([df2.iloc[mohan | rex], df2[~df2['name'].isin(['Mohan', 'Rex'])]])
print(list(df3.itertuples(index=None, name=None)))
[('Chennai', 'Mohan'),
('Mumbai', 'Rex'),
('Delhi', 'Alex'),
('Chennai', 'Alex'),
('Mumbai', 'Alex'),
('Kolkata', 'Alex')]
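As a side note, pandas 1.2+ supports a cross join directly, so the dummy-key trick can be dropped (a sketch against the same frame):

df2 = pd.merge(df[['city']], df[['name']].dropna(), how='cross')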
names = ["Alex", "Mohan", "Rex"]
cities = ["Delhi", "Chennai", "Mumbai", "Kolkata"]
constraint = {"Mohan":"Chennai","Rex":"Mumbai"}
result = []
for name in names:
if name in constraint:
result.append((name,constraint[name]))
continue
else:
for city in cities:
result.append((name,city))
print(result)
output:
[('Alex', 'Delhi'), ('Alex', 'Chennai'), ('Alex', 'Mumbai'), ('Alex', 'Kolkata'), ('Mohan', 'Chennai'), ('Rex', 'Mumbai')]
Other scenario:
cities = ["Delhi", "Chennai", "Mumbai", "Kolkata"]
cities_map = [1, 2, 3, 4]
dict_city = dict(zip(cities_map, cities))

result2 = []
for city_ix, city in dict_city.items():
    if city_ix >= 2:
        result2.append(("Alex", city))
    else:
        for name in names:
            if name != "Alex":  # use != for string comparison, not 'is not'
                result2.append((name, city))
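For the original constraint, an equivalent filtering approach with itertools.product also works: constraint.get(name, city) falls back to the city itself for unconstrained names, so every pair survives for them, while constrained names keep only their required city.

from itertools import product

names = ["Alex", "Mohan", "Rex"]
cities = ["Delhi", "Chennai", "Mumbai", "Kolkata"]
constraint = {"Mohan": "Chennai", "Rex": "Mumbai"}

result = [(name, city) for name, city in product(names, cities)
          if constraint.get(name, city) == city]
# [('Alex', 'Delhi'), ('Alex', 'Chennai'), ('Alex', 'Mumbai'),
#  ('Alex', 'Kolkata'), ('Mohan', 'Chennai'), ('Rex', 'Mumbai')]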

Cannot get the value if the sharepoint column type is "Person" - Python

I am trying to extract a list from SharePoint. The thing is that if the column type is "Person or Group", Python shows me a KeyError, but if the column type is different I can get it.
This is my code to get the values:
print("Item title: {0}, Id: {1}".format(item.properties["Title"], item.properties['AnalystName']))
Title works but AnalystName does not; both are the internal names in SharePoint.
authcookie = Office365('https://xxxxxxxxx.sharepoint.com', username='xxxxxxxxx', password='xxxxxxxxx').GetCookies()
site = Site('https://xxxxxxxxxxxx.sharepoint.com/sites/qualityassuranceteam', authcookie=authcookie)
new_list = site.List('Process Review - Customer Service Opt In/Opt Out')
query = {'Where': [('Gt', 'Audit Date', '2020-02-16')]}
sp_data = new_list.GetListItems(fields=['App ID', 'Analyst Name', 'Team Member Name', "Team Member's Supervisor Name",
                                        'Audit Date', 'Event Date (E.g. Call date)', 'Product Type', 'Master Contact Id',
                                        'Location', 'Team member read the disclosure?', 'Team member withheld the disclosure?',
                                        'Did the team member take the correct action?', 'Did the team member notate the account?',
                                        'Did the team member add the correct phone number?', 'Comment (Required)',
                                        'Modified'], query=query)
#print(sp_data[0])
final_file = ''  # accumulates the output text
num = 0
for k in sp_data:
    values = sp_data[num].values()
    val = "|".join(str(v).replace('None', 'null') for v in values) + '\n'
    num += 1
    final_file += val

file_name = 'test.txt'
with open(file_name, 'a', encoding='utf-8') as file:
    file.write(final_file)
So right now I'm getting what I want, but there is a problem: when a column is empty it skips the column instead of leaving an empty space. For example:
col-1 | col-2 | col-3 |
HI | 10 | 8 |
Hello | | 7 |
So in this table, row 1 is full and it brings me everything as:
HI|10|8
but the second row brings me
Hello|7
and I need Hello||7
Person fields get parsed under different names in the returned items. For example, UserName becomes UserNameId and UserNameString.
That is the reason for the KeyError: the items list does not contain the original field name.
Use the code below to get the person field values.
# Python code
from office365.runtime.auth.user_credential import UserCredential
from office365.sharepoint.client_context import ClientContext

site_url = "enter sharepoint url"
sp_list = "enter list name"
ctx = ClientContext(site_url).with_credentials(UserCredential("username", "password"))
tasks_list = ctx.web.lists.get_by_title(sp_list)
items = tasks_list.items.get().select(["*", "UserName/Id", "UserName/Title"]).expand(["UserName"]).execute_query()
for item in items:  # type: ListItem
    print("{0}".format(item.properties.get('UserName').get("Title")))

How do I merge two csv files?

I have two csv files. EMPLOYEES contains a dict of every employee at a company, with 10 columns of information about each one. SOCIAL contains a dict of employees who filled out a survey, with 8 columns of information. Every employee in the survey is also in the master dict. Both dicts have a unique identifier (the EXTENSION).
I want to say "If an employee is in the SOCIAL dict, add columns 4, 5, 6 to their row in the EMPLOYEES dict". In other words, if an employee filled out a survey, the additional information should be appended to the master dict.
Currently, my program pulls out all information from EMPLOYEES for employees who have taken the SURVEY. But I don't know how to add the additional rows of information to the EMPLOYEES csv. I have spent much of the day reading StackOverflow about DictReader and Dictionary and am still confused.
Thank you in advance for your guidance.
Sample EMPLOYEE:
Name Extension Job
Bill 1111 plumber
Alice 2222 fisherman
Carl 3333 rodeo clown
Sample SURVEY:
Extension Favorite Color Book
2222 blue A Secret Garden
3333 green To Kill a Mockingbird
Sample OUTPUT
Name Extension Job Favorite Color Favorite Book
Bill 1111 plumber
Alice 2222 fisherman blue A Secret Garden
Carl 3333 rodeo clown green To Kill a Mockingbird
import csv

with open('employees.csv', "rU") as npr_employees:
    employees = csv.DictReader(npr_employees)
    all_employees = {}
    total_employees = {}
    for employee in employees:
        all_employees[employee['Extension']] = employee

with open('social.csv', "rU") as social_employees:
    social_employee = csv.DictReader(social_employees)
    for row in social_employee:
        print all_employees.get(row['Extension'], None)
You can merge two dictionaries in Python 2 using:
dict(d1.items() + d2.items())
(On Python 3, items() returns views that can't be concatenated; there you'd write {**d1, **d2} instead.)
Using a dict, all_employees, with the key as 'Extension' works perfectly to link a "social employee" row with its corresponding "employee" row.
Then you need to go through all the updated employee info and output their fields in a consistent order. Since dictionaries are inherently orderless, we keep a list of the headers, output_headers as we see them.
import csv

# Store all the info about the employees
all_employees = {}
output_headers = []

# First, get all employee record info
with open('employees.csv', 'rU') as npr_employees:
    employees = csv.DictReader(npr_employees)
    for employee in employees:
        ext = employee['Extension']
        all_employees[ext] = employee
    # Add headers from "all employees"
    output_headers.extend(employees.fieldnames)

# Then, get all info from social, and update employee info
with open('social.csv', 'rU') as social_employees:
    social_employees = csv.DictReader(social_employees)
    for social_employee in social_employees:
        ext = social_employee['Extension']
        # Combine the two dictionaries.
        all_employees[ext] = dict(
            all_employees[ext].items() + social_employee.items()
        )
    # Add headers from "social employees", but don't add duplicate fields
    output_headers.extend(
        [field for field in social_employees.fieldnames
         if field not in output_headers]
    )

# Finally, output the records ordered by extension
with open('output.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(output_headers)
    # Write the new employee rows, sorted by extension key.
    # If a field doesn't exist, write an empty string.
    for ext in sorted(all_employees):
        writer.writerow(
            [all_employees[ext].get(field, '') for field in output_headers]
        )
outputs:
Name,Extension,Job,Favorite Color,Book
Bill,1111,plumber,,
Alice,2222,fisherman,blue,A Secret Garden
Carl,3333,rodeo clown,green,To Kill a Mockingbird
Let me know if you have any questions!
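Note that the code above targets Python 2 (the dict.items() concatenation and the 'wb' CSV mode). On Python 3, those two version-specific spots would change roughly as follows (a sketch):

# merge the two dicts; items() views can't be concatenated in Python 3
all_employees[ext] = {**all_employees[ext], **social_employee}

# csv files are opened in text mode with newline='' in Python 3
with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)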
You could try:
for row in social_employee:
    employee = all_employees.get(row['Extension'], None)
    if employee is not None:
        # update the employee's own record with the survey fields
        employee['additionalinfo1'] = row['additionalinfo1']
        employee['additionalinfo2'] = row['additionalinfo2']
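For completeness, if pandas is an option, a left merge on the shared Extension column does the same join in a few lines (a sketch, assuming headers matching the samples above):

import pandas as pd

employees = pd.read_csv('employees.csv')
social = pd.read_csv('social.csv')
# keep every employee; survey columns are left empty for non-respondents
merged = employees.merge(social, on='Extension', how='left')
merged.to_csv('output.csv', index=False)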
