Nested Counter for json data - python

I have a JSON data as:
{
  "persons": [
    {
      "city": "Seattle",
      "name": "Brian",
      "dob": "19-03-1980"
    },
    {
      "city": "Amsterdam",
      "name": "David",
      "dob": "19-09-1979"
    },
    {
      "city": "London",
      "name": "Joe",
      "dob": "19-01-1980"
    },
    {
      "city": "Kathmandu",
      "name": "Brian",
      "dob": "19-03-1980"
    }
  ]
}
How can I count the individual elements in one single iteration using Python — for example, the number of persons born in each month Jan–Dec (0 if none were born) and the number born in a given year? Also the number of unique names registered in each month.
Like:
1980 :3
--Jan:1
--Mar:2
1979 :1
--Sep:1
Names:
Mar 1980: 1 #Brian is same for both cities
Jan 1980: 1
Sep 1979: 1
counters_mon is the counter that has values for specific months of year
for k_mon,v_mon in counters_mon.items():
print('{}={}'.format(k_mon,v_mon))
But I want details too to be printed. How can I achieve this?

import json

# Load the people records; `with` guarantees the file handle is closed
# (the original opened it and never closed it).
with open('/path/to/your/json', 'r') as f:
    persons = json.load(f)

years_months = {}        # year -> {'count': total born, 'months': {month: count}}
years_months_names = {}  # 'MM YYYY' -> set of distinct names seen that month

for person in persons['persons']:
    # dob format is DD-MM-YYYY, so slice out the month and year pieces.
    year = person['dob'][-4:]
    month = person['dob'][3:5]
    month_year = month + ' ' + year
    name = person['name']

    # setdefault provisions the year entry on first sight; this collapses
    # the original's duplicated if/else month-increment branches into one.
    entry = years_months.setdefault(year, {'count': 0, 'months': {}})
    entry['count'] += 1
    entry['months'][month] = entry['months'].get(month, 0) + 1

    # Track unique names per month/year (a set de-duplicates repeats).
    years_months_names.setdefault(month_year, set()).add(name)

for k, v in years_months.items():
    print(k + ': ' + str(v['count']))
    for month, count in v['months'].items():
        print("-- " + str(month) + ": " + str(count))

for k, v in years_months_names.items():
    print(k + ": " + str(len(v)))
I'm assuming that you have the path to your json. I also tested my answer on the JSON that you've posted, and be careful to make sure that your JSON is structured correctly.

This is a good case for using defaultdicts (https://docs.python.org/3/library/collections.html#collections.defaultdict).
data # assume you have your data in a var called data
from collections import defaultdict
from calendar import month_abbr
# Two nested defaultdict levels with lists at the leaves, so after
# population we have: aggregate[year][month] = [name1, name2, ...]
aggregate = defaultdict(lambda: defaultdict(list))

# Population is then a single append per person.
for person in data['persons']:
    day, month, year = map(int, person['dob'].split('-'))
    aggregate[year][month].append(person['name'])

# Print in chronological order.
for year, months in sorted(aggregate.items()):
    total = sum(len(names) for names in months.values())
    print(f'{year}: {total}')
    for month, names in sorted(months.items()):
        print(f'--{month_abbr[month]}: {len(names)}')

# Unique-name counts per month/year (set() removes duplicate names).
for year, months in sorted(aggregate.items()):
    for month, names in sorted(months.items()):
        print(f'{month_abbr[month]} {year}: {len(set(names))}')
Depending on how the data was going to be used I'd actually consider not having the complex nesting in the aggregation and instead opt for something like aggregate[(year, month)] = [name1, name2,...]. I find that the more nested my data, the more confusing it is to work with.
EDIT Alternatively you can create several structures on the first pass so the printing step is simplified. Again, I'm using defaultdict to clean up all the provisioning.
# Aggregate everything on the first pass into three flat structures so the
# printing loops below need no extra computation.
year_month_counts = defaultdict(lambda: defaultdict(int))  # [year][month] = counter
year_totals = defaultdict(int)                             # [year] = counter
names_by_year_month = defaultdict(set)                     # [(year, month)] = set(name1, name2...)

for person in data['persons']:
    day, month, year = map(int, person['dob'].split('-'))
    year_month_counts[year][month] += 1
    year_totals[year] += 1
    names_by_year_month[(year, month)].add(person['name'])

for year, months in sorted(year_month_counts.items()):
    print(f'{year}: {year_totals[year]}')
    for month, quant in sorted(months.items()):
        print(f'--{month_abbr[month]}: {quant}')

for (year, month), names in sorted(names_by_year_month.items()):
    print(f'{month_abbr[month]} {year}: {len(names)}')

Related

Convert complex comma-separated string into Python dictionary

I am getting following string format from csv file in Pandas
"title = matrix, genre = action, year = 2000, rate = 8"
How can I change the string value into a python dictionary like this:
movie = "title = matrix, genre = action, year = 2000, rate = 8"
movie = {
"title": "matrix",
"genre": "action",
"year": "2000",
"rate":"8"
}
You can split the string and then convert it into a dictionary.
A sample code is given below
movie = "title = matrix, genre = action, year = 2000, rate = 8"
# Break the string into "key = value" chunks, split each chunk on "=",
# and strip the surrounding whitespace while building the dict.
chunks = [chunk.split("=") for chunk in movie.split(",")]
movie = {parts[0].strip(): parts[1].strip() for parts in chunks}
print(movie)
For the solution you can use regex
import re

input_user = "title = matrix, genre = action, year = 2000, rate = 8"

# Create a pattern to match the key-value pairs.
# The original used ([\w,]+) for the value, which wrongly swallowed the
# trailing comma separator (producing 'matrix,' etc.); \w+ stops at the comma.
pattern = re.compile(r"(\w+) = (\w+)")

# Find all matches in the input string: a list of (key, value) tuples.
matches = pattern.findall(input_user)

# findall already yields pairs, so dict() builds the result directly.
result = dict(matches)
print(result)
The result:
{'title': 'matrix,', 'genre': 'action,', 'year': '2000,', 'rate': '8'}
I hope this can solve your problem.
movie = "title = matrix, genre = action, year = 2000, rate = 8"
dict_all_movies = {}
for idx in df.index:
    raw_movie = df.at[idx, str_movie_column]
    # "a = b, c = d" -> {'a': 'b', 'c': 'd'} for this row, keyed by row index
    dict_all_movies[str(idx)] = dict(field.split(" = ") for field in raw_movie.split(", "))

Storing data from json file using python - adding tuple of data to dictionary

I have a json file with lots of claims like this and I'm trying to loop through the file and for each unique year in reviewDate, store every unique claimant with a counter for how often it appears
{
"text": "“This president, though, for immigrants, there is nothing he will not do to separate a family, cage a child, or erase their existence by weaponizing the census.\"",
"claimant": "Eric Swalwell",
"claimDate": "2019-06-27T00:00:00Z",
"claimReview": [
{
"publisher": {
"name": "PolitiFact",
"site": "politifact.com"
},
"url": "https://www.politifact.com/article/2019/jun/28/fact-checking-2nd-night-democratic-debate-miami/",
"title": "Fact-checking the 2nd night of the Democratic debate in Miami",
"reviewDate": "2019-06-28T16:49:26Z",
"textualRating": "Frequent attack needs context",
"languageCode": "en"
}
]
},
I have this script right now, but it just adds a new entry for every single claim instead of finding the claimant in the dictionary and incrementing its counter
def split_by_year(data):
    """Tally, per review year, how often each claimant appears.

    NOTE(review): as posted this produces the inconsistent structure the
    asker describes. The first claimant seen for a year is stored flat as
    [claimant, 1]; later claimants are appended as nested [claimant, 1]
    lists; and an existing claimant's counter is never correctly found and
    incremented (see inline notes).
    """
    year_dict = {}       # year -> list mixing a claimant string, a count, and [claimant, count] pairs
    claimant_dict = {}   # unused in the visible code
    counter = 0          # only grows inside the (suspect) increment branch
    # for every claim in the file
    for claim in data['claims']:
        # placeholder for year & claimant
        year = ''
        claimant = ''
        if 'claimant' in claim:
            claimant = claim['claimant']
        # the reviewDate is in the review so we go into it
        for review in claim['claimReview']:
            # if the review date exists
            if 'reviewDate' in review.keys():
                # get the year (first 4 chars of the ISO timestamp)
                year = review['reviewDate'][0:4]
                if year in year_dict:
                    # loop through to find the claimant
                    # NOTE(review): this membership test only matches the flat
                    # first-claimant string, never the nested [claimant, 1] lists.
                    if claimant in year_dict[year]:
                        counter += 1
                        # NOTE(review): increments index 1 of the year's list
                        # (the first claimant's count), not this claimant's count.
                        year_dict[year][1] += 1
                    else:
                        # claimant doesnt exist
                        year_dict[year].append([claimant, 1])
                else:
                    # year not in year_dict. Add w/ counter
                    year_dict[year] = [claimant, 1]
This is the current output
'2019': ['Eric Swalwell',
3,
['Ted Budd', 1],
['Donald Trump', 1],
['Henry Cuellar', 1],
['Mike Pence', 1],
['Mike Pence', 1],
['Michael Bennet', 1],
['Facebook posts', 1],
['Donald Trump', 1],
['Mark Walker', 1],
I'm not sure how to properly add the claimant with a counter under each year. And then also the check if the claimant has already been added, to increment the counter
from collections import defaultdict, Counter
def split_by_year(data):
    """Return {year: Counter} mapping each review year to per-claimant counts.

    Claims without a 'claimant' key and reviews without a 'reviewDate' key
    are skipped entirely.
    """
    year_dict = defaultdict(Counter)
    for claim in data['claims']:
        if 'claimant' not in claim:
            continue  # nothing to count for this claim
        for review in claim['claimReview']:
            if 'reviewDate' in review:
                # first four characters of the ISO timestamp are the year
                year_dict[review['reviewDate'][0:4]][claim['claimant']] += 1
    return year_dict
result = split_by_year(data)  # `data` is the parsed claims JSON, loaded elsewhere
print(result['2019']["Eric Swalwell"])  # how many 2019 reviews name this claimant

Creating a column for each output received in one field in Python

I am implementing an emotion analysis using lstm method where I have already done my training model as well as my prediction part. but my prediction is appearing in one column.. I will show you below.
Here are my codes:
# (question code) Persist the scraped data, then build one flat row per review.
with open('output1.json', 'w') as f:
    json.dump(new_data, f)

selection1 = new_data['selection1']

# creating empty lists to be able to create a dataframe
names = []
dates = []
commentss = []
labels = []
hotelname = []

for item in selection1:
    name = item['name']
    hotelname.append(name)
    # print ('>>>>>>>>>>>>>>>>>> ', name)
    Date = item['reviews']
    # First pass over this hotel's reviews: collect the name and date per review.
    for d in Date:
        names.append(name)
        # convert date from 'january 12, 2020' to 2020-01-02
        date = pd.to_datetime(d['date']).strftime("%Y-%m-%d")
        # adding date to the empty list dates[]
        dates.append(date)
        # print('>>>>>>>>>>>>>>>>>> ', date)
    CommentID = item['reviews']
    # Second pass over the SAME review list: clean the text and predict emotions.
    # names/dates and commentss/labels stay aligned because both passes walk
    # the same list in order.
    for com in CommentID:
        comment = com['review']
        lcomment = comment.lower()  # converting all to lowercase
        result = re.sub(r'\d+', '', lcomment)  # remove numbers
        results = (result.translate(
            str.maketrans('', '', string.punctuation))).strip()  # remove punctuations and white spaces
        comments = remove_stopwords(results)
        # NOTE(review): the RAW comment is appended here, while the cleaned
        # `comments` is what feeds the model below — confirm that is intended.
        commentss.append(comment)
        # print('>>>>>>',comments)
        # add the words in comments that are already present in the keys of dictionary
        encoded_samples = [[word2id[word] for word in comments if word in word2id.keys()]]
        # Padding
        encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)
        # Make predictions
        label_probs, attentions = model_with_attentions.predict(encoded_samples)
        label_probs = {id2label[_id]: prob for (label, _id), prob in zip(label2id.items(), label_probs[0])}
        labels.append(label_probs)

# creating dataframe
dataframe = {'name': names, 'date': dates, 'comment': commentss, 'classification': labels}
table = pd.DataFrame(dataframe, columns=['name', 'date', 'comment', 'classification'])
# NOTE(review): this rebinds the name `json`, shadowing the json module; and
# DataFrame.to_json returns None when given a path, so `json` ends up None.
json = table.to_json('hotel.json', orient='records')
here is the results i obtain:
[
{
"name": "Radisson Blu Azuri Resort & Spa",
"date": "February 02, 2020",
"comment": [
"enjoy",
"daily",
"package",
"start",
"welcoming",
"end",
"recommend",
"hotel"
],
"label": {
"joy": 0.0791392997,
"surprise": 0.0002606699,
"love": 0.4324670732,
"sadness": 0.2866959572,
"fear": 0.0002588668,
"anger": 0.2011781186
}
},
you can find the complete output on this link: https://jsonblob.com/a9b4035c-5576-11ea-afe8-1d95b3a2e3fd
Is it possible to break the label field into separate fields like below??
[
{
"name": "Radisson Blu Azuri Resort & Spa",
"date": "February 02, 2020",
"comment": [
"enjoy",
"daily",
"package",
"start",
"welcoming",
"end",
"recommend",
"hotel"
],
"joy": 0.0791392997,
"surprise": 0.0002606699,
"love": 0.4324670732,
"sadness": 0.2866959572,
"fear": 0.0002588668,
"anger": 0.2011781186
},
Can someone please explain how I need to modify my code to make this possible?
If you can't do it before you produce the result, you can easily manipulate that dictionary like so:
def move_labels_to_dict_root(result):
    """Return a copy of *result* with the nested "labels" mapping flattened
    into the top level.

    The original version aliased the input (``meta_data = result``) and then
    deleted the "labels" key from the CALLER's dict as a side effect; building
    a new dict avoids mutating the argument.

    NOTE(review): the sample JSON in the question uses the key "label",
    not "labels" — confirm which key the real data carries.
    """
    labels = result["labels"]
    meta_data = {key: value for key, value in result.items() if key != "labels"}
    return {**meta_data, **labels}
and then call move_labels_to_dict_root in a list comprehension like [move_labels_to_dict_root(result) for result in results].
However, I would ask why you want to do this?

Pandas - DateTime groupby to structured dict

I have a dataset which contains a DateTime field. I need to group by hours and dispatch each group to a dictionary with the following structure:
{year_1:
{month_1:
{week_1:
{day_1:
{hour_1: df_1, hour_2: df_2}
}
},
{week_2:
{day_1:
{hour_1: df_1}
}
}
},
{month_3:
{week_1:
{day_1:
{hour_1: df_1, hour_2: df_2}
}
}
},
year_2:
{month_5:
{week_1:
{day_1:
{hour_2: df_2}
}
}
}
}
To do that I am using the following code:
import pandas as pd

# NOTE(review): the doubled "df = df =" is a harmless chained assignment,
# presumably a paste typo. pd.datetime is deprecated in newer pandas
# (use datetime.datetime directly) — TODO confirm the pandas version in use.
df = df = pd.DataFrame({'date': [pd.datetime(2015,3,17,2), pd.datetime(2014,3,24,3), pd.datetime(2014,3,17,4)], 'hdg_id': [4041,4041,4041],'stock': [1.0,1.0,1.0]})

# Derive one column per datetime component so groupby can key on them.
df.loc[:,'year'] = [x.year for x in df['date']]
df.loc[:,'month'] = [x.month for x in df['date']]
df.loc[:,'week'] = [x.week for x in df['date']]
df.loc[:,'day'] = [x.day for x in df['date']]
df.loc[:,'hour'] = [x.hour for x in df['date']]

result = {}
for to_unpack, df_hour in df.groupby(['year','month','day','week','hour']):
    # NOTE(review): the unpack order (year, month, week, day, hour) does NOT
    # match the groupby key order (..., 'day', 'week', ...), so the values
    # bound to `week` and `day` are swapped — confirm the intended order.
    year, month, week, day, hour = to_unpack
    # Provision each nesting level on first sight — the "brute force"
    # try/except pyramid the asker wants to avoid.
    try:
        result[year]
    except KeyError:
        result[year] = {}
    try:
        result[year][month]
    except KeyError:
        result[year][month] = {}
    try:
        result[year][month][week]
    except KeyError:
        result[year][month][week] = {}
    try:
        result[year][month][week][day]
    except KeyError:
        result[year][month][week][day] = {}
    result[year][month][week][day][hour] = df_hour
As you can see this is pretty much a brute-force solution and I was looking for something that looks more clean and understandable. Furthermore, it is also extremely slow. I tried different ways for grouping (Python Pandas Group by date using datetime data) and I also tried a multindex with each component of datetime (Pandas DataFrame with MultiIndex: Group by year of DateTime level values). However, the problem is always how to create the dict. Ideally, I would like just to write something like:
result[year][month][week][day][hour] = df_hour
but to the best of my knowledge, I first need to initialize each dict.
You need dict.setdefault
result = {}
for to_unpack, df_hour in df.groupby(['year','month','day','week','hour']):
    # Unpack in the SAME order as the groupby keys. The original unpacked
    # `year, month, week, day, hour`, silently swapping the day and week
    # values before nesting them.
    year, month, day, week, hour = to_unpack
    # setdefault provisions each level on first sight and returns the nested
    # dict, so the chain replaces the try/except pyramid in one expression.
    result.setdefault(year, {}) \
          .setdefault(month, {}) \
          .setdefault(week, {}) \
          .setdefault(day, {}) \
          .setdefault(hour, df_hour)
You can also subclass dict to do this
class Fict(dict):
    # A dict whose missing keys spring into existence as nested Ficts,
    # so arbitrarily deep assignment works without any provisioning.
    def __getitem__(self, item):
        if item not in self:
            self[item] = type(self)()
        return dict.__getitem__(self, item)
result = Fict()
for to_unpack, df_hour in df.groupby(['year','month','day','week','hour']):
    # NOTE(review): unpack order doesn't match the groupby key order
    # (day/week appear swapped) — confirm which order is intended.
    year, month, week, day, hour = to_unpack
    # Fict auto-creates every missing level, so no provisioning is needed.
    result[year][month][week][day][hour] = df_hour

Parsing through JSON file with python and selecting multiple values on certain conditions

I have this JSON file.
{
"reviewers":[
{
"user":{
"name":"keyname",
"emailAddress":"John#email",
"id":3821,
"displayName":"John Doe",
"active":true,
"slug":"jslug",
"type":"NORMAL",
"link":{
"url":"/users/John",
"rel":"self"
},
},
"role":"REVIEWER",
"approved":true
},
{
"user":{
"name":"keyname2",
"emailAddress":"Harry#email",
"id":6306,
"displayName":"Harry Smith",
"active":true,
"slug":"slug2",
"link":{
"type":"NORMAL",
"url":"/users/Harry",
"rel":"self"
},
},
"role":"REVIEWER",
"approved":false
}
],
}
Initially, I was using a snippet of code that would go through and grab the full names of the reviewers.
def get_reviewers(json):
    """Return all reviewers' display names joined as "Name1, Name2, ..."."""
    display_names = [entry["user"]["displayName"] for entry in json["reviewers"]]
    return ", ".join(display_names)
which would return "John Doe, Harry Smith". However, now I'm trying to get it so that the script will return a (A) next to the name of the user if their tag equals true "approved"=true.
So for example the code above would get the names, then see that John's approved tag is true and Harry's is false, then return "John Doe(A), Harry Smith". I'm just not sure where to even begin to do this. Can anyone point me in the right direction?
This is what I've been trying so far but obviously it isn't working as I'd like it to.
def get_reviewers(stash_json):
    """Asker's attempt at building "Name(A), Name" — the (A) test is wrong.

    NOTE(review): two problems visible here:
    1. `true` is not a Python name (it is the JSON spelling); as written
       this raises NameError unless `true` is defined elsewhere — the
       Python literal is True.
    2. The condition always inspects reviewer [0] rather than the reviewer
       currently being iterated (`key`), so every name gets the same
       suffix — matching the "everyone tagged (A)" output described below.
    """
    reviewers = ""
    for key in stash_json["reviewers"]:
        if stash_json["reviewers"][0]["approved"] == true:
            reviewers += key["user"]["displayName"] + "(A)" + ", "
        else:
            reviewers += key["user"]["displayName"] + ", "
    reviewers = reviewers[:-2]  # drop the trailing ", "
    return reviewers
which outputs Jason Healy(A), Joan Reyes(A)
This is what my stash_json outputs when put through pprint.
You probably want something along the lines of this:
def get_reviewers(stash_json):
    """Return display names, marking approved reviewers with "(A)".

    Each element of stash_json["reviewers"] is a dict holding both the
    "approved" flag and a nested "user" dict with the display name.
    """
    parts = []
    for item in stash_json["reviewers"]:
        suffix = "(A)" if item["approved"] else ""
        parts.append(item["user"]["displayName"] + suffix + ", ")
    # Concatenate, then drop the trailing ", " separator.
    return "".join(parts)[:-2]
I think part of your confusion comes from the fact that "reviewers" is a list of dict elements, and each dict element has a key-value approved, but also a key "user" which value itself is another dict.
Read the JSON file carefully, and for debugging purposes, use plenty of
print(...)
print(type(...)) # whether something is a dict, list, str, bool etc
or
from pprint import pprint # pretty printing
pprint(...)
This looks like a good place to use join and list comprehension:
def get_reviewers(stash_json):
    """Compact variant: comma-join the names, tagging approved ones with "(A)"."""
    def tag(item):
        # suffix is appended only when this reviewer approved
        return item['user']['displayName'] + ('(A)' if item['approved'] else '')
    return ", ".join(tag(item) for item in stash_json['reviewers'])

Categories

Resources