I am writing a parser that goes through a list of data that is roughly formatted:
{
"teachers": [
{
"fullName": "Testing",
"class": [
{
"className": "Counselor",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
}
]
},
...
}
The parser is supposed to check for duplicate names within this json object, and when it stumbles upon said duplicate name, append the class to the class array.
So for example:
{
"teachers": [
{
"fullName": "Testing",
"class": [
{
"className": "Counselor",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
}
]
},
{
"fullName": "Testing",
"class": [
{
"className": "Math 8",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
}
]
},
...
}
Would return
{
"teachers": [
{
"fullName": "Testing",
"class": [
{
"className": "Counselor",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
},
{
"className": "Math 8",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
},
]
},
...
}
My current parser works just fine for most objects, however for some reason it doesn't catch some of the duplicates despite the names being the exact same, and also is appending the string
}7d-48d6-b0b5-3d44ce4da21c"
}
}
]
}
]
to the end of the json document. I am not sure why it would do this considering I am just dumping the modified json (which only is modified within the array).
My parser code is:
i_duplicates = []
name_duplicates = []


def converter():
    """Merge teachers that share a ``fullName`` inside ``final2.json``.

    The document is read completely, every teacher whose ``fullName`` was
    already seen is folded into the first teacher with that name (all of its
    ``class`` entries are appended in file order), and the result is written
    back once, only when at least one duplicate was merged.

    Fixes over the previous implementation:

    * Entries are no longer popped from the list while it is being
      enumerated. That skipped the element following every removed duplicate
      and left the remembered indexes pointing at the wrong teachers.
    * The file is rewritten in ``"w"`` mode, so no bytes of the old, longer
      document survive after the end of the new one.
    * Every class of a duplicate is carried over unchanged instead of
      rebuilding only its first class with a hard-coded school id.
    * Both file handles are closed deterministically.

    The module-level ``i_duplicates`` and ``name_duplicates`` lists still
    describe the unique names that were found; they are refreshed in place on
    each call, and ``index`` is the teacher's position in the merged list.

    Raises:
        KeyError: if the document has no ``"teachers"`` key or a teacher has
            no ``"fullName"``.
    """
    with open("final2.json", "r", encoding="utf-8") as file:
        infinite = json.load(file)

    teachers = infinite["teachers"]

    # dicts keep insertion order, so the first occurrence of each name also
    # fixes that teacher's position in the output.
    merged = {}
    for teacher in teachers:
        name = teacher["fullName"]
        kept = merged.get(name)
        if kept is None:
            merged[name] = teacher
        else:
            kept.setdefault("class", []).extend(teacher.get("class", []))

    # Slice assignment mutates the existing list objects, so other modules
    # holding a reference to them observe the refreshed content.
    i_duplicates[:] = [
        {"fullName": name, "index": position}
        for position, name in enumerate(merged)
    ]
    name_duplicates[:] = [{"fullName": name} for name in merged]

    if len(merged) == len(teachers):
        return  # nothing was merged; leave the file untouched

    infinite["teachers"] = list(merged.values())
    # "w" truncates the file before writing the (shorter) merged document.
    with open("final2.json", "w", encoding="utf-8") as file:
        json.dump(infinite, file, indent=4)
def search(a, t):
    """Return the ``"index"`` of the first entry in *a* named *t*.

    *a* is iterated in order; each entry is expected to be a mapping with
    ``"fullName"`` and ``"index"`` keys. When no entry matches, a red
    "not found" message is printed and ``None`` is returned implicitly.
    """
    no_match = object()  # sentinel, so a stored index of None still counts as a hit
    hit = next(
        (entry["index"] for entry in a if entry["fullName"] == t),
        no_match,
    )
    if hit is not no_match:
        return hit
    print(Fore.RED + "not found" + Fore.RESET)
I know I am going about this inefficiently, but I am not sure how to fix the issues the current algorithm is having. Any feedback appreciated.
Related
Hi, I'm pretty new to coding and I'm trying to write a Python program that reads a JSON file and saves part of its data (not everything, just what I want) to another file. I googled how to parse the data, but there's something I don't understand.
that's a part of the json file:
`
{
"profileRevision": 548789,
"profileId": "campaign",
"profileChangesBaseRevision": 548789,
"profileChanges": [
{
"changeType": "fullProfileUpdate",
"profile": {
"_id": "2da4f079f8984cc48e84fc99dace495d",
"created": "2018-03-29T11:02:15.190Z",
"updated": "2022-10-31T17:34:43.284Z",
"rvn": 548789,
"wipeNumber": 9,
"accountId": "63881e614ef543b2932c70fed1196f34",
"profileId": "campaign",
"version": "refund_teddy_perks_september_2022",
"items": {
"8ec8f13f-6bf6-4933-a7db-43767a055e66": {
"templateId": "Quest:heroquest_loadout_constructor_2",
"attributes": {
"quest_state": "Claimed",
"creation_time": "min",
"last_state_change_time": "2019-05-18T16:09:12.750Z",
"completion_complete_pve03_diff26_loadout_constructor": 300,
"level": -1,
"item_seen": true,
"sent_new_notification": true,
"quest_rarity": "uncommon",
"xp_reward_scalar": 1
},
"quantity": 1
},
"6940c71b-c74b-4581-9f1e-c0a87e246884": {
"templateId": "Worker:workerbasic_sr_t01",
"attributes": {
"gender": "2",
"personality": "Homebase.Worker.Personality.IsDreamer",
"level": 1,
"item_seen": true,
"squad_slot_idx": -1,
"portrait": "WorkerPortrait:IconDef-WorkerPortrait-Dreamer-F02",
"building_slot_used": -1,
"set_bonus": "Homebase.Worker.SetBonus.IsMeleeDamageLow"
}
}
}
]
}
`
I can access profileChanges. I wrote this to create another json file with only the profileChanges things:
`
# Read the source document. The context manager closes the handle, which the
# previous bare open() call never did.
with open("file.json", 'r') as myjsonfile:
    jsondata = myjsonfile.read()
obj = json.loads(jsondata)

# Show every entry stored under "profileChanges".
ciso = obj['profileChanges']
for i in ciso:
    print(i)

# Write the extracted value to the second file once, after the loop.
with open("file2", "w") as outfile:
    json.dump(ciso, outfile, indent=1)
The issue I have is that I can't access "profile" (inside profileChanges) in the same way when parsing the new file, and I have no idea how to do it.
Elements of parsed JSON are accessed by list index (for arrays) or by key (for objects); see the example below:
a = [
    {
        "friends": [
            {
                "id": 0,
                "name": "Reba May",
            }
        ],
        "greeting": "Hello, Doris Gallagher! You have 2 unread messages.",
        "favoriteFruit": "strawberry",
    },
]
# `a` is a list, so select an element by position first. "friends" is again
# a list of dicts, so it must be indexed before the "id" key can be read.
b = a[0]["friends"][0]["id"]  # b == 0
I've added a couple of closing braces to make your snippet valid json:
s = '''{
"profileRevision": 548789,
"profileId": "campaign",
"profileChangesBaseRevision": 548789,
"profileChanges": [
{
"changeType": "fullProfileUpdate",
"profile": {
"_id": "2da4f079f8984cc48e84fc99dace495d",
"created": "2018-03-29T11:02:15.190Z",
"updated": "2022-10-31T17:34:43.284Z",
"rvn": 548789,
"wipeNumber": 9,
"accountId": "63881e614ef543b2932c70fed1196f34",
"profileId": "campaign",
"version": "refund_teddy_perks_september_2022",
"items": {
"8ec8f13f-6bf6-4933-a7db-43767a055e66": {
"templateId": "Quest:heroquest_loadout_constructor_2",
"attributes": {
"quest_state": "Claimed",
"creation_time": "min",
"last_state_change_time": "2019-05-18T16:09:12.750Z",
"completion_complete_pve03_diff26_loadout_constructor": 300,
"level": -1,
"item_seen": true,
"sent_new_notification": true,
"quest_rarity": "uncommon",
"xp_reward_scalar": 1
},
"quantity": 1
},
"6940c71b-c74b-4581-9f1e-c0a87e246884": {
"templateId": "Worker:workerbasic_sr_t01",
"attributes": {
"gender": "2",
"personality": "Homebase.Worker.Personality.IsDreamer",
"level": 1,
"item_seen": true,
"squad_slot_idx": -1,
"portrait": "WorkerPortrait:IconDef-WorkerPortrait-Dreamer-F02",
"building_slot_used": -1,
"set_bonus": "Homebase.Worker.SetBonus.IsMeleeDamageLow"
}
}
}
}
}
]
}
'''
# Parse the JSON text, then walk down: first change entry -> its profile ->
# the profile's "version" value.
d = json.loads(s)
first_change = d['profileChanges'][0]
profile = first_change['profile']
print(profile['version'])
This prints refund_teddy_perks_september_2022
Explanation:
d is a dict
d['profileChanges'] is a list of dicts
d['profileChanges'][0] is the first dict in the list
d['profileChanges'][0]['profile'] is a dict
d['profileChanges'][0]['profile']['version'] is the value of version key in the profile dict in the first entry of the profileChanges list.
I have written some recursive code. I would like more experienced people to tell me how resilient and fail-safe my code is:
I have a json file (Json file can be as big as 300MB):
[
{
"modules": {
"webpages": []
},
"webpages": {
"ip_addr": {
"value": "127.0.0.1",
"tags": []
},
"http": {
"status": {
"value": "Unavailable",
"tags": []
},
"title": {
"value": "403 Forbidden",
"tags": [
{
"category": "Server Code",
"match": "403"
},
{
"category": "Interesting Words",
"match": "Forbidden"
}
]
},
"server": {
"value": "Apache",
"tags": [
{
"category": "Apache Server",
"match": "Apache"
}
]
}
},
"redirects": [],
"robottxt": null
}
},
{
"modules": {
"webpages": []
}
}
]
I want to return value keys where tags are populated.
So I want to ignore:
"status": {
"value": "Unavailable",
"tags": []
},
But I want to return the title and server values. I also want to return ip_addr.value
I have written this code:
def getAllValues(nestedDictionary, firstArray=None, firstObj=None, firstUseful=None):
    """Collect every nested ``value`` whose sibling ``tags`` list is non-empty.

    The dictionary is walked recursively. For each nested dict a fresh record
    ``{"Key": <its key>, "useful": <ip info>}`` is created; while that dict is
    processed, its ``"value"`` entry is copied into the record and, if its
    ``"tags"`` entry is a non-empty list, the tags are copied too and the
    record is appended to the result.

    When a dict has an ``"ip_addr"`` child, ``{"ip_addr": <its value>}``
    becomes the ``"useful"`` payload for that dict's children and everything
    below them.

    Args:
        nestedDictionary: the dict to walk.
        firstArray: list that receives the records; a new list is created
            when omitted.
        firstObj: record that receives ``"value"``/``"tags"`` found directly
            in *nestedDictionary*; a new dict is created when omitted.
        firstUseful: ``"useful"`` payload to use until an ``"ip_addr"`` entry
            is found; passed through unchanged.

    Returns:
        The list of collected records (the same object as *firstArray* when
        one was supplied).

    Changes from the previous version: the payload key is ``"ip_addr"``
    (it was misspelled ``"ip_add"``, contradicting the documented output),
    the ``ip_addr`` lookup is done once instead of on every loop iteration,
    a non-dict ``ip_addr`` no longer raises, and the accumulator arguments
    are optional.
    """
    returnedArray = [] if firstArray is None else firstArray
    tempValue = {} if firstObj is None else firstObj
    useful = firstUseful

    # The lookup does not depend on the loop variable, so do it once.
    ipString = nestedDictionary.get("ip_addr")
    if isinstance(ipString, dict):
        useful = {"ip_addr": ipString.get("value")}

    for key, value in nestedDictionary.items():
        if isinstance(value, dict):
            temp = {"Key": key, "useful": useful}
            getAllValues(value, returnedArray, temp, useful)
        elif key == "value":
            tempValue["value"] = value
        elif key == "tags" and isinstance(value, list) and value:
            tempValue["tags"] = value
            # The record object itself is appended, so a "value" key that
            # follows "tags" in the dict still ends up in the result.
            returnedArray.append(tempValue)
    return returnedArray
The above code should return:
[
{
"Key": "title",
"value": "403 Forbidden",
"useful": { "ip_addr": "127.0.0.1" },
"tags": [
{
"category": "Server Code",
"match": "403"
},
{
"category": "Interesting Words",
"match": "Forbidden"
}
]
},
{
"Key": "server",
"value": "Apache",
"useful": { "ip_addr": "127.0.0.1" },
"tags": [
{
"category": "Apache Server",
"match": "Apache"
}
]
}
]
It's a long post, but hopefully someone can give me some assurance :)
This is the json structure that I am having inside of a python file. Here the stationList_of_state is a python list which has some 5-10 values which will change dynamically based on the code.
# A generic template with one element, holding a single postback button that
# is labelled with (and sends back) the entry at index 1 of
# stationList_of_state.
station_button = {
    "title": stationList_of_state[1],
    "payload": stationList_of_state[1],
    "type": "postback",
}
message = {
    "type": "template",
    "payload": {
        "template_type": "generic",
        "elements": [{"buttons": [station_button]}],
    },
}
I have tried something like this which showed errors:
# NOTE: this attempt is not valid Python. A `for` statement cannot appear
# inside a list display; building one element per station requires a list
# comprehension, where the element expression comes first and the `for`
# clause follows it.
message = {
"type": "template",
"payload": {
"template_type": "generic",
"elements":
[
for i in range(len(stationList_of_state)):
{
"buttons": [
{
"title": stationList_of_state[i],
"payload": stationList_of_state[i],
"type": "postback"
}
]
}
]
}
}
Can someone suggest an alternate approach to what I have done?
You're almost there:
# Build one element per index of stationList_of_state. Each element carries
# a single postback button whose title and payload are that station.
elements = [
    {
        "buttons": [
            {
                "title": stationList_of_state[i],
                "payload": stationList_of_state[i],
                "type": "postback",
            }
        ]
    }
    for i in range(len(stationList_of_state))
]
message = {
    "type": "template",
    "payload": {"template_type": "generic", "elements": elements},
}
or, simplifying the for clause to omit the unnecessary i variable,
# Same structure, iterating over the stations directly: one element with a
# single postback button per station.
payload = {
    "template_type": "generic",
    "elements": [
        {"buttons": [{"title": station, "payload": station, "type": "postback"}]}
        for station in stationList_of_state
    ],
}
message = {"type": "template", "payload": payload}
I'm trying to go through a JSON by using python but I can't access the "mbid" node. I want to print only the first "mbid" node.
Here is my function :
def get_data():
    """Read the first ``mbid`` of the first performance of every calendar entry.

    Uses the module-level ``data`` mapping. The value is only bound to a
    local name; nothing is returned. An empty ``performance`` or
    ``identifier`` list raises ``IndexError``.
    """
    newJsonx = {}
    entries = data["resultsPage"]["results"]["calendarEntry"]
    for item in entries:
        first_performance = item["event"]["performance"][0]
        identifiers = first_performance["artist"]["identifier"]
        mbid = identifiers[0]["mbid"]
With this function i get this error : IndexError: list index out of range
but when I'm doing
def get_data():
    """Read the ``identifier`` list of the first performance of every entry.

    Uses the module-level ``data`` mapping. The list is only bound to a
    local name; nothing is returned. An empty ``performance`` list raises
    ``IndexError``.
    """
    newJsonx = {}
    calendar = data["resultsPage"]["results"]["calendarEntry"]
    for item in calendar:
        artist = item["event"]["performance"][0]["artist"]
        mbid = artist["identifier"]
And print(mbid), I'm getting a correct answer :
"identifier": [
{
"mbid": "6655955b-1c1e-4bcb-84e4-81bcd9efab30"
},
{
"mbid": "1b1b1b1b-1c1d"
}
]
So this means I don't have a problem with the data. Maybe I'm doing something wrong with the second array?
Here is an example of the JSON structure :
{
"resultsPage": {
"status": "ok",
"results": {
"calendarEntry": [
{
"reason": {
},
"event": {
"performance": [
{
"id": 72641494,
"displayName": "Arnalds",
"artist": {
"id": 590465,
"identifier": [
{
"mbid": "6655955b-1c1e-4bcb-84e4-81bcd9efab30"
},
{
"mbid": "1b1b1b1b-1c1d"
}
]
}
}
]
}
}
]
}
}
}
Thanks for your time
def get_data():
    """Read the first ``mbid`` of each calendar entry, skipping empty lists.

    Uses the module-level ``data`` mapping. Entries whose ``performance``
    list is empty, or whose first performance has an empty ``identifier``
    list, are skipped instead of raising ``IndexError``. The value is only
    bound to a local name; nothing is returned.

    Fix: the guard and the lookup previously referred to the misspelled name
    ``performace``, so the function raised ``NameError`` on the first entry.
    """
    newJsonx = dict()
    for item in data["resultsPage"]["results"]["calendarEntry"]:
        performance = item["event"]["performance"]
        if performance:
            identifier = performance[0]["artist"]["identifier"]
            if identifier:
                mbid = identifier[0]["mbid"]
There are 18 records under a company but I can see only 3 records from them. Below is my query and the python code.
{
"query": {
"nested": {
"inner_hits": {
"_source": [
"name",
"country",
"_matched_experiences.role"
]
},
"path": "socials",
"query": {
"match": { "socials._has_email": "true"
}
}
}
},
"_source": [
"com_name"
]
}
Below is my python code.
# Python 2 snippet (it uses the print statement) that scans an Elasticsearch
# index and prints one line per nested inner hit. The header row is written
# to the CSV file; the per-record CSV write further down is commented out.
with open(OUTPUT_FILENAME_1, "a") as f1:
csv_writer_1 = csv.writer(f1)
csv_writer_1.writerow(["company_name","name","country","role"])
# `{above query}` is a placeholder for the query shown earlier, not valid Python.
query_dictionary = {above query}
scroll = elasticsearch.helpers.scan(es, query=query_dictionary, index=companydirectory, scroll='60m', size=800)
for res in scroll:
try:
record_fields = res["_source"]
name = ""
com_name = ""
company_name = record_fields.get("com_name") #from the ES
# Nested matches are read from the "social_contacts" entry under inner_hits.
name_record_fields = res["inner_hits"]["social_contacts"]["hits"]["hits"]
for j in name_record_fields:
name = j['_source']['name'] #from ES
k = j['_source']['_matched_experiences']
# Only the first matched experience is used.
role = k[0].get('role')
country = j['_source']['country']
# NOTE(review): validated_email_fromES, function_id and level_id are not
# assigned anywhere in this snippet. If they are undefined, the resulting
# NameError is swallowed by the except clause below. Confirm against the
# full script.
print company_name,name,validated_email_fromES, function_id,level_id, country, company_name,role
# csv_writer_1.writerow([company_name.encode('utf8'),name.encode('utf8'),country,role.encode('utf8')])
# NOTE(review): every exception raised for a result is silently discarded here.
except Exception as e1:
pass
This is the sample output from ES :
"_source": {
"company_name": "Rothborns"
},
"inner_hits": {
"social_contacts": {
"hits": {
"total": 18,
"max_score": 9.87977,
"hits": [
{
"_type": "comp_directory",
"_id": "MC9MY",
"_nested": {
"field": "socials",
"offset": 36
},
"_score": 9.787,
"_source": {
"country": "SA",
"name": "warner Pauli",
"_has_email": true,
"_matched_experiences": [
{
"role": "Financial Controller"
}
]
}
The total record is 18 in Rothborns. But I can get only 3 records from Rothborns in the output file.
Kindly help. Thanks.
The reason is that inner_hits returns only 3 hits per parent document by default (its size defaults to 3). You simply need to raise the size in your query, like this:
{
"query": {
"nested": {
"inner_hits": {
"size": 100, <--- add this
"_source": [
"name",
"country",
"_matched_experiences.role"
]
},
"path": "socials",
"query": {
"match": {
"socials._has_email": "true"
}
}
}
},
"_source": [
"com_name"
]
}