python json remove() element issue for a while condition - python

I have a JSON file in the following format. I need to get rid of every dictionary whose "URI" value does not contain "http":
[
{
"Exchange Name": "Standard & Poor's Indices",
"Number": 33.0,
"URI": "http://us.spindices.com/documents/additional-material/spdji-fixed-income-calendar-2016.xlsx",
"ref_id": "33_Standard_&_Poor's_Indices"
},
{
"Exchange Name": "ISE Mercury",
"Number": 36.0,
"URI": "follow OPRA",
"ref_id": "36_ISE_Mercury"
},
{
"Exchange Name": "Aequitas Neo",
"Number": 37.0,
"URI": "email for holidays",
"ref_id": "37_Aequitas_Neo"
},
{
"Exchange Name": "FINRA SPDS 144A",
"Number": 38.0,
"URI": "https://www.finra.org/industry/trace/trace-holiday-calendar",
"ref_id": "38_FINRA_SPDS_144A"
}
]
So far I am stuck with the following function. The problem is that remove() does not actually remove all of the offending elements on the first pass; only after I run the code a second time does it work. I think I need to use a while loop for this, but how do I implement it in this setting?
def sys_validate_data():
    with open('./data_out/uniq_set.json') as jf:
        json_decoded = json.load(jf)
    for ix in json_decoded:
        if "http" not in ix["URI"]:
            json_decoded.remove(ix)
    with open('./data_out/uniq_set.json', 'w') as fpw:
        json.dump(list(json_decoded), fpw, sort_keys=True, indent=4)

Don't modify a list while iterating over it. Doing so can produce unexpected behavior. Instead, you can use a list comprehension to filter elements from your JSON list:
def sys_validate_data():
    with open('./data_out/uniq_set.json') as jf:
        json_decoded = [ix for ix in json.load(jf) if "http" in ix["URI"]]
    ....

Use a list comprehension and don't alter a list while iterating:
validated_json = [entry for entry in json_decoded if entry['URI'].startswith('http')]
Extended example:
def sys_validate_data():
    with open('./data_out/uniq_set.json') as jf:
        json_decoded = json.load(jf)
    validated_json = [entry for entry in json_decoded if entry['URI'].startswith('http')]
    with open('./data_out/uniq_set.json', 'w') as fpw:
        json.dump(validated_json, fpw, sort_keys=True, indent=4)
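If you would rather keep the in-place remove() approach from the question, you don't need a while loop: iterating over a shallow copy of the list avoids the skipped-element problem. A minimal sketch, reusing the file path and key names from the question:
import json

def sys_validate_data():
    with open('./data_out/uniq_set.json') as jf:
        json_decoded = json.load(jf)
    # Iterate over a copy (json_decoded[:]) so that remove() on the
    # original list cannot shift elements past the loop.
    for ix in json_decoded[:]:
        if "http" not in ix["URI"]:
            json_decoded.remove(ix)
    with open('./data_out/uniq_set.json', 'w') as fpw:
        json.dump(json_decoded, fpw, sort_keys=True, indent=4)
The list comprehension above is still the cleaner choice; the copy-based loop is only worth it if you specifically need in-place removal.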

Related

Python check value of key in element and return element in list

I have a JSON file, and I'm reading it with the json library.
This is the JSON content (example):
{
    "type": "champion",
    "format": "standAloneComplex",
    "version": "10.18.1",
    "data": {
        "Aatrox": {
            "version": "10.18.1",
            "id": "Aatrox",
            "key": "266",
            "name": "Aatrox"
        },
        "Ahri": {
            "version": "10.18.1",
            "id": "Ahri",
            "key": "103",
            "name": "Ahri"
        }
    }
}
Now, how can I check if key is equal to 266 and return the value of name?
I was trying something like this:
import json

with open('./source/champion.json') as json_file:
    data_champs = json.load(json_file)['data']
    for champ in data_champs:
        for champ_info in data_champs[champ]:
            if champ['key'] == 266:
                print(champ)
But it returns TypeError: string indices must be integers
Try the following:
import json

with open('./source/champion.json') as json_file:
    for name, info in json.load(json_file)['data'].items():
        # "key" is stored as a string in the JSON, so compare against '266'
        if info['key'] == '266':
            print(name)
Or even better, we can close the file as soon as we have the data instead of keeping it open during processing:
import json

with open('./source/champion.json') as json_file:
    data = json.load(json_file)['data']

for name, info in data.items():
    if info['key'] == '266':
        print(name)
Explanation
The easiest way to iterate over a dict's elements is by using its .items() method:
for key, value in d.items():
    print(key, "-->", value)
Below, we iterate over the values only, since the keys are not important here:
import json

with open('data.json') as f:
    data = json.load(f)['data']

for v in data.values():
    if v['key'] == '266':
        print(v['name'])
        break
Output:
Aatrox
Here you go:
import json

with open('champion.json') as json_file:
    data_champs = json.load(json_file)['data']

for data in data_champs.keys():
    if data_champs[data]['key'] == '266':
        print(data_champs[data]['name'])
Prints:
Aatrox
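As a variation on the answers above (not from the original thread), next() with a generator expression returns the first matching name directly, or None if nothing matches:
import json

with open('./source/champion.json') as json_file:
    data = json.load(json_file)['data']

# First champion whose "key" is the string '266'; None if there is no match.
name = next((info['name'] for info in data.values() if info['key'] == '266'), None)
print(name)  # Aatrox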

Not able to display json value

I have an inventory which I collected from Ansible, and I'm reading it with Python, but I have had no success printing the values.
The content below is in a file called "localhost".
{
"ansible_facts": {
"facter_architecture": "x86_64",
"facter_bios_release_date": "12/01/2006",
"facter_bios_vendor": "innotek GmbH",
"facter_bios_version": "VirtualBox",
"facter_blockdevice_sda_model": "VBOX HARDDISK",
"facter_blockdevice_sda_size": 10737418240,
"facter_blockdevice_sda_vendor": "ATA",
"facter_blockdevice_sr0_model": "CD-ROM",
"facter_blockdevice_sr0_size": 1073741312,
"facter_blockdevice_sr0_vendor": "VBOX",
"facter_blockdevices": "sda,sr0",
"facter_boardmanufacturer": "Oracle Corporation",
"facter_boardproductname": "VirtualBox",
"facter_boardserialnumber": "0",
"facter_dhcp_servers": {
"enp0s3": "10.0.2.2",
"enp0s8": "192.168.1.1",
"system": "10.0.2.2"
},
"facter_domain": "home",
"facter_facterversion": "2.4.1",
"facter_filesystems": "xfs",
"facter_fqdn": "mylab.home",
"facter_gid": "root",
"facter_hardwareisa": "x86_64",
"facter_hardwaremodel": "x86_64",
"facter_hostname": "mylab",
"facter_id": "root",
"facter_interfaces": "enp0s3,enp0s8,lo",
"facter_ipaddress": "10.0.2.15",
"facter_ipaddress_enp0s3": "10.0.2.15",
"facter_ipaddress_enp0s8": "192.168.1.101",
"facter_ipaddress_lo": "127.0.0.1",
"facter_is_virtual": true,
"facter_kernel": "Linux",
"facter_kernelmajversion": "3.10",
"facter_kernelrelease": "3.10.0-1127.13.1.el7.x86_64",
"facter_kernelversion": "3.10.0",
"facter_macaddress": "08:00:27:dd:47:a8",
"facter_macaddress_enp0s3": "08:00:27:dd:47:a8",
"facter_macaddress_enp0s8": "08:00:27:12:ce:46",
"facter_manufacturer": "innotek GmbH",
"facter_memoryfree": "730.39 MB",
"facter_memoryfree_mb": "730.39",
"facter_memorysize": "990.98 MB",
"facter_memorysize_mb": "990.98",
"facter_mtu_enp0s3": 1500,
"facter_mtu_enp0s8": 1500,
"facter_mtu_lo": 65536,
"facter_netmask": "255.255.255.0",
"facter_netmask_enp0s3": "255.255.255.0",
"facter_netmask_enp0s8": "255.255.255.0",
"facter_netmask_lo": "255.0.0.0",
"facter_network_enp0s3": "10.0.2.0",
"facter_network_enp0s8": "192.168.1.0",
"facter_network_lo": "127.0.0.0",
"facter_operatingsystem": "CentOS",
"facter_operatingsystemmajrelease": "7",
"facter_operatingsystemrelease": "7.8.2003",
"facter_os": {
"family": "RedHat",
"name": "CentOS",
"release": {
"full": "7.8.2003",
"major": "7",
"minor": "8"
}
},
"facter_osfamily": "RedHat",
"facter_partitions": {
"sda1": {
"filesystem": "xfs",
"mount": "/boot",
"size": "2097152",
"uuid": "987fb5e2-f636-423b-997d-c2654993708c"
},
"sda2": {
"filesystem": "LVM2_member",
"size": "18872320"
}
},
"facter_path": "/root/.rbenv/shims:/root/.rbenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/bin/python3:/root/bin",
"facter_physicalprocessorcount": 1,
"facter_processor0": "Intel(R) Core(TM) i5-5350U CPU # 1.80GHz",
"facter_processorcount": 1,
"facter_processors": {
"count": 1,
"models": [
"Intel(R) Core(TM) i5-5350U CPU # 1.80GHz"
],
"physicalcount": 1
},
"facter_productname": "VirtualBox",
"facter_ps": "ps -ef",
"facter_rubyplatform": "x86_64-linux",
"facter_rubysitedir": "/usr/local/share/ruby/site_ruby/",
"facter_rubyversion": "2.0.0",
"facter_selinux": false,
"facter_serialnumber": "0",
"facter_sshecdsakey": "AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE8u+HCceoSA9mrw1oboY4sAXhkgp4CBTe8QjuW2OAeYIQS7LblPztfQmJFkXpHTWhLvSBYglzcuZiJOzUbZQ/0=",
"facter_sshed25519key": "AAAAC3NzaC1lZDI1NTE5AAAAIDutjvzHFEKqc0bprmbkm9ZUoADflkan6dnCVcYsGOTT",
"facter_sshfp_ecdsa": "SSHFP 3 1 2190a1073f110b50ed6fb912cd04144603a85098\nSSHFP 3 2 3a708ee555593b0ad5a2f1992ae949d56d2f3556c37b201ef683d4a3ea850660",
"facter_sshfp_ed25519": "SSHFP 4 1 695978669c105b4a3a06c4b9a685020363b72c67\nSSHFP 4 2 794f24ad5bd7b91a1d3a0f484ceb166088ac8d57e3e1682e8b8fe006fde1c169",
"facter_sshfp_rsa": "SSHFP 1 1 07afd9583d0785ac923230bef3b0d0ffeefad097\nSSHFP 1 2 29158514b311cc7687fa1c3aab1fa1abee0f2f581eb3d607a6b4ffb8ff258d59",
"facter_sshrsakey": "AAAAB3NzaC1yc2EAAAADAQABAAABAQCtgyTEG+VnEnXiiaP4tFpIiWwWfqxdW8BCATa5W9QE0AsfY1OiFoLRXYGqhL72q0N+VTHQGB7eB1sd9Nas48erDzZXpgLoIDqM1pa/vT/j/SygQB2rwgo2wga0tw+zW1cw+sELjXHAYsi8DADKbGlX2cCeT3MKeWdkg+BQogf74Sy4NEPbYhILXPfvt3cJxCM02sIn/eQL+n06iSzesUIEy5n+AlRgACR3zHnk5rtHipj/RzmPv+J0V3du7+g6/3TiKDcBTNHtb8QSa4DSGkmbW7Wdhvnw9GyhO5ySGB2G3rmLiVIm9vdjB9L/X/L2g8TB5+/dO52UxUSX17nwzuJB",
"facter_swapfree": "1024.00 MB",
"facter_swapfree_mb": "1024.00",
"facter_swapsize": "1024.00 MB",
"facter_swapsize_mb": "1024.00",
"facter_system_uptime": {
"days": 0,
"hours": 3,
"seconds": 12984,
"uptime": "3:36 hours"
},
"facter_timezone": "CEST",
"facter_type": "Other",
"facter_uniqueid": "000a0f02",
"facter_uptime": "3:36 hours",
"facter_uptime_days": 0,
"facter_uptime_hours": 3,
"facter_uptime_seconds": 12984,
"facter_uuid": "b64ed9b0-7168-4e49-a34a-90e6ea6f751a",
"facter_virtual": "kvm"
},
"changed": false
}
I have the following code.
I tried a lot of different things, but it keeps complaining: "list indices must be integers or slices, not str"
import json

dict = []
with open('localhost', 'r') as jsonfile:
    myfile = json.load(jsonfile)
    result = json.dumps(myfile, indent=2, sort_keys=True)
    dict.append(result)
print(dict['ansible_facts'])
From this list I need to extract facter_system_uptime and facter_hostname only, but I had no success.
I tried putting it in a loop, but I still get the same issue:
for data in dict['ansible_facts']:
    print(data)
What's wrong here?
You don't need to use json.dumps, because you've already used json.load to deserialize the JSON document into a Python object.
import json

with open("localhost") as jsonfile:
    myfile = json.load(jsonfile)

print(myfile["ansible_facts"]["facter_system_uptime"])
print(myfile["ansible_facts"]["facter_hostname"])
Don't use dict as the name for a variable - it shadows the builtin and will cause you problems.
You also created a list (dict = []) and then appended the string returned by json.dumps to it. That means dict[0] is a string, not a dictionary, so dict[0]['ansible_facts'] would fail as well; drop the json.dumps step and keep the object returned by json.load.
I do not know what Ansible is, but to my knowledge, your error is because you initialized dict as a list:
dict = []
whose elements can only be accessed by numerical indices, yet you indexed it with a string:
print(dict['ansible_facts'])
This is how you initialize a dictionary:
dict = {}
There are two issues here.
You have created a list instead of a dict.
You have used the built-in name dict as a variable name.
So the final code should be something like:
import json

with open('localhost', 'r') as jsonfile:
    myfile = json.load(jsonfile)

print(myfile['ansible_facts']['facter_system_uptime'])
print(myfile['ansible_facts']['facter_hostname'])
If you want to print all the data using a loop, you can iterate over myfile['ansible_facts'] like this:
import json

with open('localhost', 'r') as jsonfile:
    myfile = json.load(jsonfile)

for key, value in myfile['ansible_facts'].items():
    print(key, value)
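If only a couple of facts are needed, a small dict comprehension keeps just those. A sketch building on the code above; the two fact names are the ones from the question:
import json

wanted = ('facter_system_uptime', 'facter_hostname')

with open('localhost', 'r') as jsonfile:
    facts = json.load(jsonfile)['ansible_facts']

# Keep only the facts we care about, skipping any that happen to be missing.
subset = {key: facts[key] for key in wanted if key in facts}
print(subset)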

Reading json in python separated by newlines

I am trying to read some json with the following format. A simple pd.read_json() returns ValueError: Trailing data. Adding lines=True returns ValueError: Expected object or value. I've tried various combinations of readlines() and load()/loads() so far without success.
Any ideas how I could get this into a dataframe?
{
"content": "kdjfsfkjlffsdkj",
"source": {
"name": "jfkldsjf"
},
"title": "dsldkjfslj",
"url": "vkljfklgjkdlgj"
}
{
"content": "djlskgfdklgjkfgj",
"source": {
"name": "ldfjkdfjs"
},
"title": "lfsjdfklfldsjf",
"url": "lkjlfggdflkjgdlf"
}
The sample you have above isn't valid JSON. To be valid JSON these objects need to be inside a JSON array ([]) and separated by commas, as follows:
[{
"content": "kdjfsfkjlffsdkj",
"source": {
"name": "jfkldsjf"
},
"title": "dsldkjfslj",
"url": "vkljfklgjkdlgj"
},
{
"content": "djlskgfdklgjkfgj",
"source": {
"name": "ldfjkdfjs"
},
"title": "lfsjdfklfldsjf",
"url": "lkjlfggdflkjgdlf"
}]
I just tried on my machine. When formatted correctly, it works
>>> pd.read_json('data.json')
content source title url
0 kdjfsfkjlffsdkj {'name': 'jfkldsjf'} dsldkjfslj vkljfklgjkdlgj
1 djlskgfdklgjkfgj {'name': 'ldfjkdfjs'} lfsjdfklfldsjf lkjlfggdflkjgdlf
Another solution, if you do not want to reformat your files:
Assuming your JSON is in a string called my_json, you could do:
import json
import pandas as pd
splitted = my_json.split('\n\n')
my_list = [json.loads(e) for e in splitted]
df = pd.DataFrame(my_list)
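If you also want the nested source dict flattened into its own column, pandas' json_normalize can do that step for you (a sketch, not from the original answers; it assumes pandas 1.0+, where json_normalize is a top-level function):
import json
import pandas as pd

splitted = my_json.split('\n\n')
records = [json.loads(e) for e in splitted]

# Nested dicts become dotted column names, e.g. "source.name".
df = pd.json_normalize(records)
print(df.columns.tolist())  # includes a 'source.name' column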
Thanks for the ideas, internet. None quite solved the problem in the way I needed (I had lots of newline characters inside the strings themselves, which meant I couldn't split on them), but they helped point the way. In case anyone has a similar problem, this is what worked for me:
with open('path/to/original.json', 'r') as f:
    data = f.read()

data = data.split("}\n")
data = [d.strip() + "}" for d in data]
data = list(filter(("}").__ne__, data))
data = [json.loads(d) for d in data]

with open('path/to/reformatted.json', 'w') as f:
    json.dump(data, f)

df = pd.read_json('path/to/reformatted.json')
If you can use jq then solution is simpler:
jq -s '.' path/to/original.json > path/to/reformatted.json
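When string values can contain newlines (the situation described above), splitting on "}\n" or blank lines stays fragile. A more robust option is to walk the file with json.JSONDecoder.raw_decode, which parses one complete object at a time no matter how the objects are separated. A minimal sketch, assuming the file holds back-to-back JSON objects as in the question:
import json
import pandas as pd

def iter_json_objects(path):
    """Yield each top-level JSON object from a file of concatenated objects."""
    decoder = json.JSONDecoder()
    with open(path) as f:
        text = f.read()
    idx = 0
    while idx < len(text):
        # Skip whitespace (including blank lines) between objects.
        while idx < len(text) and text[idx].isspace():
            idx += 1
        if idx >= len(text):
            break
        obj, idx = decoder.raw_decode(text, idx)
        yield obj

df = pd.DataFrame(list(iter_json_objects('path/to/original.json')))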

remove duplicates data from complex json file in python

I have a complex JSON file with nested dicts in it.
It looks like this:
{
    "objectivelist": [{
        "measureid": "1122",
        "gradeID": "4222332",
        "graduationdate": "May",
        "system": {
            "platform": "MAC",
            "TeacherName": "Mike",
            "manager": "Jim",
            "studentinfomation": {
                "ZIP": "94122",
                "city": "SF"
            }
        }
    },
    {
        "measureid": "1122",
        "gradeID": "4222332",
        "graduationdate": "May",
        "system": {
            "platform": "MAC",
            "TeacherName": "joshe",
            "manager": "steven"
        },
        "studentinfomation": {
            "ZIP": "94122",
            "city": "SF"
        }
    }]
}
Here the gradeID and measureid are the same, so the record should only appear once, and my result should look like this:
{"measureid": "1122", "gradeID": "4222332", "graduationdate": "May"}
I do not need the manager name, teacher name, etc.
I'm not sure how to do this. I tried to use a comprehension but do not know how to use it with a nested dictionary.
Thank you guys.
Depending on how huge the JSON file is, you may need a better solution. We hash the fields which are of interest to us and build the unique output iteratively:
import hashlib

check_set = set()
output = []
interesting_fields = ['measureid', 'gradeID', 'graduationdate']
# X is the JSON document already loaded with json.load()
for dat in X['objectivelist']:
    m = hashlib.md5()
    for field in interesting_fields:
        m.update(dat[field].encode('utf-8'))
    digest = m.hexdigest()
    if digest not in check_set:
        output.append({key: dat[key] for key in interesting_fields})
        check_set.add(digest)
And you can find your output in output.
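If the file fits comfortably in memory, the hashing step isn't strictly necessary: a tuple of the interesting fields can serve as the set member directly. A sketch under the same assumption that X is the already-loaded JSON:
seen = set()
output = []
interesting_fields = ('measureid', 'gradeID', 'graduationdate')

for dat in X['objectivelist']:
    key = tuple(dat[field] for field in interesting_fields)
    if key not in seen:
        seen.add(key)
        output.append(dict(zip(interesting_fields, key)))

print(output)
# [{'measureid': '1122', 'gradeID': '4222332', 'graduationdate': 'May'}]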

Adding key to values in json using Python

This is the structure of my JSON:
"docs": [
{
"key": [
null,
null,
"some_name",
"12345567",
"test_name"
],
"value": {
"lat": "29.538208354844658",
"long": "71.98762580927113"
}
},
I want to add the keys to the key list. This is what I want the output to look like:
"docs": [
{
"key": [
"key1":null,
"key2":null,
"key3":"some_name",
"key4":"12345567",
"key5":"test_name"
],
"value": {
"lat": "29.538208354844658",
"long": "71.98762580927113"
}
},
What's a good way to do this? I tried the following, but it doesn't work:
for item in data['docs']:
    item['test'] = data['docs'][3]['key'][0]
UPDATE 1
Based on the answer below, I have tweaked the code to this:
for number, item in enumerate(data['docs']):
    # pprint(item)
    # print item['key'][4]
    newdict["key1"] = item['key'][0]
    newdict["yek1"] = item['key'][1]
    newdict["key2"] = item['key'][2]
    newdict["yek2"] = item['key'][3]
    newdict["key3"] = item['key'][4]
    newdict["latitude"] = item['value']['lat']
    newdict["longitude"] = item['value']['long']
This creates the JSON I am looking for (and I can eliminate the list I had previously). How does one make this JSON persist outside the for loop? Outside the loop, only the last value from the dictionary is added otherwise.
In your first block, key is a list, but in your desired output it is effectively a dict. You need to completely replace the key item, once per document in docs:
for doc in data['docs']:
    newdict = {}
    for number, item in enumerate(doc['key']):
        newdict['key%d' % (number + 1)] = item
    doc['key'] = newdict
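On the update's follow-up question about keeping the results after the loop: create a fresh dict on every iteration and append it to a list, otherwise each pass overwrites the same newdict and only the last document survives. A sketch reusing the field names from the update:
records = []
for item in data['docs']:
    newdict = {
        "key1": item['key'][0],
        "yek1": item['key'][1],
        "key2": item['key'][2],
        "yek2": item['key'][3],
        "key3": item['key'][4],
        "latitude": item['value']['lat'],
        "longitude": item['value']['long'],
    }
    records.append(newdict)
# records now holds one dict per entry in data['docs'].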
