Is there any way to convert specific JSON data to CSV? - python

I have JSON format which looks like
Here is the link https://drive.google.com/file/d/1RqU2s0dqjd60dcYlxEJ8vnw9_z2fWixd/view?usp=sharing
result =
{
"ERROR":[
],
"LinkSetDbHistory":[
],
"LinkSetDb":[
{
"Link":[
{
"Id":"8116078"
},
{
"Id":"7654180"
},
{
"Id":"7643601"
},
{
"Id":"7017037"
},
{
"Id":"6190213"
},
{
"Id":"5902265"
},
{
"Id":"5441934"
},
{
"Id":"5417587"
},
{
"Id":"5370323"
},
{
"Id":"5362514"
},
{
"Id":"4818642"
},
{
"Id":"4330602"
}
],
"DbTo":"pmc",
"LinkName":"pubmed_pmc_refs"
}
],
"DbFrom":"pubmed",
"IdList":[
"25209241"
]
},
{
"ERROR":[
],
"LinkSetDbHistory":[
],
"LinkSetDb":[
{
"Link":[
{
"Id":"7874507"
},
{
"Id":"7378719"
},
{
"Id":"6719480"
},
{
"Id":"5952809"
},
{
"Id":"4944516"
}
],
"DbTo":"pmc",
"LinkName":"pubmed_pmc_refs"
}
],
"DbFrom":"pubmed",
"IdList":[
"25209630"
]
},
For each record I want to fetch the IdList value together with the number of Ids in its Link array (12 in the first record), e.g.
"IdList":"25209241"
so the final output will be
IDList: length
25209241: 12 (Total number of Id in link array)
25209630 : 5 (Total number of Id in link array)
I have tried this code but not working with single or multiple values.
pmc_ids = [link["Id"] for link in results["LinkSetDb"]["Link"]]
len(pmc_ids)
How can I make this work with a large dataset?

You have "LinkSetDb" as a list containing a single dictionary but you are indexing it as if it is a dictionary. Use:
pmc_ids = [link["Id"] for link in result["LinkSetDb"][0]["Link"]]
len(pmc_ids)

The 'Link' key is inside a list. So, change pmc_ids = [link["Id"] for link in results["LinkSetDb"]["Link"]] to pmc_ids = [link["Id"] for link in results["LinkSetDb"][0]["Link"]].
To generate csv file, the code would be something like this:
import json
import csv

# Load the JSON file once. The "with" block closes the file automatically,
# so the explicit f.close() calls in the original were redundant.
with open('Citation_with_ID.json', 'r') as f_json:
    json_dict = json.load(f_json)

csv_headers = ["IdList", "length"]
csv_values = []
for record in json_dict:
    # "LinkSetDb" is a (possibly empty) list; the Ids live inside the
    # "Link" array of its first element.
    if len(record["LinkSetDb"]) > 0:
        pmc_ids = [link["Id"] for link in record["LinkSetDb"][0]["Link"]]
    else:
        pmc_ids = []
    length = len(pmc_ids)
    # "IdList" normally holds exactly one id; anything else maps to None.
    if len(record['IdList']) == 1:
        IdList = record['IdList'][0]
    else:
        IdList = None
    csv_values.append([IdList, length])

# newline='' is required by the csv module to avoid blank rows on Windows.
with open('mycsvfile.csv', 'w', newline='') as f_csv:
    w = csv.writer(f_csv)
    w.writerow(csv_headers)
    w.writerows(csv_values)
If you want to store the values in a dictionary then something like this can be used:
values_list = list(zip(*csv_values))
dict(zip(values_list[0],values_list[1]))

Related

Pandas to JSON not respecting DataFrame format

I have a Pandas DataFrame which I need to transform into a JSON object. I thought that by grouping it I would achieve this, but it does not seem to yield the correct results. Furthermore, I wouldn't know how to name the sub-group.
My data frame as follows:
parent
name
age
nick
stef
10
nick
rob
12
And I do a groupby as I would like all children together under one parent in json:
df = df.groupby(['parent', 'name'])['age'].min()
And I would like it to yield the following:
{
"parent": "Nick",
"children": [
{
"name": "Rob",
"age": 10,
},
{
"name": "Stef",
"age": 15,
},,.. ]
}
When I do .to_json() it seems to regroup everything on age etc.
df.groupby(['parent'])[['name', 'age']].apply(list).to_json()
Given I wanted to add some styling, I ended up solving it as follows:
import json

# Group the frame by parent; each group becomes one JSON object with a
# "children" list built from the group's rows.
df_grouped = df.groupby('parent')
new = []
for group_name, df_group in df_grouped:
    base = {}
    base['parent'] = group_name
    children = []
    for row_index, row in df_group.iterrows():
        temp = {}
        temp['name'] = row['name']
        temp['age'] = row['age']
        children.append(temp)
    base['children'] = children
    new.append(base)
json_format = json.dumps(new)
# Print the JSON string (not the Python list) so the output uses
# double-quoted JSON syntax, matching the result shown below.
print(json_format)
Which yielded the following results:
[
{
"parent":"fee",
"children":[
{
"name":"bob",
"age":9
},
{
"name":"stef",
"age":10
}
]
},
{
"parent":"nick",
"children":[
{
"name":"stef",
"age":10
},
{
"name":"tobi",
"age":2
},
{
"name":"ralf",
"age":12
}
]
},
{
"parent":"patrick",
"children":[
{
"name":"marion",
"age":10
}
]
}
]

Json in python 3 get element

I build this json file
{
"systems-under-test": [{
"type": "url",
"sytems": [
"www.google.com",
"www.google.com",
"www.google.com"
]
},
{
"type": "api",
"sytems": [
"api.com",
"api.fr"
]
},
{
"type": "ip",
"sytems": [
"172.168 .1 .1",
"172.168 .1 .0"
]
}
],
"headers-default-configuration": {
"boolean": false
},
"headers-custom-configuration": {
"boolean": true,
"settings": {
"headers": {
"header-name": "x - frame - options",
"expected-value": ["deny", "sameorigin"]
}
}
},
"header-results": []
}
I want to add the systems under test to 3 different lists based on their type — for example, type = "url" goes to url_list, and so on.
def loadConfigFile(self, urls_list=None):
path = self.validate_path()
with open(path) as f:
data = json.load(f)
pprint(data)
for key, value in data.items():
if key == "systems-under-test":
for x in value:
print(x.keys()[0])
if x.values[0] == "url":
url = x.get("systems")
print(url)
urls_list.add[url]
the output needs to be like:
all this :
"www.google.com"
"www.google.com"
"www.google.com"
needs to be added to url_list
when I try to access key value by using : x.values[0] == "URL", I keep getting this error
TypeError: 'dict_keys' object does not support indexing
The problem is solved by adding () as shown below:
def loadConfigFile(self, urls_list=None):
    """Print every URL-type system from the config file.

    Args:
        urls_list: optional list; when supplied, every URL found is also
            appended to it (this is what the question ultimately asks for —
            the original only printed the URLs and left the parameter unused).
    """
    path = self.validate_path()
    with open(path) as f:
        data = json.load(f)
    pprint(data)
    for key, value in data.items():
        if key == "systems-under-test":
            for x in value:
                # dict_values supports no indexing; materialise it with
                # list() before subscripting.
                # NOTE(review): this relies on key order ("type" first,
                # "sytems" second) — x["type"] / x["sytems"] would be
                # more robust.
                if list(x.values())[0] == "url":
                    urls = list(x.values())[1]
                    for url in urls:
                        print(url)
                        if urls_list is not None:
                            urls_list.append(url)
results will be
www.google.com
www.google.com
www.google.com
This seems like an easy way to do it:
import json

# Load the config and show the systems of the first entry when it is
# the URL-type one.
with open("data.json") as json_file:
    data = json.load(json_file)

first_entry = data["systems-under-test"][0]
if first_entry["type"] == "url":
    print(first_entry["sytems"])
Output:
['www.google.com', 'www.google.com', 'www.google.com']

Python - Problem extracting data from nested json

I have a problem extracting data from json, I tried n different ways. I was able to extract the ID itself, unfortunately I can't manage to show the details of the field.
Below is my json
{
"params": {
"cid": "15482782896",
"datemax": "20190831",
"datemin": "20190601",
"domains": [
"url.com"
],
},
"results": {
"59107": {
"url.com": {
"1946592": {
"data": {
"2019-06-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 21,
"url": "url3.com"
}
}
}
},
"2019-07-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 4,
"url": "url3.com"
}
}
}
},
"2019-08-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 2,
"url": "url3.com"
}
}
}
}
},
"keyword": {
"title": "python_1",
"volume": 10
}
},
"1946602": {
"data": {
"2019-06-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 5,
"url": "url1.com"
}
}
}
},
"2019-07-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 12,
"url": "url1.com"
}
}
}
},
"2019-08-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 10.25,
"url": "url1.com"
}
}
}
}
},
"keyword": {
"title": "python_2",
"volume": 20
}
}
}
}
}
}
I tried the following code but I got the result in the form of id itself
import json
import csv
def get_leaves(item, key=None):
    """Flatten a nested dict/list structure into one flat {key: scalar} dict.

    Dict values recurse under their own key; list elements keep the key of
    the enclosing entry; a scalar becomes a single {key: scalar} leaf.
    Duplicate leaf keys overwrite each other (last one wins).
    """
    if isinstance(item, dict):
        flat = {}
        for child_key, child in item.items():
            flat.update(get_leaves(child, child_key))
        return flat
    if isinstance(item, list):
        flat = {}
        for child in item:
            flat.update(get_leaves(child, key))
        return flat
    # Scalar: a single leaf under the inherited key.
    return {key: item}
with open('me_filename') as f_input:
json_data = json.load(f_input)
fieldnames = set()
for entry in json_data:
fieldnames.update(get_leaves(entry).keys())
with open('output.csv', 'w', newline='') as f_output:
csv_output = csv.DictWriter(f_output, fieldnames=sorted(fieldnames))
csv_output.writeheader()
csv_output.writerows(get_leaves(entry) for entry in json_data)
I also tried to use the pandas but also failed to parse properly
import io
import json
import pandas as pd
with open('me_filename', encoding='utf-8') as f_input:
df = pd.read_json(f_input , orient='None')
df.to_csv('output.csv', encoding='utf-8')
The result I'd need to get it :
ID Name page volume url 2019-06-01 2019-07-01 2019-08-01 2019-09-01
1946592 python_1 url.com 10 url3.com 21 4 2 null
1946602 python_2 url.com 20 url1.com 5 12 10,25 null
What could I do wrong?
This is a somewhat convoluted solution — it looks messy and no longer resembles the code provided — but I believe it will resolve your issue.
First of all, I had a problem with the provided JSON (due to the trailing ',' on line 8), but I have managed to generate:
Output (temp.csv)
ID,Name,Page,Volume,Url,2019-08-01,2019-07-01,2019-06-01,
1946592,python_1,url.com,10,url3.com,2,4,21,
1946602,python_2,url.com,20,url1.com,10.25,12,5,
using the following:
import json
# Module-level set of every date seen in the data; populated by get_dates()
# and consumed by get_breakdown() and write_csv().
dates: set = set()

# Collect the data
def get_breakdown(json):
    """Flatten the nested results into a list of per-keyword dicts.

    Relies on the module-level `dates` set being populated first (see
    get_dates). Each returned dict has ID/Name/Page/Volume/URL plus a
    'Dates' mapping of date -> {'Position': value-or-'null'}.
    """
    collected_data = []
    for result in json['results']:
        for page in json['results'][result]:
            for _id in json['results'][result][page]:
                # Hoist the deeply nested record to a local instead of
                # re-indexing json['results'][result][page][_id] repeatedly.
                record = json['results'][result][page][_id]
                data_struct = {
                    'ID': _id,
                    'Name': record['keyword']['title'],
                    'Page': page,
                    'Volume': record['keyword']['volume'],
                    # Fallback so write_csv never hits a missing 'URL' key
                    # when none of the dates are present for this record.
                    'URL': 'null',
                    'Dates': {}
                }
                for date in dates:
                    if date in record['data']:
                        device = record['data'][date]['ENGINE']['DEVICE']['']
                        data_struct['URL'] = device['url']
                        data_struct['Dates'][date] = {'Position': device['position']}
                    else:
                        data_struct['Dates'][date] = {'Position': 'null'}
                collected_data.append(data_struct)
    return collected_data
# Collect all dates across the whole data
# structure and save them to a set
def get_dates(json):
    """Record every date key found in the data into the module-level `dates` set."""
    for pages in json['results'].values():
        for ids in pages.values():
            for record in ids.values():
                # Iterating a dict yields its keys, i.e. the date strings.
                dates.update(record['data'])
# Write to .csv file
def write_csv(collected_data, file_path):
    """Write the collected breakdown to `file_path` as CSV.

    Date-column order follows the iteration order of the module-level
    `dates` set, matching get_breakdown's 'Dates' entries.
    """
    # Use "with" so the file is always closed — the original opened the
    # file and never closed it (resource leak / possibly unflushed data).
    with open(file_path, "w") as f:
        # CSV title row
        date_string = ''
        for date in dates:
            date_string = '{0}{1},'.format(date_string, date)
        f.write('ID,Name,Page,Volume,Url,{0}\n'.format(date_string))
        # Data rows
        for data in collected_data:
            position_string = ''
            for date in dates:
                position_string = '{0}{1},'.format(position_string, data['Dates'][date]['Position'])
            f.write('{0},{1},{2},{3},{4},{5}\n'.format(
                data['ID'],
                data['Name'],
                data['Page'],
                data['Volume'],
                data['URL'],
                position_string
            ))
# Code body: load the raw export, gather every date column, then emit the CSV.
with open('me_filename.json') as f_input:
    json_data = json.load(f_input)
get_dates(json_data)
write_csv(get_breakdown(json_data), "output.csv")
Hopefully you can follow the code and it does what is expected. I am sure that it can be made much more reliable - however as previously mentioned I couldn't make it work with the base code you provided.
After a small modification your code works great, but I noticed that putting each date on its own row would be a better output format.
I tried to modify your solution into this form, but I'm still too new to Python to manage it easily. Can you tell me how to achieve this CSV file format?
Output(temp.csv)
ID,Name,Page,Volume,Url,data,value,
1946592,python_1,url.com,10,url3.com,2019-08-01,2
1946592,python_1,url.com,10,url3.com,2019-07-01,4
1946592,python_1,url.com,10,url3.com,2019-06-01,21
1946602,python_2,url.com,20,url1.com,2019-08-01,10.25,
1946602,python_2,url.com,20,url1.com,2019-07-01,12,
1946602,python_2,url.com,20,url1.com,2019-06-01,5,

Getting 0 records while parsing json file , if the Key Attribute does not exists

I have few static key columns EmployeeId,type and few columns coming from first FOR loop.
While in the second FOR loop if i have a specific key then only values should be appended to the existing data frame columns else whatever the columns getting fetched from first for loop should remain same.
First For Loop Output:
"EmployeeId","type","KeyColumn","Start","End","Country","Target","CountryId","TargetId"
"Emp1","Metal","1212121212","2000-06-17","9999-12-31","","","",""
After Second For Loop i have below output:
"EmployeeId","type","KeyColumn","Start","End","Country","Target","CountryId","TargetId"
"Emp1","Metal","1212121212","2000-06-17","9999-12-31","","AMAZON","1",""
"Emp1","Metal","1212121212","2000-06-17","9999-12-31","","FLIPKART","2",""
As per code if i have Employee tag available , i have got above 2 records but i may have few json files without Employee tag then output should remain same as per First Loop Output with all the key fields populated and rest columns with null.
But I am getting 0 records as per my code. Please help me if my way of coding is wrong.
Please help me — if the way I am asking the question is not clear, I am sorry, as I am new to Python. Please find the sample data in the link below.
Please find below code
for i in range(len(json_file['enty'])):
temp = {}
temp['EmployeeId'] = json_file['enty'][i]['id']
temp['type'] = json_file['enty'][i]['type']
for key in json_file['enty'][i]['data']['attributes'].keys():
try:
temp[key] = json_file['enty'][i]['data']['attributes'][key]['values'][0]['value']
except:
temp[key] = None
for key in json_file['enty'][i]['data']['attributes'].keys():
if(key == 'Employee'):
for j in range(len(json_file['enty'][i]['data']['attributes']['Employee']['group'])):
for key in json_file['enty'][i]['data']['attributes']['Employee']['group'][j].keys():
try:
temp[key] = json_file['enty'][i]['data']['attributes']['Employee']['group'][j][key]['values'][0]['value']
except:
temp[key] = None
temp_df = pd.DataFrame([temp])
df = pd.concat([df, temp_df], sort=True)
# Rearranging columns
df = df[['EmployeeId', 'type'] + [col for col in df.columns if col not in ['EmployeeId', 'type']]]
# Writing the dataset
df[columns_list].to_csv("Test22.csv", index=False, quotechar='"', quoting=1)
If Employee Tag is not available i am getting 0 records as output but i am expecting 1 record as for first for loop
enter link description here
The JSON structure is quite complicated. I try to simplified the data collection from it. The result is a list of flat dicts. The code handles the case where 'Employee' is not found.
import copy

# Sample input, simplified from the uploaded file: one entity whose
# attributes contain scalar columns plus an 'Employee' group list.
d = {
    "enty": [
        {
            "id": "Emp1",
            "type": "Metal",
            "data": {
                "attributes": {
                    "KeyColumn": {"values": [{"value": 1212121212}]},
                    "End": {"values": [{"value": "2050-12-31"}]},
                    "Start": {"values": [{"value": "2000-06-17"}]},
                    "Employee": {
                        "group": [
                            {
                                "Target": {"values": [{"value": "AMAZON"}]},
                                "CountryId": {"values": [{"value": "1"}]},
                            },
                            {
                                "Target": {"values": [{"value": "FLIPKART"}]},
                                "CountryId": {"values": [{"value": "2"}]},
                            },
                        ]
                    },
                }
            },
        }
    ]
}


def collect_employees(data):
    """Flatten the nested attribute structure into a list of flat dicts.

    One dict per 'Employee' group entry; entities without an 'Employee'
    attribute still yield one base record (with no Target/CountryId).
    """
    emps = []
    for e in data['enty']:
        entry = {'id': e['id'], 'type': e['type']}
        for x in ["KeyColumn", "Start", "End"]:
            entry[x] = e['data']['attributes'][x]['values'][0]['value']
        if e['data']['attributes'].get('Employee'):
            for grp in e['data']['attributes']['Employee']['group']:
                # Each group entry gets its own copy of the base columns.
                clone = copy.deepcopy(entry)
                for x in ['Target', 'CountryId']:
                    clone[x] = grp[x]['values'][0]['value']
                emps.append(clone)
        else:
            # BUG FIX: the original called emps.add(entry) — lists have no
            # .add(), so the "no Employee tag" branch raised AttributeError
            # instead of keeping the base record.
            emps.append(entry)
    return emps


emps = collect_employees(d)
# TODO write to csv
for emp in emps:
    print(emp)
output
{'End': '2050-12-31', 'Target': 'AMAZON', 'KeyColumn': 1212121212, 'Start': '2000-06-17', 'CountryId': '1', 'type': 'Metal', 'id': 'Emp1'}
{'End': '2050-12-31', 'Target': 'FLIPKART', 'KeyColumn': 1212121212, 'Start': '2000-06-17', 'CountryId': '2', 'type': 'Metal', 'id': 'Emp1'}

Filtering out desired data from a JSON file (Python)

this is a sample of my json file:
{
"pops": [{
"name": "pop_a",
"subnets": {
"Public": ["1.1.1.0/24,2.2.2.0/24"],
"Private": ["192.168.0.0/24,192.168.1.0/24"],
"more DATA":""
}
},
{
"name": "pop_b",
"subnets": {
"Public": ["3.3.3.0/24,4.4.4.0/24"],
"Private": ["192.168.2.0/24,192.168.3.0/24"],
"more DATA":""
}
}
]
}
after i read it, i want to make a dic object and store some of the things that i need from this file.
i want my object to be like this ..
[{
"name": "pop_a",
"subnets": {"Public": ["1.1.1.0/24,2.2.2.0/24"],"Private": ["192.168.0.0/24,192.168.1.0/24"]}
},
{
"name": "pop_b",
"subnets": {"Public": ["3.3.3.0/24,4.4.4.0/24"],"Private": ["192.168.2.0/24,192.168.3.0/24"]}
}]
then i want to be able to access some of the public/private values
Here is what I tried; I know about update() and setdefault(), but they also gave the same unwanted results.
def my_funckion():
nt_json = [{'name':"",'subnets':[]}]
Pname = []
Psubnet= []
for pop in pop_json['pops']: # it print only the last key/value
nt_json[0]['name']= pop['name']
nt_json[0]['subnet'] = pop['subnet']
pprint (nt_json)
for pop in pop_json['pops']:
"""
it print the names in a row then all of the ipss
"""
Pname.append(pop['name'])
Pgre.append(pop['subnet'])
nt_json['pop_name'] = Pname
nt_json['subnet']= Psubnet
pprint (nt_json)
Here's a quick solution using list comprehension. Note that this approach can be taken only with enough knowledge of the json structure.
>>> import json
>>>
>>> data = ... # your data
>>> new_data = [{ "name" : x["name"], "subnets" : {"Public" : x["subnets"]["Public"], "Private" : x["subnets"]["Private"]}} for x in data["pops"]]
>>>
>>> print(json.dumps(new_data, indent=2))
[
{
"name": "pop_a",
"subnets": {
"Private": [
"192.168.0.0/24,192.168.1.0/24"
],
"Public": [
"1.1.1.0/24,2.2.2.0/24"
]
}
},
{
"name": "pop_b",
"subnets": {
"Private": [
"192.168.2.0/24,192.168.3.0/24"
],
"Public": [
"3.3.3.0/24,4.4.4.0/24"
]
}
}
]

Categories

Resources