Python split key values - python

I have a JSON file which I'm parsing; the generated output goes into output.txt. At the moment, after output.txt is generated, I read output.txt line by line, split each line, and then delete the first two columns.
("\t".join(line.split()[2:]) + "\n")
How can I get the same result from for loop shared below?
Expected output: project_name + File_Names.
script.py
import json

# Print each changed project's name followed by the raw File_Names
# lists of its added commits.
with open('data.json') as f:  # 'with' guarantees the file is closed
    x = json.load(f)
for sub_dict in x['changed']:
    print('project_name', sub_dict['project_name'])
    for entry in sub_dict['added_commits']:
        print(entry['File_Names'])
data.json
{
"changed": [
{
"prev_revision": "a09936ea19ddc9f69ed00a7929ea81234af82b95",
"added_commits": [
{
"File_Names": [
"115\t0\t1/src/hello.cpp",
"116\t0\t1/src/hell1o.cpp"
]
}
],
"project_name": "android/hello"
},
{
"prev_revision": "a09936ea19ddc9f69ed00a7929ea81234af82b95",
"added_commits": [
{
"File_Names": [
"41\t1\t1/src/hello1.cpp"
]
}
],
"project_name": "android/helloworld"
}
]
}
output.txt
115 0 1/src/hello.cpp
116 0 1/src/hell1o.cpp
41 1 1/src/hello1.cpp
expected output.txt
android/hello/src/hello.cpp
android/hello/src/hell1o.cpp
android/helloworld/src/hello1.cpp

This will do the trick
import json
import re

# Captures the "/path" tail that follows three whitespace-separated
# numeric columns, e.g. "115\t0\t1/src/hello.cpp" -> "/src/hello.cpp".
_TAIL = re.compile(r'(?:\s*\d+\s*\d+\s*\d+)(\/.*)')

with open('data.json') as f:
    data = json.load(f)

for change in data['changed']:
    project = change['project_name']
    for commit in change['added_commits']:
        for file_name in commit['File_Names']:
            tail = _TAIL.findall(file_name)[0]
            print(project + tail)
Note the use of with to open the file which will also close it afterwards.
I used a regex to make this more robust; this will match anything of the form numbers numbers numbers/stuff_to_match

You can iterate through the sub-lists like this:
# Walk changed -> added_commits -> File_Names and print the project
# name joined with the file path. Each File_Names entry looks like
# "115\t0\t1/src/hello.cpp": field [2] is the path, and [1:] drops its
# leading directory digit so "1/src/..." becomes "/src/...".
for change in x['changed']:
    for commit in change['added_commits']:
        for entry in commit['File_Names']:
            path = entry.split('\t')[2][1:]
            print(change['project_name'] + path)
This outputs:
android/hello/src/hello.cpp
android/hello/src/hell1o.cpp
android/helloworld/src/hello1.cpp

Related

Generate JSON file from file names and their content

I'm new to Python, and couldn't find a close enough answer to figure it out. I'm trying to generate a single JSON file that contains the current directory's file names ending with a .txt extension as nodes, and the contents of those files as a list inside each file name's node.
for example:
node1.txt contains
foo
bar
and node2.txt contains
test1
test2
the output should look like:
{
"node1": [
"foo",
"bar"
],
"node2": [
"test1",
"test2"
]
}
Use pathlib and json modules and a simple loop...
import pathlib
import json

# Map every node*.txt file in the current directory to the list of its
# whitespace-stripped lines, keyed by the file name without extension.
data = {}
for txt_path in pathlib.Path('.').glob('node*.txt'):
    with open(txt_path, 'r') as handle:
        lines = handle.readlines()
    data[txt_path.stem] = [entry.strip() for entry in lines]
print(json.dumps(data, indent=4))
Output:
{
"node1": [
"foo",
"bar"
],
"node2": [
"test1",
"test2"
]
}

How can I use the 'else' syntax in Python?

I am reading data from a JSON file to check the existence of some values.
In the JSON structure below, I try to find adomain from the data in bid and check if there is a cat value, which is not always present.
How do I fix it in the syntax below?
import pandas as pd
import json
path = 'C:/MyWorks/Python/Anal/data_sample.json'
# Parses one JSON object per line (JSON-lines).
# NOTE(review): the sample shown below is a pretty-printed array, not
# JSON-lines -- confirm the real file's layout before using loads() per line.
records = [json.loads(line) for line in open(path, encoding='utf-8')]
# Collect the adomain of the first bid of each record.
adomain = [
rec['win_res']['seatbid'][0]['bid'][0]['adomain']
# NOTE(review): this condition checks the TOP-LEVEL keys of each
# record, not the bid dict where 'adomain' actually lives, so the
# filter likely never matches -- this is the bug being asked about.
for rec in records
if 'adomain' in rec
]
Here is a data sample:
[
{ "win_res": {
"id": "12345",
"seatbid": [
{
"bid": [
{
"id": "12345",
"impid": "1",
"price": 0.1,
"adm": "",
"adomain": [
"adomain.com"
],
"iurl": "url.com",
"cid": "11",
"crid": "11",
"cat": [
"IAB12345"
],
"w": 1,
"h": 1
}
],
"seat": "1"
}
]
}}
]
As a result, the adomain value exists unconditionally, but the cat value may not be present sometimes.
So, if cat exists alongside adomain, I want to output both adomain and cat; but if a record's bid lacks the cat value, how can I handle that?
Your question is not clear but I think this is what you are looking for:
import json

path = 'C:/MyWorks/Python/Anal/data_sample.json'
with open(path, encoding='utf-8') as f:
    records = json.load(f)

# Keep the adomain of each record's first bid, but only when that bid
# carries both a truthy 'adomain' and a truthy 'cat'.
adomain = []
for rec in records:
    bid = rec['win_res']['seatbid'][0]['bid'][0]
    if bid.get('adomain', None) and bid.get('cat', None):
        adomain.append(bid['adomain'])
The code above will add the value of ['win_res']['seatbid'][0]['bid'][0]['adomain'] to the list adomain only if there is a ['win_res']['seatbid'][0]['bid'][0]['cat'] corresponding value.
The code will be a lot clearer if we just walk through a bids list. Something like this:
import json

path = 'C:/MyWorks/Python/Anal/data_sample.json'
with open(path, encoding='utf-8') as f:
    records = json.load(f)

# First flatten each record down to its first bid dict...
bids = [rec['win_res']['seatbid'][0]['bid'][0] for rec in records]
# ...then keep the adomain of every bid that has both fields truthy.
adomain = [
    bid['adomain']
    for bid in bids
    if bid.get('adomain', None) and bid.get('cat', None)
]

How can I parse the first set of information into CSV?

I'm trying to parse more than 100 json files, but i do not need all the info.
I only need to parse the first set of 'coordinates'. The CSV already has the URL and URL type printed, but I cannot print the first set of coordinates.
this is a section of the Json file
{
"type":"featureCollection",
"features" : [
{
"type": "feature",
"geometry": {
"type": "multilinestring",
"coordinates":[
[
[
148.9395348,
-21.3292286
],
[
148.93963,
-21.33001
],
[
148.93969,
-21.3303
]
]
]
},
"properties":{
"url" :"www.thiswebpageisfake.com",
"url_type":"fake"
},
"information":{
"timestamp":"10/10/19"
}
}]
}
I'm using Python 2.7. I have tried creating an array for the coordinates, but I get a type error.
import os
import csv
import json
import sys
# NOTE(review): reload(sys) without a following setdefaultencoding()
# call has no effect -- likely a Python 2 encoding-workaround leftover.
reload(sys)
file_path = 'C:\\Users\\user\\Desktop\\Python\\json'
dirs = os.listdir(file_path)
file_out = 'C:\\Users\\user\\output.csv'
# "wb+" because Python 2's csv module expects a binary-mode file.
f = csv.writer(open(file_out, "wb+"))
# Header row of the output CSV.
f.writerow(
['url','url_type','lat','long'])
for file in dirs:
json_dict = json.loads(open(os.path.join(file_path, file)).read())
print file
# NOTE(review): this loop is NOT nested inside the loop above, so it
# only sees the json_dict left over from the LAST file (the answer
# below points this out).
for key in json_dict['features']:
for key1 in key:
description = key['properties']['description']
if description is None:
description = 'null'
array = ()
# NOTE(review): key['geometry']['type'] is the string
# "multilinestring"; indexing a string with 'coordinates' raises
# TypeError. Should be key['geometry']['coordinates'].
array = (key['geometry']['type']['coordinates'])
f.writerow([file,
key['properties']['url'],
key['properties']['url_type'],
array[1]
])
print 'completed'
Firstly, it looks like your second loop is supposed to be nested in the first, otherwise you'll do nothing with all json files except the last and only end up processing one file.
Secondly, your array should be defined as array = (key['geometry']['coordinates']), as 'coordinates' is not contained in 'type'.

How to get file "Path" stored in this json file using Python?

json file
[{"Attachment": [
{
"Page:": [
{
"Path": "a\\b\\c.pdf", #field to be extracted
"PageID": 1
}
]
}
],
"ID": 13221}]
I tried the following but getting the TypeError: list indices must be integers, not str
with open(file) as f:
d = json.load(f)
print(d[0]['Attachment']['Page']['Path'])
d[0]['Attachment'] is a list, so is d[0]['Attachment'][0]['Page:'].
with open(file) as f:
d = json.load(f)
print(d[0]['Attachment'][0]['Page:'][0]['Path'])
will do the job.

How to read a large JSON file using Python ijson?

I am trying to parse a big json file (hundreds of gigs) to extract information from its keys. For simplicity, consider the following example:
import random, string
# Build a random lowercase key of the given length.
# NOTE(review): string.lowercase is Python 2 only; it is
# string.ascii_lowercase in Python 3.
def random_string(length):
return "".join(random.choice(string.lowercase) for i in range(length))
# Create the dictionary: 15 random 10-char keys, each mapped to 10
# distinct integers sampled from [1, 999].
dummy = {random_string(10): random.sample(range(1, 1000), 10) for times in range(15)}
# Dump the dictionary into a json file
# NOTE(review): 'json' is used below but never imported in this snippet.
with open("dummy.json", "w") as fp:
json.dump(dummy, fp)
Then, I use ijson in python 2.7 to parse the file:
file_name = "dummy.json"
with open(file_name, "r") as fp:
# NOTE(review): 'dummy' is the in-memory dict from the writing script;
# it is only re-used here for its keys.
for key in dummy.keys():
print "key: ", key
# Every iteration passes the SAME file object: the first ijson pass
# consumes it to EOF, so subsequent passes see no data -- this is what
# triggers IncompleteJSONError (re-open or fp.seek(0) between passes).
parser = ijson.items(fp, str(key) + ".item")
for number in parser:
print number,
I was expecting to retrieve all the numbers in the lists corresponding to the keys of the dict. However, I got
IncompleteJSONError: Incomplete JSON data
I am aware of this post: Using python ijson to read a large json file with multiple json objects, but in my case I have a single json file, that is well formed, with a relative simple schema. Any ideas on how can I parse it? Thank you.
ijson has an iterator interface to deal with large JSON files allowing to read the file lazily. You can process the file in small chunks and save results somewhere else.
Calling ijson.parse() yields three values prefix, event, value
Some JSON:
{
"europe": [
{"name": "Paris", "type": "city"},
{"name": "Rhein", "type": "river"}
]
}
Code:
import ijson

# Stream (prefix, event, value) triples from the file and echo every
# JSON string value as it is encountered.
events = ijson.parse(open(FILE_PATH, 'r'))
for prefix, event, value in events:
    if event != 'string':
        continue
    print(value)
Output:
Paris
city
Rhein
river
Reference: https://pypi.python.org/pypi/ijson
The sample json content file is given below: it has records of two people. It might as well have 2 million records.
[
{
"Name" : "Joy",
"Address" : "123 Main St",
"Schools" : [
"University of Chicago",
"Purdue University"
],
"Hobbies" : [
{
"Instrument" : "Guitar",
"Level" : "Expert"
},
{
"percussion" : "Drum",
"Level" : "Professional"
}
],
"Status" : "Student",
"id" : 111,
"AltID" : "J111"
},
{
"Name" : "Mary",
"Address" : "452 Jubal St",
"Schools" : [
"University of Pensylvania",
"Washington University"
],
"Hobbies" : [
{
"Instrument" : "Violin",
"Level" : "Expert"
},
{
"percussion" : "Piano",
"Level" : "Professional"
}
],
"Status" : "Employed",
"id" : 112,
"AltID" : "M112"
}
]
I created a generator which would return each person's record as a json object. The code would look like below. This is not the generator code. Changing couple of lines would make it a generator.
import json
# Stack of line numbers where an unmatched '{' was seen; empty stack
# means every opened object has been closed.
curly_idx = []
# Accumulates the raw text of the object currently being read.
jstr = ""
first_curly_found = False
# NOTE(review): the indentation of this snippet was lost in formatting;
# the while/for/if nesting below must be restored before it will run.
with open("C:\\Users\\Rajeshs\\PycharmProjects\\Project1\\data\\test.json", 'r') as fp:
#Reading file line by line
line = fp.readline()
lnum = 0
while line:
# Scan the line character by character, tracking brace balance.
for a in line:
if a == '{':
curly_idx.append(lnum)
first_curly_found = True
elif a == '}':
curly_idx.pop()
# when the right curly for every left curly is found,
# it would mean that one complete data element was read
if len(curly_idx) == 0 and first_curly_found:
jstr = f'{jstr}{line}'
# Strip trailing whitespace and the separating comma so the
# accumulated text is a standalone JSON object.
jstr = jstr.rstrip()
jstr = jstr.rstrip(',')
# NOTE(review): this slice is a no-op -- its result is discarded,
# not assigned back to jstr.
jstr[:-1]
print("------------")
# Guard against accumulating only brackets/whitespace.
if len(jstr) > 10:
print("making json")
j = json.loads(jstr)
print(jstr)
jstr = ""
line = fp.readline()
lnum += 1
continue
if first_curly_found:
jstr = f'{jstr}{line}'
line = fp.readline()
lnum += 1
# Safety stop for this demo: process at most 100 lines.
if lnum > 100:
break
You are starting more than one parsing iterations with the same file object without resetting it. The first call to ijson will work, but will move the file object to the end of the file; then the second time you pass the same.object to ijson it will complain because there is nothing to read from the file anymore.
Try opening the file each time you call ijson; alternatively you can seek to the beginning of the file after calling ijson so the file object can read your file data again.
if you are working with json with the following format you can use ijson.item()
sample json:
[
{"id":2,"cost":0,"test":0,"testid2":255909890011279,"test_id_3":0,"meeting":"daily","video":"paused"},
{"id":2,"cost":0,"test":0,"testid2":255909890011279,"test_id_3":0,"meeting":"daily","video":"paused"}
]
# Stream every element of a top-level JSON array from 'file.txt'
# (transparently handling gzip-compressed input) into a list.
src = 'file.txt'  # renamed from 'input', which shadowed the builtin
res = []
# Pick the opener by extension; ijson works on binary file objects.
if Path(src).suffix[1:].lower() == 'gz':
    input_file_handle = gzip.open(src, mode='rb')
else:
    input_file_handle = open(src, 'rb')
# 'item' matches each element of the top-level array, so rows are
# decoded one at a time instead of loading the whole file into memory.
with input_file_handle:  # close the handle (leaked in the original)
    for json_row in ijson.items(input_file_handle,
                                'item'):
        res.append(json_row)

Categories

Resources