Convert repetitive pattern into JSON file with Python

Hope you are doing fine.
I have a data file (partner.txt) containing thousands of records in a structured, repeating pattern, like below:
PARTNER="ABC"
ADDRESS1="ABC Country INN"
DEPARTMENT="ABC Department"
CONTACT_PERSON="HR"
TELEPHONE="+91.90.XX XX X XXX"
FAX="+01.XX.XX XX XX XX"
EMAIL=""
PARTNER="DEF"
ADDRESS1="DEF Malaysia"
DEPARTMENT=""
CONTACT_PERSON=""
TELEPHONE="(YYY)YYYYY"
FAX="(001)YYYYYYYY"
EMAIL=""
PARTNER="GEH-LOP"
ADDRESS1="GEH LOP Street"
DEPARTMENT="HR"
CONTACT_PERSON="Adam"
TELEPHONE="+91.ZZ.ZZ.ZZZZ"
FAX="+91.ZZ.ZZ.ZZZ"
EMAIL=""
I tried to convert the data file (partner.txt) to JSON with the code below:
Created empty dictionaries dict1 and dict2
Read the data file line by line
Used if not line.isspace() so that only non-empty lines are written into dict1
When a line break (empty line) appears, appended the content of dict1 to dict2 using dict2.update(dict1)
import json

dict1 = {}
dict2 = {}

with open("partner.txt", "r") as fh:
    out_file = open("test1.json", "w")
    for line in fh:
        if not line.isspace():
            command, description = line.strip().split("=")
            dict1[command] = description.strip('"')
        else:
            dict2.update(dict1)
            print("space found")

json.dump(dict2, out_file, indent=1)
out_file.close()
print("json file created")
But this code creates a JSON file (test1.json) with only a single PARTNER block, because every block shares the same keys and dict2.update() keeps overwriting them:
{
"PARTNER": "DEF",
"ADDRESS1": "DEF Malaysia",
"DEPARTMENT": "",
"CONTACT_PERSON": "",
"TELEPHONE": "(YYY)YYYYY",
"FAX": "(001)YYYYYYYY",
"EMAIL": ""
}
Expected Output
I looked around a lot but couldn't find a way to produce this:
{
"data":[
{
"PARTNER": "ABC",
"ADDRESS1": "ABC Country INN",
"DEPARTMENT": "ABC Department",
"CONTACT_PERSON": "HR",
"TELEPHONE": "+91.90.XX XX X XXX",
"FAX": "+01.XX.XX XX XX XX",
"EMAIL": ""
},
{
"PARTNER": "DEF",
"ADDRESS1": "DEF Malaysia",
"DEPARTMENT": "",
"CONTACT_PERSON": "",
"TELEPHONE": "(YYY)YYYYY",
"FAX": "(001)YYYYYYYY",
"EMAIL": ""
},
{
"PARTNER": "GEH-LOP",
"ADDRESS1": "GEH LOP Street",
"DEPARTMENT": "HR",
"CONTACT_PERSON": "Adam",
"TELEPHONE": "+91.ZZ.ZZ.ZZZZ",
"FAX": "+91.ZZ.ZZ.ZZZ",
"EMAIL": ""
}
]
}

You need to set dict1 to a new dict each time:
import json

dict1 = {}
dict2 = {}

with open("partner.txt", "r") as fh:
    out_file = open("test1.json", "w")
    for line in fh:
        if not line.isspace():
            command, description = line.strip().split("=")
            dict1[command] = description.strip('"')
        else:
            dict2.update(dict1)
            dict1 = {}  # set it to a new dict
            print("space found")

json.dump(dict2, out_file, indent=1)
out_file.close()
print("json file created")

You need to append each dict to a list of dictionaries rather than use update, because update keeps overwriting the keys, which are the same in every block:
import json

dict1 = {}
data = []

with open("partner.txt", "r") as fh:
    out_file = open("test1.json", "w")
    for line in fh:
        if not line.isspace():
            command, description = line.strip().split("=")
            dict1[command] = description.strip('"')
        else:
            data.append(dict1)
            dict1 = {}  # set it to a new dict
            print("space found")

output = {'data': data}
json.dump(output, out_file, indent=1)
out_file.close()
print("json file created")

There are many ways to do this; here is one that keeps the parsing logic in a small, maintainable helper:
def list_to_dict(lines):
    obj = {}
    for liner in lines:
        idx = liner.find("=")
        # key is everything before "=", value is the quoted text after it (quotes dropped by slicing)
        obj[liner[0:idx]] = liner[idx + 2 : len(liner) - 1]
    return obj

with open("file", "r") as f:
    results = []
    group = []
    for line in list(map(lambda x: x.strip(), f.read().split("\n"))):
        if line == "":
            results.append(list_to_dict(group))
            group = []
        else:
            group.append(line)

print(results)
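The snippet above only prints the list of dicts; to get the JSON file the question asks for, the result can be wrapped and dumped the same way as in the earlier answers (file name test1.json reused from above):
import json

with open("test1.json", "w") as out_file:
    json.dump({"data": results}, out_file, indent=1)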

Solution
Using regex + json + dict/list comprehension
You can do this using the re (regular expression) and json libraries together. The text processing is carried out with a regular expression, and the json library is then used to format the resulting dictionary as JSON and write it to a .json file.
Additionally, dict and list comprehensions are used to gather the intended fields.
Note:
The regex pattern used here is as follows:
# longer, manually written version
pat = r'PARTNER="(.*)"\n\s*ADDRESS1="(.*)"\n\s*DEPARTMENT="(.*)"\n\s*CONTACT_PERSON="(.*)"\n\s*TELEPHONE="(.*)"\n\s*FAX="(.*)"\n\s*EMAIL="(.*)"'
# shorter, equivalent automated version (note the raw strings, so the backslashes reach the regex engine unchanged)
pat = r'="(.*)"\n\s*'.join(field_labels) + r'="(.*)"'
Code
import re
import json

# Read the data from file (or use the dummy data shown further below)
with open("partner.txt", "r") as f:
    s = f.read()

field_labels = [
    'PARTNER',
    'ADDRESS1',
    'DEPARTMENT',
    'CONTACT_PERSON',
    'TELEPHONE',
    'FAX',
    'EMAIL'
]

# Define regex pattern and compile for speed
pat = r'="(.*)"\n\s*'.join(field_labels) + r'="(.*)"'
pat = re.compile(pat)

# Extract target fields
data = pat.findall(s)

# Prepare a list of dicts: each dict holds a single block of data
d = [dict((k, v) for k, v in zip(field_labels, field_values)) for field_values in data]

text = json.dumps({'data': d}, indent=2)
print(text)

# Write to a json file
with open('output.json', 'w') as f:
    f.write(text)
Output:
# output.json
{
"data": [
{
"PARTNER": "ABC",
"ADDRESS1": "ABC Country INN",
"DEPARTMENT": "ABC Department",
"CONTACT_PERSON": "HR",
"TELEPHONE": "+91.90.XX XX X XXX",
"FAX": "+01.XX.XX XX XX XX",
"EMAIL": ""
},
{
"PARTNER": "DEF",
"ADDRESS1": "DEF Malaysia",
"DEPARTMENT": "",
"CONTACT_PERSON": "",
"TELEPHONE": "(YYY)YYYYY",
"FAX": "(001)YYYYYYYY",
"EMAIL": ""
},
{
"PARTNER": "GEH-LOP",
"ADDRESS1": "GEH LOP Street",
"DEPARTMENT": "HR",
"CONTACT_PERSON": "Adam",
"TELEPHONE": "+91.ZZ.ZZ.ZZZZ",
"FAX": "+91.ZZ.ZZ.ZZZ",
"EMAIL": ""
}
]
}
Dummy Data
# Dummy Data
s = """
PARTNER="ABC"
ADDRESS1="ABC Country INN"
DEPARTMENT="ABC Department"
CONTACT_PERSON="HR"
TELEPHONE="+91.90.XX XX X XXX"
FAX="+01.XX.XX XX XX XX"
EMAIL=""
PARTNER="DEF"
ADDRESS1="DEF Malaysia"
DEPARTMENT=""
CONTACT_PERSON=""
TELEPHONE="(YYY)YYYYY"
FAX="(001)YYYYYYYY"
EMAIL=""
PARTNER="GEH-LOP"
ADDRESS1="GEH LOP Street"
DEPARTMENT="HR"
CONTACT_PERSON="Adam"
TELEPHONE="+91.ZZ.ZZ.ZZZZ"
FAX="+91.ZZ.ZZ.ZZZ"
EMAIL=""
"""

Related

Converting text file to json

I have a text file and I want to convert it to JSON:
red|2022-09-29|03:15:00|info 1
blue|2022-09-29|10:50:00|
yellow|2022-09-29|07:15:00|info 2
so I wrote a script to convert this file into JSON:
import json

filename = 'input_file.txt'
dict1 = {}
fields = ['name', 'date', 'time', 'info']

with open(filename) as fh:
    l = 1
    for line in fh:
        description = list(line.strip().split("|", 4))
        print(description)
        sno = 'name' + str(l)
        i = 0
        dict2 = {}
        while i < len(fields):
            dict2[fields[i]] = description[i]
            i = i + 1
        dict1[sno] = dict2
        l = l + 1

out_file = open("json_file.json", "w")
json.dump(dict1, out_file, indent=4)
out_file.close()
and the output looks like this:
{
"name1": {
"name": "red",
"date": "2022-09-29",
"time": "03:15:00",
"info": "info 1"
},
"name2": {
"name": "blue",
"date": "2022-09-29",
"time": "10:50:00",
"info": ""
},
"name3": {
"name": "yellow",
"date": "2022-09-29",
"time": "07:15:00",
"info": "info 2"
}
}
As you can see it works, but now I want to change the shape of this JSON file. How can I change it to make my output look like this instead:
[
{"name":"red", "date": "2022-09-29", "time": "03:15:00", "info":"info 1"},
{"name":"blue", "date": "2022-09-29", "time": "10:50:00", "info":""},
{"name":"yellow", "date": "2022-09-29", "time": "07:15:00", "info":"info 2"}
]
Your required JSON output is a list, not a dict like you have right now, so using a list (data) instead of a dict (dict1) gives the correct structure.
The following updated code should generate the JSON data in the required format:
import json

filename = 'input_file.txt'
data = []
fields = ['name', 'date', 'time', 'info']

with open(filename) as fh:
    l = 1
    for line in fh:
        description = list(line.strip().split("|", 4))
        print(description)
        sno = 'name' + str(l)
        i = 0
        dict2 = {}
        while i < len(fields):
            dict2[fields[i]] = description[i]
            i = i + 1
        data.append(dict2)
        l = l + 1

out_file = open("json_file.json", "w")
json.dump(data, out_file, indent=4)
out_file.close()
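As a small tidy-up (my own suggestion, not part of the original answer): once the output is a list, sno and the counters are no longer needed, and the inner while loop can be replaced by zip:
import json

fields = ['name', 'date', 'time', 'info']
data = []

with open('input_file.txt') as fh:
    for line in fh:
        values = line.strip().split("|", 4)
        # pair each field name with its value; assumes every line has all four fields
        data.append(dict(zip(fields, values)))

with open("json_file.json", "w") as out_file:
    json.dump(data, out_file, indent=4)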
I would use pandas; it lets you solve your problem in one statement and avoids reinventing the wheel:
import pandas as pd

pd.read_table("input_file.txt", sep="|", header=None,
              names=["name", "date", "time", "info"]).fillna("")\
  .to_json("json_file.json", orient="records")
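To see what lands in the file, the same call can be pointed at stdout instead of a path; for the three input lines above it should print a records-style array, roughly as in the comment below (the actual output is a single line without extra whitespace):
import pandas as pd

df = pd.read_table("input_file.txt", sep="|", header=None,
                   names=["name", "date", "time", "info"]).fillna("")
print(df.to_json(orient="records"))
# [{"name":"red","date":"2022-09-29","time":"03:15:00","info":"info 1"},
#  {"name":"blue","date":"2022-09-29","time":"10:50:00","info":""},
#  {"name":"yellow","date":"2022-09-29","time":"07:15:00","info":"info 2"}]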

Writing a List of Dictionaries to separate JSONs

I have a dictionary in which each value is a list.
I want to write the individual items to separate JSON files.
For example:
data_to_write = {"Names": ["name1", "name2", "name3"], "email": ["mail1", "mail2", "mail3"]}
Now I want 3 JSON files, i.e. data1.json, data2.json, data3.json, in the following (approximate) format.
data1.json
{
Name: name1,
email: mail1
}
data2.json
{
Name: name2,
email: mail2
}
and so on.
My current approach is:
for file_no in range(no_of_files):
    for count, (key, info_list) in enumerate(data_to_write.items()):
        for info in info_list:
            with open(
                os.path.join(self.path_to_output_dir, str(file_no)) + ".json",
                "a",
            ) as resume:
                json.dump({key: info}, resume)
But this is wrong. Any help appreciated.
You could use pandas to do the work for you. Read the dictionary into a dataframe, then iterate the rows of the dataframe to produce the json for each row:
import pandas as pd

data_to_write = {"Names": ["name1", "name2", "name3"], "email": ["mail1", "mail2", "mail3"]}
df = pd.DataFrame(data_to_write).rename(columns={'Names': 'Name'})

for i in range(len(df)):
    jstr = df.iloc[i].to_json()
    with open(f"data{i+1}.json", "w") as f:
        f.write(jstr)
Output (each line is in a separate file):
{"Name":"name1","email":"mail1"}
{"Name":"name2","email":"mail2"}
{"Name":"name3","email":"mail3"}
Try:
import json

data_to_write = {
    "Names": ["name1", "name2", "name3"],
    "email": ["mail1", "mail2", "mail3"],
}

for i, val in enumerate(zip(*data_to_write.values()), 1):
    d = dict(zip(data_to_write, val))
    with open(f"data{i}.json", "w") as f_out:
        json.dump(d, f_out, indent=4)
This writes data(1..3).json with content:
# data1.json
{
"Names": "name1",
"email": "mail1"
}
# data2.json
{
"Names": "name2",
"email": "mail2"
}
...
import json

data_to_write = {
    "Names": ["name1", "name2", "name3"],
    "email": ["mail1", "mail2", "mail3"],
}

for ind, val in enumerate(zip(*data_to_write.values())):
    jsn = dict(zip(data_to_write, val))
    print(jsn)
    with open("data{}.json".format(ind), "w") as f:
        f.write(json.dumps(jsn))
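Both of the last two answers hinge on zip(*data_to_write.values()), which transposes the column-oriented dict into one tuple per record; a quick illustration of what it produces:
data_to_write = {
    "Names": ["name1", "name2", "name3"],
    "email": ["mail1", "mail2", "mail3"],
}

print(list(zip(*data_to_write.values())))
# [('name1', 'mail1'), ('name2', 'mail2'), ('name3', 'mail3')]

print(dict(zip(data_to_write, ('name1', 'mail1'))))
# {'Names': 'name1', 'email': 'mail1'}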

How to fix the output for converting to JSON

I wrote code in Python that converts a file with these objects to JSON. It produces valid JSON, but the output is not exactly what I need.
{
name: (sindey, crosby)
game: "Hockey"
type: athlete
},
{
name: (wayne, gretzky)
game: "Ice Hockey"
type: athlete
}
Code:
import json

f = open("log.file", "r")
content = f.read()
splitcontent = content.splitlines()

d = []
for line in splitcontent:
    appendage = {}
    if ('}' in line) or ('{' in line):
        # Append a just-created record and start a new one
        continue
    d.append(appendage)
    key, val = line.split(':')
    if val.endswith(','):
        # strip a trailing comma
        val = val[:-1]
    appendage[key] = val

with open("json_log.json", 'w') as file:
    file.write(json.dumps(d, indent=4, sort_keys=False))
Desired output:
[
{
"name": "(sindey, crosby)",
"game": "Hockey",
"type": "athlete"
},
{
"name": "(wayne, gretzky)",
"game": "Ice Hockey",
"type": "athlete"
}
]
But I'm getting:
[
{
" name": " (sindey, crosby)"
},
{
" game": " \"Hockey\""
},
{
" type": " athlete"
},
{
" name": " (wayne, gretzky)"
},
{
" game": " \"Ice Hockey\""
},
{
" type": " athlete"
}
]
Is there any way to fix it so I get the desired output, without the {} wrapped around each individual line?
It's usually a good idea to split parsing into simpler tasks, e.g. first parse records, then parse fields.
I'm skipping the file handling and using a text variable:
intxt = """
{
name: (sindey, crosby)
game: "Hockey"
type: athlete
},
{
name: (wayne, gretzky)
game: "Ice Hockey"
type: athlete
}
"""
Then create a function that can yield all lines that are part of a record:
import json

def parse_records(txt):
    reclines = []
    for line in txt.split('\n'):
        if ':' not in line:
            if reclines:
                yield reclines
                reclines = []
        else:
            reclines.append(line)
and a function that takes those lines and parses each key/value pair:
def parse_fields(reclines):
    res = {}
    for line in reclines:
        key, val = line.strip().rstrip(',').split(':', 1)
        res[key.strip()] = val.strip()
    return res
the main function becomes trivial:
res = []
for rec in parse_records(intxt):
    res.append(parse_fields(rec))

print(json.dumps(res, indent=4))
the output, close to the desired one (note that the embedded double quotes around values are kept):
[
{
"name": "(sindey, crosby)",
"game": "\"Hockey\"",
"type": "athlete"
},
{
"name": "(wayne, gretzky)",
"game": "\"Ice Hockey\"",
"type": "athlete"
}
]
The parsing functions can of course be made better, but you get the idea.
Yes, I hadn't checked the output properly; I have reworked the logic now. The output is as expected.
import json

f = open("log.file", "r")
content = f.read()
print(content)
splitcontent = content.splitlines()

d = []
for line in splitcontent:
    if "{" in line:
        appendage = {}
    elif "}" in line:
        d.append(appendage)
    else:
        key, val = line.split(':')
        appendage[key.strip()] = val.strip()

with open("json_log.json", 'w') as file:
    file.write(json.dumps(d, indent=4, sort_keys=False))
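One detail in both answers when compared with the desired output in the question: values such as "Hockey" keep their literal double quotes, so they come out as "\"Hockey\"". If those quotes should go as well, stripping them when the value is stored is enough; for example, in the second answer's else branch (the same tweak works in parse_fields above):
        else:
            key, val = line.split(':', 1)                    # maxsplit=1 is just a defensive tweak
            appendage[key.strip()] = val.strip().strip('"')  # drop surrounding double quotes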

How to convert nested JSON files to CSV in python

I am completely new to Python and am trying to convert nested JSON files to CSV. The current code I am trying to use is:
import json

def read_json(filename: str) -> dict:
    try:
        with open(filename, "r") as f:
            data = json.loads(f.read())
    except:
        raise Exception(f"Reading {filename} file encountered an error")
    return data

def normalize_json(data: dict) -> dict:
    new_data = dict()
    for key, value in data.items():
        if not isinstance(value, dict):
            new_data[key] = value
        else:
            for k, v in value.items():
                new_data[key + "_" + k] = v
    return new_data

def generate_csv_data(data: dict) -> str:
    # Defining CSV columns in a list to maintain
    # the order
    csv_columns = data.keys()
    # Generate the first row of CSV
    csv_data = ",".join(csv_columns) + "\n"
    # Generate the single record present
    new_row = list()
    for col in csv_columns:
        new_row.append(str(data[col]))
    # Concatenate the record with the column information
    # in CSV format
    csv_data += ",".join(new_row) + "\n"
    return csv_data

def write_to_file(data: str, filepath: str) -> bool:
    try:
        with open(filepath, "w+") as f:
            f.write(data)
    except:
        raise Exception(f"Saving data to {filepath} encountered an error")

def main():
    # Read the JSON file as python dictionary
    data = read_json(filename="test2.json")
    # Normalize the nested python dict
    new_data = normalize_json(data=data)
    # Pretty print the new dict object
    print("New dict:", new_data)
    # Generate the desired CSV data
    csv_data = generate_csv_data(data=new_data)
    # Save the generated CSV data to a CSV file
    write_to_file(data=csv_data, filepath="data2.csv")

if __name__ == '__main__':
    main()
It works partly: I get a CSV file that contains all values. However, for the nested key fields it only gives me the "highest" level (e.g. I get "currentEmployments" but not "currentEmployments_firmId").
Could someone help me with this?
Sample json file:
{
  "basicInformation": {
    "individualId": 10000,
    "firstName": "Name",
    "middleName": "middleName.",
    "lastName": "lastName",
    "bcScope": "Active",
    "iaScope": "NotInScope",
    "daysInIndustryCalculatedDate": "1/1/2000"
  },
  "currentEmployments": [
    {
      "firmId": 1,
      "firmName": "firm1",
      "iaOnly": "N",
      "registrationBeginDate": "1/1/2005",
      "firmBCScope": "ACTIVE",
      "firmIAScope": "ACTIVE",
      "iaSECNumber": "10000",
      "iaSECNumberType": "100",
      "bdSECNumber": "1000",
      "branchOfficeLocations": [
        {
          "locatedAtFlag": "Y",
          "supervisedFromFlag": "N",
          "privateResidenceFlag": "N",
          "branchOfficeId": "10000",
          "street1": "street1",
          "city": "city",
          "state": "MD",
          "country": "United States",
          "zipCode": "10000"
        }
      ]
    }
  ],
  "currentIAEmployments": [],
  "previousEmployments": [
    {
      "iaOnly": "N",
      "bdSECNumber": "20000",
      "firmId": 200,
      "firmName": "firm2",
      "street1": "street",
      "city": "city",
      "state": "MD",
      "country": "UNITED STATES",
      "zipCode": "10000"
    }
  ],
  "examsCount": {
    "stateExamCount": 0,
    "principalExamCount": 0,
    "productExamCount": 1
  }
}
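The nesting stops at one level because normalize_json only descends into dict values one level deep and skips lists entirely, which is why currentEmployments_firmId never shows up. A hedged sketch of a recursive flattener (my own suggestion, not from the original code; list elements are indexed into the column name, e.g. currentEmployments_0_firmId):
def flatten(value, prefix="", sep="_"):
    """Recursively flatten nested dicts and lists into a single-level dict."""
    flat = {}
    if isinstance(value, dict):
        for k, v in value.items():
            flat.update(flatten(v, f"{prefix}{sep}{k}" if prefix else k, sep))
    elif isinstance(value, list):
        for i, v in enumerate(value):
            flat.update(flatten(v, f"{prefix}{sep}{i}" if prefix else str(i), sep))
    else:
        flat[prefix] = value
    return flat
Swapping this in for normalize_json(data) in main() should yield columns such as currentEmployments_0_branchOfficeLocations_0_city; note that empty lists (like currentIAEmployments) simply drop out of the flattened result.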

converting text file to json in python

I have multiple documents, approximately 400 GB in total, and I want to convert them to JSON format in order to load them into Elasticsearch for analysis.
Each file is approximately 200 MB.
Original file looked like:
IUGJHHGF#BERLIN:lhfrjy
0t7yfudf#WARSAW:qweokm246
0t7yfudf#CRACOW:Er747474
0t7yfudf#cracow:kui666666
000t7yf#Vienna:1йй2ц2й2цй2цц3у
The data contains characters that are not only English. key1 is always separated by #, and the city is separated by either ; or :.
I parsed it with this code:
#!/usr/bin/env python
# coding: utf8
import json

with open('2') as f:
    for line in f:
        s1 = line.find("#")
        rest = line[s1 + 1:]
        if rest.find(";") != -1:
            if rest.find(":") != -1:
                print("FOUND BOTH : ; ")
                s2 = -0
            else:
                s2 = s1 + 1 + rest.find(";")
        elif rest.find(":") != -1:
            s2 = s1 + 1 + rest.find(":")
        else:
            print("FOUND NO : ; ")
            s2 = -0
        key1 = line[:s1]
        city = line[s1 + 1:s2]
        description = line[s2 + 1:len(line) - 1]
The whole file looks like:
RRS12345 Cracow Sunflowers
RRD12345 Berin Data
After that parsing, I want to have this output:
{
"location_data":[
{
"key1":"RRS12345",
"city":"Cracow",
"description":"Sunflowers"
},
{
"key1":"RRD123dsd45",
"city":"Berlin",
"description":"Data"
},
{
"key1":"RRD123dsds45",
"city":"Berlin",
"description":"1йй2ц2й2цй2цц3у"
}
]
}
How can I convert it to the required JSON format quickly, given that the data does not contain only English characters?
import json

def process_text_to_json():
    location_data = []
    with open("file.txt") as f:
        for line in f:
            line = line.split()
            location_data.append({"key1": line[0], "city": line[1], "description": line[2]})
    location_data = {"location_data": location_data}
    return json.dumps(location_data)
Output sample:
{"location_data": [{"city": "Cracow", "key1": "RRS12345", "description": "Sunflowers"}, {"city": "Berin", "key1": "RRD12345", "description": "Data"}, {"city": "Cracow2", "key1": "RRS12346", "description": "Sunflowers"}, {"city": "Berin2", "key1": "RRD12346", "description": "Data"}, {"city": "Cracow3", "key1": "RRS12346", "description": "Sunflowers"}, {"city": "Berin3", "key1": "RRD12346", "description": "Data"}]}
Iterate over each line and form your dict.
Ex:
d = {"location_data":[]}
with open(filename, "r") as infile:
for line in infile:
val = line.split()
d["location_data"].append({"key1": val[0], "city": val[1], "description": val[2]})
print(d)
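One assumption in both snippets above: line.split() splits on every run of whitespace, so a description that itself contains spaces would be chopped into extra pieces. If that can happen, limiting the split keeps the rest of the line intact (file name reused from the first snippet):
d = {"location_data": []}
with open("file.txt") as infile:
    for line in infile:
        # split on whitespace at most twice: key1, city, then everything else as the description
        key1, city, description = line.split(None, 2)
        d["location_data"].append({"key1": key1, "city": city, "description": description.strip()})
print(d)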
