How to Convert Multiple Text Files Into Multiple JSON Files - Python

I have multiple text files which I need to convert to json files. For each text file I want an individual json file.
Text file content
File-1.txt
['education~25,850,103,23', 'experience~28,94,107,27', 'skills~29,904,59,27']
File-2.txt
['introduction~211,143,87,13', 'education~169,302,131,17', 'skills~322,421,84,15', 'experience~325,142,112,14', 'reference~320,699,68,14']
and so on ...
The expected output is a json file which contains:
Keyword (class name)
Values (coordinates)
This is what I tried; with this code I was able to write the data into a txt file:
with open(PATH_TO_RESULTS + '/' + os.path.join(os.path.basename(os.path.dirname(image_path))) + '.txt', 'w') as f:
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    # f.write((image_path + '|'))
    req_fields = []
    for key, value in field_item.items():
        # print("=====================")
        # print(key)
        # print((scores[0, index]))
        # print(value)
        # print("==================")
        merge = str(key.decode('utf-8')) + '~' + str(value)
        req_fields.append(merge)
    f.write(str(req_fields))
    print("#######################Required Fields###########################", req_fields)
And one more thing: the json file name should be the same as the txt file name.

I think this is what you need, or at least close to it.
You can improve and adapt it (better naming, for example).
import glob, os
import json

os.chdir(".")

def read_file(file):
    with open(file, 'r') as file:
        return file.read()

def write_json(file, data):
    with open(file, 'w') as fout:
        json.dump(data, fout, indent=4)

for file in glob.glob("*.txt"):
    content = read_file(file)
    to_parse_in_rows = content.replace('[', '').replace(']', '').split(', ')
    rows = []
    for part in to_parse_in_rows:
        field12, field3, field4, field5 = part.replace("'", '').split(',')
        field1, field2 = field12.split('~')
        row = {
            'class': field1,
            'field2': int(field2),
            'field3': int(field3),
            'field4': int(field4),
            'field5': int(field5)
        }
        rows.append(row)
    write_json(file.replace('.txt', '.json'), rows)
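For File-1.txt from the question, this writes a File-1.json along these lines (only the first record is shown; the experience and skills entries follow the same pattern, and field2..field5 are just the placeholder names used in the sketch above):
[
    {
        "class": "education",
        "field2": 25,
        "field3": 850,
        "field4": 103,
        "field5": 23
    },
    ...
]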

Related

How to convert a JSON file that contains Unicode to a string and save it as a JSON file in Python?

I have a large JSON file. My goal is to minify it using Python.
The JSON file contains Arabic and Bengali characters.
My problem is that when I minify it, I get Unicode escape sequences instead of the normal characters, like \u09c7.
How can I save the minified file with the normal characters?
Below is my code:
import json
filename = 'quran.json' # file name we want to compress
newname = filename.replace('.json', '.min.json') # Output file name
fp = open(filename, encoding="utf8")
print("Compressing file: " + filename)
print('Compressing...')
jload = json.load(fp)
newfile = json.dumps(jload, indent=None, separators=(',', ':'))
newfile = str.encode(newfile)
f = open(newname, 'wb')
f.write(newfile)
f.close()
print('Compression complete!')
Here is the file link in case you want to try: https://raw.githubusercontent.com/nhridoy/quran-api/main/v1/quran.json
It needs the ensure_ascii=False flag in json.dumps():
import json
filename = 'quran.json' # file name we want to compress
newname = filename.replace('.json', '.min.json') # Output file name
with open(filename, encoding="utf8") as fp:
    print("Compressing file: " + filename)
    print('Compressing...')
    jload = json.load(fp)
    newfile = json.dumps(jload, indent=None, separators=(',', ':'), ensure_ascii=False)
    # newfile = str.encode(newfile) # remove this
    with open(newname, 'w', encoding="utf8") as f: # add encoding="utf8"
        f.write(newfile)
    print('Compression complete!')
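To see what ensure_ascii changes, here is a small made-up example (the sample string is not taken from quran.json):
import json

sample = {"text": "সূরা"}  # made-up Bengali sample
print(json.dumps(sample))                      # escapes every character as \uXXXX sequences
print(json.dumps(sample, ensure_ascii=False))  # keeps the characters as-is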

Python JSON to CSV, extract the specified keys, KeyError

I need to convert JSON to CSV. I only want to extract some keys from the file, but some keys do not exist in every record; I would like the missing keys to be filled in automatically.
import csv
import json
import sys
import codecs

def trans(path):
    jsonData = codecs.open('C:/Users/jeri/Desktop/1.json', 'r', 'utf-8')
    # csvfile = open(path + '.csv', 'w')
    # csvfile = open(path + '.csv', 'wb')
    csvfile = open('C:/Users/jeri/Desktop/1.csv', 'w', newline='', encoding='utf-8')
    writer = csv.writer(csvfile, delimiter=',')
    keys = ['dob', 'firstname', 'lastname']
    writer.writerow(keys)
    for line in jsonData:
        dic = json.loads(line)
        writer.writerow([dic['dob'], dic['firstname'], dic['lastname']])
    jsonData.close()
    csvfile.close()

if __name__ == '__main__':
    path = str(sys.argv[0])
    print(path)
    trans(path)
Console output:
Traceback (most recent call last):
File "C:\Users\jeri\PycharmProjects\pythonProject9\main.py", line 25, in <module>
trans(path)
File "C:\Users\jeri\PycharmProjects\pythonProject9\main.py", line 17, in trans
writer.writerow([dic['dob'],dic['firstname'],dic['lastname'],])
KeyError: 'dob'
If the key 'dob' might be missing, instead of dic['dob'], do dic.get('dob', None). That provides the default you want.
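Applied to the loop from the question, that looks like this (None is just one possible default value):
for line in jsonData:
    dic = json.loads(line)
    writer.writerow([dic.get('dob', None), dic.get('firstname', None), dic.get('lastname', None)])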
I think this would solve your problem.
(I defined a function that tests whether each item exists in the JSON record; if it exists it returns the value, otherwise it returns 'N/A'.)
def getValue(dic, item):
    try:
        return dic[item]
    except KeyError:
        return 'N/A'

for line in jsonData:
    dic = json.loads(line)
    writer.writerow([getValue(dic, 'dob'), getValue(dic, 'firstname'), getValue(dic, 'lastname')])
You can transform your for loop into something like this:
for line in jsonData:
    dic = json.loads(line)
    dob = dic['dob'] if "dob" in dic else None
    firstname = dic['firstname'] if "firstname" in dic else None
    lastname = dic['lastname'] if "lastname" in dic else None
    writer.writerow([dob, firstname, lastname])

Write output from a loop to a csv

I have a script which predicts product names from input files. The code is as follows:
output_dir = "C:\\Users\\Lenovo\\.spyder-py3\\NER_training"
DIR = 'C:\\Users\\Lenovo\\.spyder-py3\\Testing\\'
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
with open('eng_productnames.csv', newline='') as myFile:
reader = csv.reader(myFile)
for rowz in reader:
try:
filenamez = rowz[1]
file = open(DIR+filenamez, "r", encoding ='utf-8')
filecontentszz = file.read()
for s in filecontentszz:
filecontentszz = re.sub(r'\s+', ' ', filecontentszz)
#filecontents = filecontents.encode().decode('unicode-escape')
filecontentszz = ''.join([line.lower() for line in filecontentszz])
doc2 = nlp2(filecontentszz)
for ent in doc2.ents:
print(filenamez, ent.label_, ent.text)
break
except Exception as e:`
which gives me output in the form of a string, such as:
07-09-18 N021024s16PASBUNDLEACK - Acknowledgement P.txt PRODUCT ABC1
06-22-18 Letter from Supl.txt PRODUCT ABC2
06-22-18 Letter from Req to Change .txt PRODUCT ABC3
Now I want to export all these details to a csv with 2 columns: one column FILENAME and one column PRODUCT, with all filenames and product names under the respective column names. All product names start with PRODUCT followed by the name in the string. How can I solve this?
Output csv should look like:
Filename PRODUCT
07-09-18 Acknowledgement P.txt ABC1
06-22-18 Letter Req to Change.txt ABC2
You can make a csv.writer to write each row to the output file, using writerow instead of printing to the screen.
output_dir = "C:\\Users\\Lenovo\\.spyder-py3\\NER_training"
DIR = 'C:\\Users\\Lenovo\\.spyder-py3\\Testing\\'
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
with open('eng_productnames.csv', newline='') as input_file, \
open('output.csv', 'w') as output_file:
reader = csv.reader(input_file)
writer = csv.writer(output_file)
writer.writerow(["Filename", "Product"]) # this is the header row
for rowz in reader:
try:
filenamez = rowz[1]
file = open(DIR+filenamez, "r", encoding ='utf-8')
filecontentszz = file.read()
for s in filecontentszz:
filecontentszz = re.sub(r'\s+', ' ', filecontentszz)
#filecontents = filecontents.encode().decode('unicode-escape')
filecontentszz = ''.join([line.lower() for line in filecontentszz])
doc2 = nlp2(filecontentszz)
for ent in doc2.ents:
writer.writerow([filenamez, ent.text])
break
I'm assuming here that filenamez and ent.text contain the information you want in each column. If that's not the case then you can manipulate them to get what you need before writing to the CSV.
There are many ways you can achieve this. One that I prefer is by using Pandas, which is a powerful library to work with CSV files.
You can create a dictionary:
predicted_products = {'FILENAME': [], 'PRODUCT': []}
and iteratively append filenames and products to the corresponding lists.
After that is done, convert predicted_products to a DataFrame and call its to_csv function:
import pandas as pd

predicted_products_df = pd.DataFrame.from_dict(predicted_products)
predicted_products_df.to_csv('your_path/file_name.csv')
I prefer this way, since you can edit the data more easily before saving the file.
Applied to your existing code, I assume that print(filenamez, ent.label_, ent.text) prints the output. If so, then:
import pandas as pd

output_dir = "C:\\Users\\Lenovo\\.spyder-py3\\NER_training"
DIR = 'C:\\Users\\Lenovo\\.spyder-py3\\Testing\\'
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
predicted_products = {'FILENAME': [], 'PRODUCT': []}
with open('eng_productnames.csv', newline='') as myFile:
    reader = csv.reader(myFile)
    for rowz in reader:
        try:
            filenamez = rowz[1]
            file = open(DIR + filenamez, "r", encoding='utf-8')
            filecontentszz = file.read()
            for s in filecontentszz:
                filecontentszz = re.sub(r'\s+', ' ', filecontentszz)
                # filecontents = filecontents.encode().decode('unicode-escape')
            filecontentszz = ''.join([line.lower() for line in filecontentszz])
            doc2 = nlp2(filecontentszz)
            for ent in doc2.ents:
                print(filenamez, ent.label_, ent.text)
                predicted_products['FILENAME'].append(filenamez + ' ' + ent.label_)
                predicted_products['PRODUCT'].append(ent.text)
                break
        except Exception as e:
            pass  # exception handling omitted in the original

predicted_products_df = pd.DataFrame.from_dict(predicted_products)
predicted_products_df.to_csv('your_path/file_name.csv')

How to loop through a list of strings in Python

I'm a bit new to Python and I am trying to simplify my existing code.
Right now, I have the code repeated 5 times with different strings. I'd like to have the code one time and have it run through a list of strings.
Currently what I have:
def wiScanFormat():
    File = open("/home/pi/gpsMaster/WiScan.txt", "r")
    data = File.read()
    File.close()
    MAC = data.replace("Address:", "\nAddress, ")
    File = open("/home/pi/gpsMaster/WiScan.txt", "w")
    File.write(MAC)
    File.close()

    File = open("/home/pi/gpsMaster/WiScan.txt", "r")
    data = File.read()
    File.close()
    SSID = data.replace("ESSID:", "\nESSID, ")
    File = open("/home/pi/gpsMaster/WiScan.txt", "w")
    File.write(SSID)
    File.close()

    File = open("/home/pi/gpsMaster/WiScan.txt", "r")
    data = File.read()
    File.close()
    FREQ = data.replace("Frequency:", "\nFrequency, ")
    File = open("/home/pi/gpsMaster/WiScan.txt", "w")
    File.write(FREQ)
    File.close()

    File = open("/home/pi/gpsMaster/WiScan.txt", "r")
    data = File.read()
    File.close()
    QUAL = data.replace("Quality", "\nQuality, ")
    File = open("/home/pi/gpsMaster/WiScan.txt", "w")
    File.write(QUAL)
    File.close()

    File = open("/home/pi/gpsMaster/WiScan.txt", "r")
    data = File.read()
    File.close()
    SIG = data.replace("Signal level", "\nSignal Level, ")
    File = open("/home/pi/gpsMaster/WiScan.txt", "w")
    File.write(SIG)
    File.close()
What I'd like to have:
ORG = ['Address:', 'ESSID:'...etc]
NEW = ['\nAddress, ' , '\nESSID, ' , ... etc]
and run that through:
File = open("/home/pi/gpsMaster/WiScan.txt", "r")
data = File.read()
File.close()
ID = data.replace("ORG", "NEW")
File = open("/home/pi/gpsMaster/WiScan.txt", "w")
File.write(ID)
File.close()
I've tried running exactly what I put up, but it does not seem to format it the way I need to.
The output from above looks like:
Cell 46 - Address: xx:xx:xx:xx:xx:xx ESSID:"MySSID" Frequency:2.412 GHz (Channel 1) Quality=47/100 Signal level=48/100 Quality=47/100 Signal level=48/100
But it is supposed to look like this (And it does when I run that same block over the strings separately):
xx:xx:xx:xx:xx:xx MySSID 5.18 GHz (Channel 36) 0.81 0.99
How should I go about looping this block of code through my list of strings?
There are two strings that I would need for the find and replace, old and new, so they would have to work together. These lists will be the same size, obviously, and I need them to be in the correct order: Address with address, ESSID with ESSID, etc.
Thanks in advance!
Try something like this:
ORG = ['Address:', 'ESSID:']  # ...etc
NEW = ['\nAddress, ', '\nESSID, ']  # ...etc

File = open("/home/pi/gpsMaster/WiScan.txt", "r")
data = File.read()
File.close()

for org, new in zip(ORG, NEW):
    data = data.replace(org, new)

File = open("/home/pi/gpsMaster/WiScan.txt", "w")
File.write(data)
File.close()
(Note the way zip works: https://docs.python.org/2/library/functions.html#zip)
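For instance, zip simply pairs the two lists up positionally, which is why they must be the same length and in matching order:
ORG = ['Address:', 'ESSID:']
NEW = ['\nAddress, ', '\nESSID, ']
print(list(zip(ORG, NEW)))
# [('Address:', '\nAddress, '), ('ESSID:', '\nESSID, ')]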
If I am reading your question right, you are opening the same file, making a small alteration, saving it, and then closing it again, five times. You could just open it once, make all the alterations, and then save it. For instance, like this:
filename = "/home/pi/gpsMaster/WiScan.txt"
with open(filename, 'r') as fin:
data = fin.read()
data = data.replace("Address:", "\nAddress, ")
data = data.replace("ESSID:", "\nESSID, ")
data = data.replace("Frequency:", "\nFrequency, ")
data = data.replace("Quality", "\nQuality, ")
data = data.replace("Signal level", "\nSignal Level, ")
with open(filename, 'w') as fout:
fout.write(data)
If you want to use lists (ORG and NEW) for your replacements, you could do this:
with open(filename, 'r') as fin:
    data = fin.read()

for o, n in zip(ORG, NEW):
    data = data.replace(o, n)

with open(filename, 'w') as fout:
    fout.write(data)
Given your ORG and NEW, the simplest way to do this would be something like:
# Open once for both read and write; use with statement for guaranteed close at end of block
with open("/home/pi/gpsMaster/WiScan.txt", "r+") as f:
    data = f.read()  # Slurp file
    f.seek(0)  # Seek back to beginning of file
    # Perform all replacements
    for orig, repl in zip(ORG, NEW):
        data = data.replace(orig, repl)
    f.write(data)  # Write new data over old
    f.truncate()  # If replacement shrunk file, truncate extra
You could just do this:
def wiScanFormat(path="/home/pi/gpsMaster/WiScan.txt"):
    # List of tuples with strings to find and strings to replace with
    replacestr = [
        ("Address:", "\nAddress, "),
        ("ESSID:", "\nESSID, "),
        ("Frequency:", "\nFrequency, "),
        ("Quality", "\nQuality, "),
        ("Signal level", "\nSignal Level, ")
    ]
    with open(path, "r") as file:  # Open a file
        data = file.read()
    formated = data
    for i in replacestr:  # Loop over each element (tuple) in the list
        formated = formated.replace(i[0], i[1])  # Replace the data
    with open(path, "w") as file:
        written = file.write(formated)  # Write the data
    return written
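Usage is then a single call (assuming the default path exists); it rewrites the file in place and returns the character count from file.write:
chars_written = wiScanFormat()
print(chars_written)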

How to convert CSV file to multiline JSON?

Here's my code, really simple stuff...
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
out = json.dumps( [ row for row in reader ] )
jsonfile.write(out)
Declare some field names; the reader uses CSV to read the file, and the field names to dump the file to a JSON format. Here's the problem...
Each record in the CSV file is on a different row. I want the JSON output to be the same way. The problem is it dumps it all on one giant, long line.
I've tried using something like for line in csvfile: and then running my code below that with reader = csv.DictReader(line, fieldnames), which loops through each line, but it does the entire file on one line, then loops through the entire file on another line... and continues until it runs out of lines.
Any suggestions for correcting this?
Edit: To clarify, currently I have: (every record on line 1)
[{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"},{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}]
What I'm looking for: (2 records on 2 lines)
{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"}
{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}
Not each individual field indented/on a separate line, but each record on its own line.
Some sample input.
"John","Doe","001","Message1"
"George","Washington","002","Message2"
The problem with your desired output is that it is not a valid JSON document; it's a stream of JSON documents!
That's okay, if it's what you need, but that means that for each document you want in your output, you'll have to call json.dumps.
Since the newline you want separating your documents is not contained in those documents, you're on the hook for supplying it yourself. So we just need to pull the loop out of the call to json.dump and interpose newlines for each document written.
import csv
import json

csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')

fieldnames = ("FirstName", "LastName", "IDNumber", "Message")
reader = csv.DictReader(csvfile, fieldnames)
for row in reader:
    json.dump(row, jsonfile)
    jsonfile.write('\n')
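With the sample input above, file.json then contains one JSON document per line, for example:
{"FirstName": "John", "LastName": "Doe", "IDNumber": "001", "Message": "Message1"}
{"FirstName": "George", "LastName": "Washington", "IDNumber": "002", "Message": "Message2"}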
You can use a Pandas DataFrame to achieve this, with the following example:
import pandas as pd
csv_file = pd.DataFrame(pd.read_csv("path/to/file.csv", sep = ",", header = 0, index_col = False))
csv_file.to_json("/path/to/new/file.json", orient = "records", date_format = "epoch", double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)
import csv
import json

file = 'csv_file_name.csv'
json_file = 'output_file_name.json'

# Read CSV file
def read_CSV(file, json_file):
    csv_rows = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        field = reader.fieldnames
        for row in reader:
            csv_rows.extend([{field[i]: row[field[i]] for i in range(len(field))}])
        convert_write_json(csv_rows, json_file)

# Convert csv data into json
def convert_write_json(data, json_file):
    with open(json_file, "w") as f:
        f.write(json.dumps(data, sort_keys=False, indent=4, separators=(',', ': ')))  # pretty-printed
        # f.write(json.dumps(data))  # or compact; use one of the two writes, not both

read_CSV(file, json_file)
See the documentation of json.dumps() for the available options.
I took #SingleNegationElimination's response and simplified it into a three-liner that can be used in a pipeline:
import csv
import json
import sys

for row in csv.DictReader(sys.stdin):
    json.dump(row, sys.stdout)
    sys.stdout.write('\n')
You can try this
import csvmapper

# how does the object look
mapper = csvmapper.DictMapper([
    [
        {'name': 'FirstName'},
        {'name': 'LastName'},
        {'name': 'IDNumber', 'type': 'int'},
        {'name': 'Messages'}
    ]
])

# parser instance
parser = csvmapper.CSVParser('sample.csv', mapper)
# conversion service
converter = csvmapper.JSONConverter(parser)
print(converter.doConvert(pretty=True))
Edit:
A simpler approach:
import csvmapper

fields = ('FirstName', 'LastName', 'IDNumber', 'Messages')
parser = csvmapper.CSVParser('sample.csv', csvmapper.FieldMapper(fields))
converter = csvmapper.JSONConverter(parser)
print(converter.doConvert(pretty=True))
I see this is old, but I needed the code from SingleNegationElimination; however, I had an issue with the data containing non-UTF-8 characters. These appeared in fields I was not overly concerned with, so I chose to ignore them, but that took some effort. I am new to Python, so with some trial and error I got it to work. The code below is a copy of SingleNegationElimination's with extra handling of UTF-8. I tried to do it with https://docs.python.org/2.7/library/csv.html but in the end gave up. The code below worked.
import csv, json

csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')

fieldnames = ("Scope", "Comment", "OOS Code", "In RMF", "Code", "Status", "Name", "Sub Code", "CAT", "LOB", "Description", "Owner", "Manager", "Platform Owner")
reader = csv.DictReader(csvfile, fieldnames)

code = ''
for row in reader:
    try:
        print('+' + row['Code'])
        for key in row:
            # (Python 2) drop any bytes that are not valid utf-8
            row[key] = row[key].decode('utf-8', 'ignore').encode('utf-8')
        json.dump(row, jsonfile)
        jsonfile.write('\n')
    except:
        print('-' + row['Code'])
        raise
Add the indent parameter to json.dumps
data = {'this': ['has', 'some', 'things'],
        'in': {'it': 'with', 'some': 'more'}}
print(json.dumps(data, indent=4))
Also note that you can simply use json.dump with the open jsonfile:
json.dump(data, jsonfile)
Use pandas and the json library:
import pandas as pd
import json

filepath = "inputfile.csv"
output_path = "outputfile.json"

df = pd.read_csv(filepath)

# Create a multiline json
json_list = json.loads(df.to_json(orient="records"))

with open(output_path, 'w') as f:
    for item in json_list:
        f.write(json.dumps(item) + "\n")  # dump each record back to JSON ("%s" % item would write Python repr, not JSON)
How about using Pandas to read the csv file into a DataFrame (pd.read_csv), then manipulating the columns if you want (dropping them or updating values) and finally converting the DataFrame back to JSON (pd.DataFrame.to_json).
Note: I haven't checked how efficient this will be but this is definitely one of the easiest ways to manipulate and convert a large csv to json.
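A minimal sketch of that approach (the file names are placeholders; orient='records' with lines=True writes one JSON record per line):
import pandas as pd

df = pd.read_csv('file.csv')  # read the CSV into a DataFrame
# drop or update columns here if needed, e.g. df = df.drop(columns=['Message'])
df.to_json('file.json', orient='records', lines=True)  # write line-delimited JSON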
As a slight improvement to #MONTYHS's answer, iterating through a tuple of fieldnames:
import csv
import json

csvfilename = 'filename.csv'
jsonfilename = csvfilename.split('.')[0] + '.json'

csvfile = open(csvfilename, 'r')
jsonfile = open(jsonfilename, 'w')
reader = csv.DictReader(csvfile)

fieldnames = ('FirstName', 'LastName', 'IDNumber', 'Message')

output = []
for each in reader:
    row = {}
    for field in fieldnames:
        row[field] = each[field]
    output.append(row)

json.dump(output, jsonfile, indent=2, sort_keys=True)
import csv
import json

def read():
    noOfElem = 200  # no of data you want to import
    csv_file_name = "hashtag_donaldtrump.csv"  # csv file name
    json_file_name = "hashtag_donaldtrump.json"  # json file name
    with open(csv_file_name, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        with open(json_file_name, 'w') as json_file:
            i = 0
            json_file.write("[")
            for row in csv_reader:
                i = i + 1
                if i == noOfElem:
                    json_file.write("]")
                    return
                json_file.write(json.dumps(row))
                if i != noOfElem - 1:
                    json_file.write(",")
            # note: this assumes the csv has at least noOfElem rows; otherwise the closing "]" is never written
Change the three parameters above and everything will be done.
import csv
import json

csvfile = csv.DictReader(open('filename.csv', 'r'))

output = []
for each in csvfile:
    row = {}
    row['FirstName'] = each['FirstName']
    row['LastName'] = each['LastName']
    row['IDNumber'] = each['IDNumber']
    row['Message'] = each['Message']
    output.append(row)

json.dump(output, open('filename.json', 'w'), indent=4, sort_keys=False)
