While this code reads and writes a jsonlines file. How to compress it? I tried directly using gzip.open but I am getting various errors.
import json
def dump_jsonl(data, output_path, append=False):
"""
Write list of objects to a JSON lines file.
"""
mode = 'a+' if append else 'w'
with open(output_path, mode, encoding='utf-8') as f:
for line in data:
json_record = json.dumps(line, ensure_ascii=False)
f.write(json_record + '\n')
print('Wrote {} records to {}'.format(len(data), output_path))
def load_jsonl(input_path) -> list:
"""
Read list of objects from a JSON lines file.
"""
data = []
with open(input_path, 'r', encoding='utf-8') as f:
for line in f:
data.append(json.loads(line.rstrip('\n|\r')))
print('Loaded {} records from {}'.format(len(data), input_path))
return data
This is what I am doing to compress but I am unable to read it.
def dump_jsonl(data, output_path, append=False):
with gzip.open(output_path, "a+") as f:
for line in data:
json_record = json.dumps(line, ensure_ascii = False)
encoded = json_record.encode("utf-8") + ("\n").encode("utf-8")
compressed = gzip.compress(encoded)
f.write(compressed)
Use the gzip module's compress function.
import gzip
with open('file.jsonl') as f_in:
with gzip.open('file.jsonl.gz', 'wb') as f_out:
f_out.writelines(f_in)
gzip.open() is for opening gzipped files, not jsonl.
Read:
gzip a file in Python
Python support for Gzip
Related
I have a large JSON file. My goal is to minify in using Python.
The JSON file contains Arabic and Bengali Characters.
My problem is when I try to minify is I am getting Unicode characters instead of normal string characters like this \u09c7.
How can I save the minified file with normal string characters?
Below is my code:
import json
filename = 'quran.json' # file name we want to compress
newname = filename.replace('.json', '.min.json') # Output file name
fp = open(filename, encoding="utf8")
print("Compressing file: " + filename)
print('Compressing...')
jload = json.load(fp)
newfile = json.dumps(jload, indent=None, separators=(',', ':'))
newfile = str.encode(newfile)
f = open(newname, 'wb')
f.write(newfile)
f.close()
print('Compression complete!)
Here is the file link in case you want to try: https://raw.githubusercontent.com/nhridoy/quran-api/main/v1/quran.json
it need ensure_ascii=False flag in json.dumps()
import json
filename = 'quran.json' # file name we want to compress
newname = filename.replace('.json', '.min.json') # Output file name
with open(filename, encoding="utf8") as fp:
print("Compressing file: " + filename)
print('Compressing...')
jload = json.load(fp)
newfile = json.dumps(jload, indent=None, separators=(',', ':'), ensure_ascii=False)
#newfile = str.encode(newfile) # remove this
with open(newname, 'w', encoding="utf8") as f: # add encoding="utf8"
f.write(newfile)
print('Compression complete!')
I am currently using the following code to convert a large CSV file to a JSON file.
import csv
import json
def csv_to_json(csvFilePath, jsonFilePath):
jsonArray = []
with open(csvFilePath, encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
for row in csvReader:
jsonArray.append(row)
with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
jsonString = json.dumps(jsonArray, indent=4)
jsonf.write(jsonString)
csvFilePath = r'test_data.csv'
jsonFilePath = r'test_data.json'
csv_to_json(csvFilePath, jsonFilePath)
This code works fine and I am able to convert the CSV to JSON without any issues. However, as the CSV file contains 600,000+ rows and hence as many items in my JSON, it has become very difficult to manage the JSON file.
I would like to modify my above code such that for every 5000 rows of the CSV, the data is written into a new JSON file. Ideally, I would be having 120 (600,000/5000) JSON files in this case.
How can I do the same?
Split up your read\write methods and add a simple threshold:
JSON_ENTRIES_THRESHOLD = 5000 # modify to whatever you see suitable
def write_json(json_array, filename):
with open(filename, 'w', encoding='utf-8') as jsonf:
json.dump(json_array, jsonf) # note the usage of .dump directly to a file descriptor
def csv_to_json(csvFilePath, jsonFilePath):
jsonArray = []
with open(csvFilePath, encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
filename_index = 0
for row in csvReader:
jsonArray.append(row)
if len(jsonArray) >= JSON_ENTRIES_THRESHOLD:
# if we reached the treshold, write out
write_json(jsonArray, f"jsonFilePath-{filename_index}.json")
filename_index += 1
jsonArray = []
# Finally, write out the remainder
write_json(jsonArray, f"jsonFilePath-{filename_index}.json")
f.pk is basically a container of base64 + zip which I need to import in Python and extract. The zip file is p.plist, so f.pk = path + name + data of p.plist.
I can't find any working encoding for open() or codecs.open() to open it as a str and save the output. I always have a generated output.plist which is different than the original.
Encodings I have already used include ASCII; UTF-x; Latin_1; ISO-x;
import codecs, os
with open('f.pk', 'r', encoding='Latin_1') as f:
f_open = f.read()
with codecs.open('f.pk', 'r', encoding='zip') as f:
f_open = f.read()
f2=f_open[3:] #SKIP DUMMY PART
f3=f2.split('-DATA-')
f4=f3[1].split('-COMMENT-')
with open('output.plist', 'w') as f:
print(f_out, file=f)
original.plist = 5e03964972def5b83880397b7377e6d1aea33e2b
output.plist = 6473aea0ae8bc75a04859effe1ee366de4cdd2d2
I have deep analyzed both files:
with no success.
with open('file.pk', 'rb') as f:
f_open = f.read()
# do something with bytes here
with open('p.temp', 'wb') as f:
pickle.dump(f_bytes, f) # temp file
# reload temp file
with open("p.temp", 'rb') as f:
data = f.read()
# skip encodings and skip unwanted bytes
data = data[4:-3]
# save it
with open('p.plist', 'wb') as f:
f.write(data)
os.remove('p.temp')
I have data which is being accessed via http request and is sent back by the server in a comma separated format, I have the following code :
site= 'www.example.com'
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(site,headers=hdr)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
soup = soup.get_text()
text=str(soup)
The content of text is as follows:
april,2,5,7
may,3,5,8
june,4,7,3
july,5,6,9
How can I save this data into a CSV file.
I know I can do something along the lines of the following to iterate line by line:
import StringIO
s = StringIO.StringIO(text)
for line in s:
But i'm unsure how to now properly write each line to CSV
EDIT---> Thanks for the feedback as suggested the solution was rather simple and can be seen below.
Solution:
import StringIO
s = StringIO.StringIO(text)
with open('fileName.csv', 'w') as f:
for line in s:
f.write(line)
General way:
##text=List of strings to be written to file
with open('csvfile.csv','wb') as file:
for line in text:
file.write(line)
file.write('\n')
OR
Using CSV writer :
import csv
with open(<path to output_csv>, "wb") as csv_file:
writer = csv.writer(csv_file, delimiter=',')
for line in data:
writer.writerow(line)
OR
Simplest way:
f = open('csvfile.csv','w')
f.write('hi there\n') #Give your csv text here.
## Python will convert \n to os.linesep
f.close()
You could just write to the file as you would write any normal file.
with open('csvfile.csv','wb') as file:
for l in text:
file.write(l)
file.write('\n')
If just in case, it is a list of lists, you could directly use built-in csv module
import csv
with open("csvfile.csv", "wb") as file:
writer = csv.writer(file)
writer.writerows(text)
I would simply write each line to a file, since it's already in a CSV format:
write_file = "output.csv"
with open(write_file, "wt", encoding="utf-8") as output:
for line in text:
output.write(line + '\n')
I can't recall how to write lines with line-breaks at the moment, though :p
Also, you might like to take a look at this answer about write(), writelines(), and '\n'.
To complement the previous answers, I whipped up a quick class to write to CSV files. It makes it easier to manage and close open files and achieve consistency and cleaner code if you have to deal with multiple files.
class CSVWriter():
filename = None
fp = None
writer = None
def __init__(self, filename):
self.filename = filename
self.fp = open(self.filename, 'w', encoding='utf8')
self.writer = csv.writer(self.fp, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL, lineterminator='\n')
def close(self):
self.fp.close()
def write(self, elems):
self.writer.writerow(elems)
def size(self):
return os.path.getsize(self.filename)
def fname(self):
return self.filename
Example usage:
mycsv = CSVWriter('/tmp/test.csv')
mycsv.write((12,'green','apples'))
mycsv.write((7,'yellow','bananas'))
mycsv.close()
print("Written %d bytes to %s" % (mycsv.size(), mycsv.fname()))
Have fun
What about this:
with open("your_csv_file.csv", "w") as f:
f.write("\n".join(text))
str.join() Return a string which is the concatenation of the strings in iterable.
The separator between elements is
the string providing this method.
In my situation...
with open('UPRN.csv', 'w', newline='') as out_file:
writer = csv.writer(out_file)
writer.writerow(('Name', 'UPRN','ADMIN_AREA','TOWN','STREET','NAME_NUMBER'))
writer.writerows(lines)
you need to include the newline option in the open attribute and it will work
https://www.programiz.com/python-programming/writing-csv-files
Currently I'm using this:
f = open(filename, 'r+')
text = f.read()
text = re.sub('foobar', 'bar', text)
f.seek(0)
f.write(text)
f.close()
But the problem is that the old file is larger than the new file. So I end up with a new file that has a part of the old file on the end of it.
If you don't want to close and reopen the file, to avoid race conditions, you could truncate it:
f = open(filename, 'r+')
text = f.read()
text = re.sub('foobar', 'bar', text)
f.seek(0)
f.write(text)
f.truncate()
f.close()
The functionality will likely also be cleaner and safer using open as a context manager, which will close the file handler, even if an error occurs!
with open(filename, 'r+') as f:
text = f.read()
text = re.sub('foobar', 'bar', text)
f.seek(0)
f.write(text)
f.truncate()
The fileinput module has an inplace mode for writing changes to the file you are processing without using temporary files etc. The module nicely encapsulates the common operation of looping over the lines in a list of files, via an object which transparently keeps track of the file name, line number etc if you should want to inspect them inside the loop.
from fileinput import FileInput
for line in FileInput("file", inplace=1):
line = line.replace("foobar", "bar")
print(line)
Probably it would be easier and neater to close the file after text = re.sub('foobar', 'bar', text), re-open it for writing (thus clearing old contents), and write your updated text to it.
I find it easier to remember to just read it and then write it.
For example:
with open('file') as f:
data = f.read()
with open('file', 'w') as f:
f.write('hello')
To anyone who wants to read and overwrite by line, refer to this answer.
https://stackoverflow.com/a/71285415/11442980
filename = input("Enter filename: ")
with open(filename, 'r+') as file:
lines = file.readlines()
file.seek(0)
for line in lines:
value = int(line)
file.write(str(value + 1))
file.truncate()
Honestly you can take a look at this class that I built which does basic file operations. The write method overwrites and append keeps old data.
class IO:
def read(self, filename):
toRead = open(filename, "rb")
out = toRead.read()
toRead.close()
return out
def write(self, filename, data):
toWrite = open(filename, "wb")
out = toWrite.write(data)
toWrite.close()
def append(self, filename, data):
append = self.read(filename)
self.write(filename, append+data)
Try writing it in a new file..
f = open(filename, 'r+')
f2= open(filename2,'a+')
text = f.read()
text = re.sub('foobar', 'bar', text)
f.seek(0)
f.close()
f2.write(text)
fw.close()