Using zipfile to archive a pickled data dictionary:
import os, cPickle, zipfile  # Python 2; on Python 3 use pickle

data = {1: 'one'}
pfile = r'c:\temp\myPickle.p'
cPickle.dump(data, open(pfile, "wb"))

zfilename = r'c:\temp\myArchive.zip'
zfile = zipfile.ZipFile(zfilename, "w", zipfile.ZIP_DEFLATED)
zfile.write(pfile, os.path.basename(pfile))
zfile.close()
This approach leaves two files on the local drive: the pickle and the zip archive. To get the pickled data back, the zip file needs to be un-archived:
fh = open(zfilename, 'rb')
z = zipfile.ZipFile(fh)
for each in z.namelist():
    z.extract(each, r'c:\temp')
fh.close()
How can I simplify this process?
Use gzip instead. It compresses the file as you write it, so there is no need for intermediate files.
# (python 2) import cPickle as pickle
import pickle
import gzip
data = {1: 'one'}
pfile = r'test.pkl.gz'
pickle.dump(data, gzip.open(pfile, "wb"), pickle.HIGHEST_PROTOCOL)
print(pickle.load(gzip.open(pfile, "rb")))
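The handles above are left for the garbage collector to close; a minimal Python 3 sketch of the same idea with context managers (same data and filename assumed):
import gzip
import pickle

data = {1: 'one'}
with gzip.open('test.pkl.gz', 'wb') as fh:  # compressed as it is written
    pickle.dump(data, fh, pickle.HIGHEST_PROTOCOL)
with gzip.open('test.pkl.gz', 'rb') as fh:  # decompressed as it is read
    print(pickle.load(fh))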
I am an amateur at Python, but I have the task of converting a folder of CSV files to JSON files. I have this script working with a single specified CSV file, but I have no idea how to make the script iterate through a folder of CSVs and convert all of them to JSON. The original script:
import csv
import json

file = '/users/krzysztofpaszta/CSVtoGD/build-a-bridge.csv'
json_file = '/users/krzysztofpaszta/CSVtoGD/build-a-bridge.json'

# Read the CSV file
def read_CSV(file, json_file):
    csv_rows = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        field = reader.fieldnames
        for row in reader:
            csv_rows.extend([{field[i]: row[field[i]] for i in range(len(field))}])
        convert_write_json(csv_rows, json_file)

# Convert CSV to JSON
def convert_write_json(data, json_file):
    with open(json_file, "w") as f:
        f.write(json.dumps(data, sort_keys=False, indent=4, separators=(',', ': ')))

read_CSV(file, json_file)
Can someone give me a hint?
You can use os functions, particularly os.listdir() to iterate over the files in the directory, and os.path.splitext() to safely generate the new names:
import os

DIRECTORY = "/path/to/dir"
for f in os.listdir(os.fsencode(DIRECTORY)):
    fn = os.fsdecode(f)
    pre, ext = os.path.splitext(fn)
    if ext == ".csv":
        # os.listdir() returns bare names, so join them back onto the directory
        read_CSV(os.path.join(DIRECTORY, fn), os.path.join(DIRECTORY, pre + '.json'))
A similar approach with pathlib would be:
from pathlib import Path

DIRECTORY = "/path/to/dir"
files = Path(DIRECTORY).glob('*.csv')   # to process files only in this dir
files = Path(DIRECTORY).rglob('*.csv')  # to process files in sub-directories recursively
for f in files:
    read_CSV(f, str(f.with_suffix('.json')))  # use .with_suffix() for safe name generation
You can list the csv files in a folder using pathlib:
from pathlib import Path
csv_files = Path().glob('*.csv')
Then loop over the files:
for csv_file in csv_files:
    csv_path = str(csv_file.absolute())
    json_path = csv_path.replace('.csv', '.json')  # note: replaces the first '.csv' anywhere in the path
    read_CSV(csv_path, json_path)
I'm looking for help building paths for glob and for pandas to_csv. Does anyone have a solution? My code:
from glob import glob
import json
import pandas as pd

PathIn = 'c:\\Users\\***\\PycharmProjects\\Project\\In'
PathOut = 'c:\\Users\\***\\PycharmProjects\\Project\\Out'
for fileName in glob(PathIn + '*.json', recursive=True):
    with open(fileName, 'rb') as f:
        json_dict = json.load(f)
        print(json_dict)
        ...
        df.to_csv(PathOut + fileName + '.csv', sep=";")
It doesn't print my JSON files, so it isn't picking up any files from my In directory, and I don't get any CSV in my Output.
The key here is that you want to create the output file in the relevant user dir based on the input file. You could instead get a list of the user dirs and iterate over each of them, setting the in and out dirs, then search for the JSON files and create each CSV in the corresponding dir. Something like:
import json
from glob import glob
import os.path as op

basepath = r'C:\Users\***\PycharmProjects'  # needs a wildcard (e.g. C:\Users\*\PycharmProjects) for glob to match several user dirs
_in = 'In'
_out = 'Out'
suffix = r'\*.json'
output_suffix = '.csv'

for path in glob(basepath):
    in_dir = op.join(path, _in)
    out_dir = op.join(path, _out)
    for json_file in glob(in_dir + suffix, recursive=True):
        in_file_name = op.basename(json_file)
        out_file_name = in_file_name.split('.')[0] + output_suffix
        output_file = op.join(out_dir, out_file_name)
        with open(json_file) as jf:
            json_data = json.load(jf)
        print(json_data)
        # do some stuff with the json
        with open(output_file, 'w') as of:
            of.write("some data or json stuff")
Just slightly modifying your code: I think you missed a \ when writing the path to search in the input directory. For the output file, you need to build the filename by replacing the extension .json with .csv. There are many ways to do that:
import os

for fileName in glob(PathIn + '\\*.json', recursive=True):
    with open(fileName, 'rb') as f:
        json_dict = json.load(f)
    print(json_dict)
    # take the base name and swap the .json extension for .csv
    out_file_name = os.path.splitext(os.path.basename(fileName))[0] + '.csv'
    out_file_path = os.path.join(PathOut, out_file_name)
    # Here do something with your output file
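As another sketch of the same renaming (assuming the PathIn and PathOut variables from the question), pathlib builds the output name in one step; this only constructs the paths, it does no reading or writing:
from pathlib import Path

for json_path in Path(PathIn).glob('*.json'):
    # .with_suffix() swaps .json for .csv; .name drops the In directory
    out_path = Path(PathOut) / json_path.with_suffix('.csv').name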
I'm a newbie in Python.
I need to create a large quantity of files with random names in Dest_Dir (my destination directory), and then zip them into one file.
Does anyone have an idea how to do that?
I managed to create files like that with a for loop into a specific folder, but it doesn't fit the case where I want to create a large number of files (let's say 100), and the names I created aren't random:
import os

SRC_Dir = os.path.dirname(__file__)
Dest_Dir = os.path.join(SRC_Dir, 'dest')
items = ["one", "two", "three"]
for item in items:
    with open(os.path.join(Dest_Dir, item), 'wb') as f:
        f.write("This is my first line of code")
        f.write("\nThis is my second line of code with {} the first item in my list".format(item))
        f.write("\nAnd this is my last line of code")
You could make use of the built-in tempfile module:
import os
import tempfile

for _ in range(100):
    file_descriptor, file_path = tempfile.mkstemp(".txt", "prefix-", Dest_Dir)
    file_handle = open(file_path, "wb")
    # do stuff
    os.close(file_descriptor)
    file_handle.close()
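A small aside: mkstemp() already returns an open file descriptor, so instead of opening the path a second time you can wrap the descriptor with os.fdopen(). A sketch, assuming the same Dest_Dir as above:
import os
import tempfile

fd, file_path = tempfile.mkstemp(".txt", "prefix-", Dest_Dir)  # Dest_Dir from the question
with os.fdopen(fd, "wb") as fh:  # reuses the descriptor mkstemp opened
    fh.write(b"some data")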
Since a comment was made about the zip part, I figured I would add that as well:
import os
import tempfile
import zipfile

new_files = []
for _ in range(10):
    file_descriptor, file_path = tempfile.mkstemp(".txt", "prefix-", "/tmp")
    file_handle = open(file_path, "wb")
    file_handle.write("HELLO")
    os.close(file_descriptor)
    file_handle.close()
    new_files.append(file_path)

with zipfile.ZipFile("/tmp/zipped.zip", "w") as zipped:
    for file_path in new_files:
        zipped.write(file_path, os.path.basename(file_path))
The zipped.write arguments here assume only the filename (and not the full path) is desired for the archived name.
How do I write an in-memory zipfile to a file?
import zipfile
import StringIO

# Create in-memory zip and add files
zf = zipfile.ZipFile(StringIO.StringIO(), mode='w', compression=zipfile.ZIP_DEFLATED)
zf.writestr('file1.txt', "hi")
zf.writestr('file2.txt', "hi")
# Need to write it out
f = file("C:/path/my_zip.zip", "w")
f.write(zf)  # what to do here? Also tried f.write(zf.read())
f.close()
zf.close()
StringIO.getvalue() returns the contents of the StringIO object:
>>> import StringIO
>>> f = StringIO.StringIO()
>>> f.write('asdf')
>>> f.getvalue()
'asdf'
Alternatively, you can reposition the file pointer using seek():
>>> f.read()
''
>>> f.seek(0)
>>> f.read()
'asdf'
Try the following:
mf = StringIO.StringIO()
with zipfile.ZipFile(mf, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
    zf.writestr('file1.txt', "hi")
    zf.writestr('file2.txt', "hi")

with open("C:/path/my_zip.zip", "wb") as f:  # use `wb` mode
    f.write(mf.getvalue())
Modifying falsetru's answer for Python 3:
1) use io.BytesIO instead of StringIO.StringIO, since a zip archive is binary data (see: StringIO in python3)
2) use b"abc" instead of "abc" (see: python 3.5: TypeError: a bytes-like object is required, not 'str' when writing to a file), or
3) encode to a byte string with str.encode(s, "utf-8") (see: Best way to convert string to bytes in Python 3?)
import zipfile
import io

mf = io.BytesIO()
with zipfile.ZipFile(mf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.writestr('file1.txt', b"hi")
    zf.writestr('file2.txt', str.encode("hi"))
    zf.writestr('file3.txt', str.encode("hi", 'utf-8'))

with open("a.txt.zip", "wb") as f:  # use `wb` mode
    f.write(mf.getvalue())
This should also work for gzip (see: How do I gzip compress a string in Python?).
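For reference, a minimal in-memory gzip sketch (stdlib only; gzip.compress() and gzip.decompress() operate on bytes, available since Python 3.2):
import gzip

payload = b"hi"
compressed = gzip.compress(payload)  # bytes in, compressed bytes out, no file needed
assert gzip.decompress(compressed) == payload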
from zipfile import ZipFile, ZipInfo, ZIP_DEFLATED

with ZipFile(read_file, 'r') as zipread:
    with ZipFile(file_write_buffer, 'w', ZIP_DEFLATED) as zipwrite:
        for item in zipread.infolist():
            # Copy all ZipInfo attributes for each file, since the defaults are not preserved
            dest = ZipInfo(item.filename)
            dest.CRC = item.CRC
            dest.date_time = item.date_time
            dest.create_system = item.create_system
            dest.compress_type = item.compress_type
            dest.external_attr = item.external_attr
            dest.compress_size = item.compress_size
            dest.file_size = item.file_size
            dest.header_offset = item.header_offset
            zipwrite.writestr(dest, zipread.read(item.filename))
In the case where the zip file reads as corrupted, and you notice missing symlinks or files with wrong timestamps, it could be that the file properties are not getting copied over.
The code snippet above is how I solved that problem.
I have a directory of zip files (approximately 10,000 small files), and within each is a CSV file I am trying to read and split into a number of different CSV files.
I managed to write the code to split the CSV files from a directory of CSVs, shown below; it reads the first attribute of each row and, depending on what it is, writes the row to the relevant CSV.
import csv

reader = csv.reader(open("C:/Projects/test.csv", "rb"), delimiter=',', quotechar='"')
write10 = csv.writer(open('ouput10.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
write15 = csv.writer(open('ouput15.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

headings10 = ["RECORD_IDENTIFIER", "CUSTODIAN_NAME", "LOCAL_CUSTODIAN_NAME", "PROCESS_DATE", "VOLUME_NUMBER", "ENTRY_DATE", "TIME_STAMP", "VERSION", "FILE_TYPE"]
write10.writerow(headings10)
headings15 = ["RECORD_IDENTIFIER", "CHANGE_TYPE", "PRO_ORDER", "USRN", "STREET_DESCRIPTION", "LOCALITY_NAME", "TOWN_NAME", "ADMINSTRATIVE_AREA", "LANGUAGE"]
write15.writerow(headings15)

for row in reader:
    type = row[0]
    if "10" in type:
        write10.writerow(row)
    elif "15" in type:
        write15.writerow(row)
So I am now trying to read the zip files directly rather than wasting time extracting them first.
This is what I have so far, after following as many tutorials as I could find:
import glob
import os
import csv
import zipfile
import StringIO

for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv = '.'.join([dataFile, 'csv'])
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv))
    reader = csv.reader(data)
    for row in reader:
        print row
However, an error gets thrown:
AttributeError: 'str' object has no attribute 'reader'
Hopefully someone can show me how to adapt my working CSV-reading code to read from the zip files.
Much appreciated
Tim
Simple fix. You're shadowing the csv module with your local csv variable. Just rename that variable:
import glob
import os
import csv
import zipfile
import StringIO

for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv_file = '.'.join([dataFile, 'csv'])  # all fixed
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv_file))  # don't forget this line!
    reader = csv.reader(data)
    for row in reader:
        print row
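On Python 3, the same task works without StringIO, since zipfile hands back a file-like object per archive member. A sketch, assuming the same directory of zips as above, using io.TextIOWrapper to turn the binary member stream into text for csv.reader:
import csv
import glob
import io
import zipfile

for name in glob.glob('C:/Projects/abase/*.zip'):
    with zipfile.ZipFile(name) as zfile:
        for member in zfile.namelist():
            if member.endswith('.csv'):
                # wrap the binary member stream as text for csv.reader
                with io.TextIOWrapper(zfile.open(member), newline='') as fh:
                    for row in csv.reader(fh):
                        print(row)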