python tarred folder stream - python

Is there a way to tar a folder and get a tar stream instead of a tar file?
I have tried to use the tarfile module, but it writes directly to a tar file.
with tarfile.open("zipped.tar",'w|') as tar:
for base_root, subFolders, files in os.walk('test'):
for j in files:
filepath = os.path.join(base_root,j)
if os.path.isfile(filepath):
with open(filepath, 'rb') as file:
size = os.stat(filepath).st_size
info = tarfile.TarInfo()
info.size = size
info.name = filepath
if(size <= chunck_size):
data = file.read(info.size)
fobj = StringIO.StringIO(data)
tar.addfile(info, fobj)
else:
data = ""
while True:
temp_data = file.read(chunck_size)
if temp_data == '':
break
data = data + temp_data
fobj = StringIO.StringIO(data)
tar.addfile(info, fobj)

According to the documentation, open can take a fileobj argument :
If fileobj is specified, it is used as an alternative to a file object opened in binary mode for name. It is supposed to be at position 0.
So you can write this, then use the buffer object.
import io
import tarfile

# Collect the archive in an in-memory buffer instead of a file on disk.
buffer = io.BytesIO()
with tarfile.open("zipped.tar", 'w|', fileobj=buffer) as tar:
    # Because fileobj is given, the tar bytes are written to `buffer`;
    # tar.add() walks the 'test' folder recursively.
    tar.add('test')
# Leaving the `with` block finalizes the archive; rewind so the buffer
# can be read from the beginning as a byte stream.
buffer.seek(0)

Related

How to change smiles to images(png) using RDKit in python script

How to save image files after generate image file in python script?
def mols_to_pngs(mols, basename = "test"):
filenames = []
for i, mol in enumerate(mols):
filename = "%s%d.png" % (basename, i)
Draw.MolToFile(mol,filename)
filenames.append(filename)
return filenames
and I want to run this process automatically using a CSV file and a Python script.
To automatically save image files generated from a python script using a csv file, you need to read the data from the csv file and process it in your script. Here's a basic example:
import csv
import sys
def mols_to_pngs(mols, basename="test"):
    """Write one PNG per molecule and return the list of filenames.

    Each molecule is passed to ``Draw.MolToFile`` with the filename
    ``<basename><index>.png``, where ``index`` is its position in ``mols``.
    """
    filenames = []
    for i, mol in enumerate(mols):
        filename = "%s%d.png" % (basename, i)
        Draw.MolToFile(mol, filename)
        filenames.append(filename)
    return filenames


# Read the csv file
csv_path = 'data.csv'
mols = []
try:
    # newline='' is what the csv module expects for files it reads.
    with open(csv_path, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row (the default avoids StopIteration on an empty file)
        next(reader, None)
        for row in reader:
            # Process the data from the csv file
            mol = process_data(row)
            mols.append(mol)
except IOError:
    # Report the path rather than the handle: `file` is never bound when
    # open() itself fails, so printing it would raise NameError.
    print("Could not read file:", csv_path)
    sys.exit(1)
# Save the image files
filenames = mols_to_pngs(mols)
print("Saved the following files:", filenames)
Note that you need to replace the process_data function with your own implementation to process the data from the csv file and create the mol objects. Additionally, you may need to modify the code to match your specific use case.

How to write data to a file in Python (File is create but no data)

When I pass the file name directly as below, data is being written to the output file.
Rpt_file_wfl = open('output.csv','a')
Rpt_file_wfl.write(output)
But when I pass the filename as a variable, the file is getting created but there is no data.
OUT_PATH = E:\MYDRIVE
outDir = py_script
outFiles = output.csv
Rpt_file_wfl = open(OUT_PATH+outDir+outFiles[0],'a')
Rpt_file_wfl.write(output)
I do close the file in the end.
Why would the data not be written with the above code?
Try to use os.path
import os
output_text = 'some text'
# Keep the trailing separator on the drive: on Windows,
# os.path.join('E:', 'Mydrive') yields the drive-relative 'E:Mydrive'
# (relative to the current directory on E:), not 'E:\\Mydrive'.
drive_path = 'E:\\'
drive_dir = 'Mydrive'
out_dir = 'py_script'
out_file = 'output.csv'
full_path = os.path.join(drive_path, drive_dir, out_dir, out_file)
# The `with` block closes the file on exit, which flushes the written data.
with open(full_path, 'a', encoding='utf-8') as file:
    file.write(output_text)
If it doesn't work - try to .replace() delimiters, like:
full_path = full_path.replace('/', '\\')
Or else:
full_path = full_path.replace('\\', '/')
Here's an example of working code:
OUT_PATH='D:\\output\\'
outDir='scripts\\'
outFiles=['1.csv', '2.csv']
path = OUT_PATH+outDir+outFiles[0]
output='Example output'
with open(path, 'a') as file:
file.write(output)

How do I apply my python code to all of the files in a folder at once, and how do I create a new name for each subsequent output file?

The code I am working with takes in a .pdf file, and outputs a .txt file. My question is, how do I create a loop (probably a for loop) which runs the code over and over again on all files in a folder which end in ".pdf"? Furthermore, how do I change the output each time the loop runs so that I can write a new file each time, that has the same name as the input file (ie. 1_pet.pdf > 1_pet.txt, 2_pet.pdf > 2_pet.txt, etc.)
Here is the code so far:
path="2_pet.pdf"
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
The following script solves your problem:
import os
# Convert every .pdf file in `sourcedir` to a .txt file with the same base name.
sourcedir = 'pdfdir'
for f in os.listdir(sourcedir):
    # splitext copes with names that contain no dot or several dots,
    # where f.split(".")[1] would raise IndexError or pick the wrong part.
    root, ext = os.path.splitext(f)
    if ext == ".pdf":
        # Join with the directory name, not with the list of its entries.
        path_in = os.path.join(sourcedir, f)
        content = getPDFContent(path_in)
        encoded = content.encode("utf-8")
        path_out = os.path.join(sourcedir, root + ".txt")
        # `with` closes the output file even if the write raises.
        with open(path_out, 'w') as text_file:
            text_file.write(encoded)
Create a function that encapsulates what you want to do to each file.
import os.path
def parse_pdf(filename):
    """Parse a pdf into text.

    Extracts the content of ``filename`` with ``getPDFContent`` and writes
    it, UTF-8 encoded, to a file with the same path but a ``.txt``
    extension in place of the original one.
    """
    content = getPDFContent(filename)
    encoded = content.encode("utf-8")
    # Split off the pdf extension to add .txt instead.
    root, _ = os.path.splitext(filename)
    # `with` closes the output file even if the write raises.
    with open(root + ".txt", "w") as text_file:
        text_file.write(encoded)
Then apply this function to a list of filenames, like so:
for f in files:
parse_pdf(f)
One way to operate on all PDF files in a directory is to invoke glob.glob() and iterate over the results:
import glob
import os

# Convert every PDF in the current directory to a text file.
for path in glob.glob('*.pdf'):
    content = getPDFContent(path)
    encoded = content.encode("utf-8")
    # Name the output after the input (1_pet.pdf -> 1_pet.txt) so each
    # iteration writes its own file instead of overwriting a shared one.
    root, _ = os.path.splitext(path)
    with open(root + ".txt", "w") as text_file:
        text_file.write(encoded)
Another way is to allow the user to specify the files:
import sys
for path in sys.argv[1:]:
...
Then the user runs your script like python foo.py *.pdf.
You could use a recursive function to search the folder and all subfolders for files that end with .pdf, then take those files and create a text file for each one.
It could be something like:
import os
def convert_PDF(path, func):
    """Call ``func`` on every ``.pdf`` file at or below ``path``.

    If ``path`` is a directory, each of its entries is visited recursively.
    Otherwise ``func(path)`` is called when the name ends with ``.pdf``
    (case-sensitive); any other file is ignored.
    """
    if os.path.isdir(path):
        # A plain loop: the recursion is run for its side effects only.
        for entry in os.listdir(path):
            convert_PDF(os.path.join(path, entry), func)
    elif os.path.basename(path).endswith('.pdf'):
        func(path)
# based entirely on your example code
def convert_to_txt(path):
    """Write the text of the PDF at ``path`` to a sibling ``.txt`` file.

    The content comes from ``getPDFContent`` and is UTF-8 encoded before
    being written next to the input file.
    """
    content = getPDFContent(path)
    encoded = content.encode("utf-8")
    file_path = os.path.dirname(path)
    # replace pdf with txt extension
    file_name = os.path.basename(path)[:-4] + '.txt'
    # os.path.join handles an empty directory part; concatenating with '/'
    # would turn a bare 'a.pdf' into '/a.txt' at the filesystem root.
    with open(os.path.join(file_path, file_name), "w") as text_file:
        text_file.write(encoded)
convert_PDF('path/to/files', convert_to_txt)
Because the actual operation is changeable, you can replace the function with whatever operation you need to perform (like using a different library, converting to a different type, etc.)

renaming a list of pdf files with for loop

i am trying to rename a list of pdf files by extracting the name from the file using PyPdf. i tried to use a for loop to rename the files but i always get an error with code 32 saying that the file is being used by another process. I am using python2.7
Here's my code
import os, glob
from pyPdf import PdfFileWriter, PdfFileReader
# this function extracts the name of the file
def getName(filepath):
output = PdfFileWriter()
input = PdfFileReader(file(filepath, "rb"))
output.addPage(input.getPage(0))
outputStream = file(filepath + '.txt', 'w')
output.write(outputStream)
outputStream.close()
outText = open(filepath + '.txt', 'rb')
textString = outText.read()
outText.close()
nameStart = textString.find('default">')
nameEnd = textString.find('_SATB', nameStart)
nameEnd2 = textString.find('</rdf:li>', nameStart)
if nameStart:
testName = textString[nameStart+9:nameEnd]
if len(testName) <= 100:
name = testName + '.pdf'
else:
name = textString[nameStart+9:nameEnd2] + '.pdf'
return name
pdfFiles = glob.glob('*.pdf')
m = len(pdfFiles)
for each in pdfFiles:
newName = getName(each)
os.rename(each, newName)
Consider using the with directive of Python. With it you do not need to handle closing the file yourself:
def getName(filepath):
output = PdfFileWriter()
with file(filepath, "rb") as pdfFile:
input = PdfFileReader(pdfFile)
...
You're not closing the input stream (the file) used by the pdf reader.
Thus, when you try to rename the file, it's still open.
So, instead of this:
input = PdfFileReader(file(filepath, "rb"))
Try this:
inputStream = file(filepath, "rb")
input = PdfFileReader(inputStream)
(... when done with this file...)
inputStream.close()
It does not look like you close the file object associated with the PDF reader object. Maybe it is closed automatically at the end of the function, but to be sure you might want to create a separate file object which you pass to the PdfFileReader, and then close the file handle when done. Then rename.
The below was from SO: How to close pyPDF "PdfFileReader" Class file handle
import os.path
from pyPdf import PdfFileReader
fname = 'my.pdf'
fh = file(fname, "rb")
input = PdfFileReader(fh)
fh.close()
os.rename(fname, 'my_renamed.pdf')

How to read filenames included into a gz file

I've tried to read a gz file:
with open(os.path.join(storage_path,file), "rb") as gzipfile:
with gzip.GzipFile(fileobj=gzipfile) as datafile:
data = datafile.read()
It works but I need the filenames and the size of every file included into my gz file.
This code print out the content of the included file into the archive.
How can I read the filenames included into this gz file?
The Python gzip module does not provide access to that information.
The source code skips over it without ever storing it:
if flag & FNAME:
# Read and discard a null-terminated string containing the filename
while True:
s = self.fileobj.read(1)
if not s or s=='\000':
break
The filename component is optional, not guaranteed to be present (the commandline gzip -c decompression option would use the original filename sans .gz in that case, I think). The uncompressed filesize is not stored in the header; you can find it in the last four bytes instead.
To read the filename from the header yourself, you'd need to recreate the file header reading code, and retain the filename bytes instead. The following function returns that, plus the decompressed size:
import struct
from gzip import FEXTRA, FNAME
def read_gzip_info(gzipfile):
    """Return ``(filename, size)`` for an already-opened ``gzip.GzipFile``.

    ``filename`` is the original name stored in the gzip header when the
    FNAME flag is set; otherwise it is ``gzipfile.name`` with a trailing
    ``.gz`` removed. ``size`` is the unsigned 32-bit value stored in the
    last four bytes of the underlying file.

    The underlying file object must be seekable; its position is restored
    before returning, also when an error is raised.

    Raises:
        IOError: if the file does not start with the gzip magic bytes.
    """
    gf = gzipfile.fileobj
    pos = gf.tell()
    try:
        # Read archive size
        gf.seek(-4, 2)
        size = struct.unpack('<I', gf.read(4))[0]

        gf.seek(0)
        magic = gf.read(2)
        # Bytes literal: the file is read in binary mode, so a str literal
        # would never compare equal on Python 3.
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')

        method, flag, mtime = struct.unpack("<BBIxx", gf.read(8))

        if not flag & FNAME:
            # Not stored in the header, use the filename sans .gz
            fname = gzipfile.name
            if fname.endswith('.gz'):
                fname = fname[:-3]
            return fname, size

        if flag & FEXTRA:
            # Read & discard the extra field, if present. struct.unpack
            # returns a tuple, so unpack the length before passing it on.
            (extra_len,) = struct.unpack("<H", gf.read(2))
            gf.read(extra_len)

        # Read a null-terminated string containing the filename
        fname = []
        while True:
            s = gf.read(1)
            if not s or s == b'\000':
                break
            fname.append(s)
        # Decode as Latin-1, which maps every byte to a character.
        return b''.join(fname).decode('latin-1'), size
    finally:
        gf.seek(pos)
Use the above function with an already-created gzip.GzipFile object:
filename, size = read_gzip_info(gzipfileobj)
GzipFile itself doesn't have this information, but:
The file name is (usually) the name of the archive minus the .gz
If the uncompressed file is smaller than 4G, then the last four bytes of the archive contain the uncompressed size:
In [14]: f = open('fuse-ext2-0.0.7.tar.gz')
In [15]: f.seek(-4, 2)
In [16]: import struct
In [17]: r = f.read()
In [18]: struct.unpack('<I', r)[0]
Out[18]: 7106560
In [19]: len(gzip.open('fuse-ext2-0.0.7.tar.gz').read())
Out[19]: 7106560
(technically, the last four bytes are the size of the original (uncompressed) input data modulo 2^32 (the ISIZE field in the member trailer, http://www.gzip.org/zlib/rfc-gzip.html))
I've solved it this way:
fl = search_files(storage_path)
for f in fl:
with open(os.path.join(storage_path,f), "rb") as gzipfile:
with gzip.GzipFile(fileobj=gzipfile) as datafile:
data = datafile.read()
print str(storage_path) + "/" + str(f[:-3]) + " : " + str(len(data)) + " bytes" #pcap file size
I don't know if it's correct.
Any suggestions?
the new code:
fl = search_files(storage_path)
for f in fl:
with open(os.path.join(storage_path,f), "rb") as gzipfile:
#try with module 2^32
gzipfile.seek(-4,2)
r = gzipfile.read()
print str(storage_path) + "/" + str(f[:-3]) + " : " + str(struct.unpack('<I' ,r)[0]) + " bytes" #dimensione del file pcap
Martijn's solution is really nice; I've packaged it for Python 3.6+: https://github.com/PierreSelim/gzinfo
You just need to pip install gzinfo
in your code
import gzinfo
info = gzinfo.read_gz_info('bar.txt.gz')
# info.fname is 'foo.txt'
print(info.fname)

Categories

Resources