Running python subprocess.call on tgz file to untar and stream output

I'm using a subprocess call to untar a file on the command line. I need to stream the output of that call into a temporary file so I can read the contents of the "+CONTENTS" file within the tgz.
My failed output is:
./streamContents.py
rsh: ftp: No address associated with hostname
tar (child): ftp://myftpserver.com/pkgsrc/doxygen_pkgs/test.tgz: Cannot open: Input/output error
tar (child): Error is not recoverable: exiting now
gzip: stdin: unexpected end of file
tar: Child returned status 2
tar: Error exit delayed from previous errors
Traceback (most recent call last):
  File "./streamContents.py", line 29, in <module>
    stream = proc.stdout.read(8196)
AttributeError: 'int' object has no attribute 'stdout'
#!/usr/bin/python
from io import BytesIO
import urllib2
import tarfile
import ftplib
import socket
import threading
import subprocess

tarfile_url = "ftp://myftpserver.com/pkgsrc/doxygen_pkgs/test.tgz"
try:
    ftpstream = urllib2.urlopen(tarfile_url)
except urllib2.URLError, e:
    print "URL timeout"
except socket.timeout:
    print "Socket timeout"

# BytesIO creates an in-memory temporary file.
tmpfile = BytesIO()
last_size = 0
tfile_extract = ""

while True:
    # Bug (see traceback above): subprocess.call() returns an int exit
    # status, not a process object, so proc.stdout does not exist.
    proc = subprocess.call(['tar', '-xzvf', tarfile_url], stdout=subprocess.PIPE)
    # Download a piece of the file from the ftp connection
    stream = proc.stdout.read(8196)
    if not stream:
        break
    tmpfile.write(bytes(stream))
    # Seeking back to the beginning of the temporary file.
    tmpfile.seek(0)
    # r|gz forbids seeking backward; r:gz allows seeking backward
    try:
        tfile = tarfile.open(fileobj=tmpfile, mode="r:gz")
        print tfile.extractfile("+CONTENTS")
        tfile_extract_text = tfile_extract.read()
        print tfile_extract.tell()
        tfile.close()
        if tfile_extract.tell() > 0 and tfile_extract.tell() == last_size:
            print tfile_extract_text
            break
        else:
            last_size = tfile_extract.tell()
    except Exception:
        tfile.close()
        pass

tfile_extract_text = tfile_extract.read()
print tfile_extract_text

# When you're done:
tfile.close()
tmpfile.close()

Expanding on my comment above, you need to download the tar file to a temporary file using urllib2 and tempfile, and then open that temporary file using tarfile.
Here's some code to get started:
import urllib2
import tarfile
from tempfile import TemporaryFile

f_url = 'url_of_your_tar_archive'
ftpstream = urllib2.urlopen(f_url)
tmpfile = TemporaryFile()

# Download contents of tar to a temporary file
while True:
    s = ftpstream.read(16384)
    if not s:
        break
    tmpfile.write(s)
ftpstream.close()

# Access the temporary file to extract the file you need
tmpfile.seek(0)
tfile = tarfile.open(fileobj=tmpfile, mode='r:gz')
print tfile.getnames()
contents = tfile.extractfile("+CONTENTS").read()
print contents
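If you'd rather not buffer the whole archive first, a minimal streaming sketch is below. It assumes the "+CONTENTS" member can be read sequentially, since the r|gz mode mentioned in the question treats the connection as a forward-only stream:
import urllib2
import tarfile

ftpstream = urllib2.urlopen('url_of_your_tar_archive')
tfile = tarfile.open(fileobj=ftpstream, mode='r|gz')
for member in tfile:
    # In stream mode each member must be read before advancing to the next.
    if member.name == '+CONTENTS':
        print tfile.extractfile(member).read()
        break
tfile.close()
ftpstream.close()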


Convert python2 script to python3

I would like to get this script working with Python 3 (Python 3.10.4):
https://stackoverflow.com/a/2573715/2394635
The relevant part is where it says "Full code below". I'm not pasting that code directly because I'd get the Stack Overflow notification "It looks like your post is mostly code; please add some more details".
I've used the 2to3 conversion script, with this resulting code:
import sys, os, hashlib, io, bencode

def pieces_generator(info):
    """Yield pieces from download file(s)."""
    piece_length = info['piece length']
    if 'files' in info: # yield pieces from a multi-file torrent
        piece = ""
        for file_info in info['files']:
            path = os.sep.join([info['name']] + file_info['path'])
            print(path)
            sfile = open(path.decode('UTF-8'), "rb")
            while True:
                piece += sfile.read(piece_length - len(piece))
                if len(piece) != piece_length:
                    sfile.close()
                    break
                yield piece
                piece = ""
        if piece != "":
            yield piece
    else: # yield pieces from a single file torrent
        path = info['name']
        print(path)
        sfile = open(path.decode('UTF-8'), "rb")
        while True:
            piece = sfile.read(piece_length)
            if not piece:
                sfile.close()
                return
            yield piece

def corruption_failure():
    """Display error message and exit"""
    print("download corrupted")
    exit(1)

def main():
    # Open torrent file
    torrent_file = open(sys.argv[1], "rb")
    metainfo = bencode.bdecode(torrent_file.read())
    info = metainfo['info']
    pieces = io.StringIO(info['pieces'])
    # Iterate through pieces
    for piece in pieces_generator(info):
        # Compare piece hash with expected hash
        piece_hash = hashlib.sha1(piece).digest()
        if (piece_hash != pieces.read(20)):
            corruption_failure()
    # ensure we've read all pieces
    if pieces.read():
        corruption_failure()

if __name__ == "__main__":
    main()
However, keeps failing:
% python3 extract-torrent.py archive.torrent
Traceback (most recent call last):
  File "/home/smt/Documents/extract-torrent-py3.py", line 1, in <module>
    import sys, os, hashlib, io, bencode
  File "/home/smt/.local/lib/python3.10/site-packages/bencode.py", line 73, in <module>
    from types import StringType, IntType, LongType, DictType, ListType, TupleType
ImportError: cannot import name 'StringType' from 'types' (/usr/lib/python3.10/types.py)
Any help?
As #9769953 pointed out, bencode is not compatible with Python 3.10. You could try bencodepy, which claims to be compatible with both Python 2 and 3.
From the website:
Install with pip install bencode.py
Import with import bencodepy
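A minimal sketch of swapping it into the script above, assuming bencodepy's decode() returns a dictionary keyed by bytes (so the lookups need bytes keys rather than str):
import bencodepy

with open("archive.torrent", "rb") as torrent_file:
    metainfo = bencodepy.decode(torrent_file.read())
info = metainfo[b'info']             # keys are bytes, not str
piece_length = info[b'piece length']
pieces = info[b'pieces']             # already bytes; no io.StringIO needed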

Python: append file should be refreshed with new data

I am trying to write my output into a file. My code looks for matched file names and stores them in one file, and unmatched file names in another. The problem is that when I use write mode it overwrites the file, and when I use append mode it keeps appending matched file names on every run. What I need is for the files to be refreshed whenever the script is run, so they hold only the current run's data.
import re
import sys
import os
import glob
import pandas as pd
import logging

try:
    for file in glob.glob('*.csv'):
        r = re.search(r'abc_sales_(20[0-9][0-9])-([1-9]|1[0-2]|0[0-9])-([1-9]|1[0-9]|2[0-9]|3[0-1]|0[0-9])-[0-9]{2}_[a-z0-9]{3,5}.csv', file)
        if r:
            #matched=file
            print(f'File matched:{file}')
            fp = open('bad_lines.txt', 'r+')
            sys.stdout = fp
        else:
            path = f'File not matched:{file}'
            f = open('filenotmatched.txt', 'a')
            f.seek(0)
            f.truncate()
            f.write(path + '\n')
            f.close()
except Exception as e:
    pass
Suggested changes to your code.
import re
import sys
import os
import glob
import pandas as pd
import logging

# We create new 'bad_lines.txt' and
# 'filenotmatched.txt' for each run
with open('bad_lines.txt', 'w') as f_badlines, open('filenotmatched.txt', 'w') as f_notmatched:
    try:
        for file in glob.glob('*.csv'):
            r = re.search(r'abc_sales_(20[0-9][0-9])-([1-9]|1[0-2]|0[0-9])-([1-9]|1[0-9]|2[0-9]|3[0-1]|0[0-9])-[0-9]{2}_[a-z0-9]{3,5}.csv', file)
            if r:
                #matched=file
                #print(f'File matched:{file}')
                #fp=open('bad_lines.txt', 'r+')
                # ** Not clear why you redirected
                # ** standard out to a file
                # ** rather than writing to file directly
                #sys.stdout = fp
                f_badlines.write(f'File matched:{file}\n')
            else:
                path = f'File not matched:{file}'
                #f=open('filenotmatched.txt','a')
                #f.seek(0)
                #f.truncate()
                #f.write(path+'\n')
                #f.close()
                f_notmatched.write(path + '\n')
    except Exception as e:
        pass

.xlsx and .xls (latest versions) to PDF using Python

With the help of this link, .doc to pdf using python, I am trying to do the same for Excel (.xlsx and .xls formats).
Following is the modified code for Excel:
import os
from win32com import client

folder = "C:\\Oprance\\Excel\\XlsxWriter-0.5.1"
file_type = 'xlsx'
out_folder = folder + "\\PDF_excel"
os.chdir(folder)

if not os.path.exists(out_folder):
    print 'Creating output folder...'
    os.makedirs(out_folder)
    print out_folder, 'created.'
else:
    print out_folder, 'already exists.\n'

for files in os.listdir("."):
    if files.endswith(".xlsx"):
        print files
print '\n\n'

word = client.DispatchEx("Excel.Application")
for files in os.listdir("."):
    if files.endswith(".xlsx") or files.endswith('xls'):
        out_name = files.replace(file_type, r"pdf")
        in_file = os.path.abspath(folder + "\\" + files)
        out_file = os.path.abspath(out_folder + "\\" + out_name)
        doc = word.Workbooks.Open(in_file)
        print 'Exporting', out_file
        doc.SaveAs(out_file, FileFormat=56)
        doc.Close()
It is showing the following error:
>>> execfile('excel_to_pdf.py')
Creating output folder...
C:\Excel\XlsxWriter-0.5.1\PDF_excel created.
apms_trial.xlsx
~$apms_trial.xlsx
Exporting C:\Excel\XlsxWriter-0.5.1\PDF_excel\apms_trial.pdf
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "excel_to_pdf.py", line 30, in <module>
    doc = word.Workbooks.Open(in_file)
  File "<COMObject <unknown>>", line 8, in Open
pywintypes.com_error: (-2147352567, 'Exception occurred.', (0, u'Microsoft Excel', u"Excel cannot open the file '~$apms_trial.xlsx' because the file format or file extension is not valid. Verify that the file has not been corrupted and that the file extension matches the format of the file.", u'xlmain11.chm', 0, -2146827284), None)
>>>
There is a problem in:
doc.SaveAs(out_file, FileFormat=56)
What should the FileFormat value be?
Please help.
Link to xlsxwriter:
https://xlsxwriter.readthedocs.org/en/latest/contents.html
With the help of this you can generate Excel files in .xlsx and .xls formats.
Say, for example, the generated Excel file is named trial.xls.
Now if you want to generate a PDF of that Excel file, do the following:
from win32com import client
xlApp = client.Dispatch("Excel.Application")
books = xlApp.Workbooks.Open('C:\\excel\\trial.xls')
ws = books.Worksheets[0]
ws.Visible = 1
ws.ExportAsFixedFormat(0, 'C:\\excel\\trial.pdf')
I got the same thing and the same error... ANSWER: 57.... see below...
from win32com import client
import win32api

def exceltopdf(doc):
    excel = client.DispatchEx("Excel.Application")
    excel.Visible = 0
    wb = excel.Workbooks.Open(doc)
    ws = wb.Worksheets[1]
    try:
        wb.SaveAs('c:\\targetfolder\\result.pdf', FileFormat=57)
    except Exception, e:
        print "Failed to convert"
        print str(e)
    finally:
        wb.Close()
        excel.Quit()
... as an alternative to the fragile ExportAsFixedFormat...
You can print an Excel sheet to PDF on Linux using Python.
You do need to run OpenOffice as a headless server and use unoconv; it takes a bit of configuring but is doable.
You run OO as a daemon (service) and use it for the conversions of xls, xlsx, doc and docx.
http://dag.wiee.rs/home-made/unoconv/
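A minimal sketch of driving unoconv from Python, assuming unoconv is installed and an OpenOffice/LibreOffice listener is already running (e.g. started with unoconv --listener):
import subprocess

# Converts trial.xlsx to trial.pdf next to the input file;
# raises CalledProcessError if the conversion fails.
subprocess.check_call(['unoconv', '-f', 'pdf', 'trial.xlsx'])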
Another solution is to start a Gotenberg docker container locally (https://github.com/gotenberg/gotenberg) and pass any file supported by LibreOffice from Python via HTTP to the container, getting the result back as a PDF:
import logging
from time import sleep

import requests

LIBREOFFICE_URL = 'http://localhost:3000/forms/libreoffice/convert'
LIBREOFFICE_LANDSCAPE_URL = 'http://localhost:3000/forms/libreoffice/convert?landscape=1'

def _retry_gotenberg(url, io_bytes, post_file_name='index.html'):
    response = None
    for _ in range(5):
        response = requests.post(url, files={post_file_name: io_bytes})
        if response.status_code == 200:
            break
        logging.info('Will sleep and retry: %s %s', response.status_code, response.content)
        sleep(3)
    if not response or response.status_code != 200:
        raise RuntimeError(f'Bad response from doc-to-pdf: {response.status_code} {response.content}')
    return response

def process_libreoffice(io_bytes, ext: str):
    if ext in ('.doc', '.docx'):
        url = LIBREOFFICE_URL
    else:
        url = LIBREOFFICE_LANDSCAPE_URL
    # ext includes the leading dot, e.g. '.xlsx'
    response = _retry_gotenberg(url, io_bytes, post_file_name=f'file{ext}')
    return response.content
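Hypothetical usage of the snippet above, assuming a Gotenberg container is listening on localhost:3000 and a Book1.xlsx exists in the working directory:
with open('Book1.xlsx', 'rb') as fh:
    pdf_bytes = process_libreoffice(fh.read(), '.xlsx')
with open('Book1.pdf', 'wb') as out:
    out.write(pdf_bytes)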
The GroupDocs.Conversion Cloud SDK for Python is another option to convert Excel to PDF. It is a paid API; however, it provides 150 free monthly API calls.
P.S.: I'm a developer evangelist at GroupDocs.
# Import modules
import groupdocs_conversion_cloud
from shutil import copyfile

# Get your client_id and client_key at https://dashboard.groupdocs.cloud (free registration is required).
client_id = "xxxxxx-xxxx-xxxx-xxxx-xxxxxxxxx"
client_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Create instance of the API
convert_api = groupdocs_conversion_cloud.ConvertApi.from_keys(client_id, client_key)

try:
    # Convert XLSX to PDF
    # Prepare request
    request = groupdocs_conversion_cloud.ConvertDocumentDirectRequest("pdf", "C:/Temp/Book1.xlsx")
    # Convert
    result = convert_api.convert_document_direct(request)
    copyfile(result, 'C:/Temp/Book1_output.pdf')
    print("Result {}".format(result))
except groupdocs_conversion_cloud.ApiException as e:
    print("Exception when calling get_supported_conversion_types: {0}".format(e.message))

Python csv read file, what if I don't close the file?

I use the following code to read a csv file
f = csv.reader(open(filename, 'rb'))
Then there's no way I can close the file, right? Is there any harm in that, or is there a better way of reading it?
There is, use context managers:
with open(filename, 'rb') as handle:
    f = csv.reader(handle)
In general an open unused file descriptor is a resource leak and should be avoided.
Interestingly, in the case of files, at least the file descriptor is released as soon as there is no reference to the file any more (see also this answer):
#!/usr/bin/env python
import gc
import os
import subprocess

# GC thresholds (http://docs.python.org/3/library/gc.html#gc.set_threshold)
print "Garbage collection thresholds: {}".format(gc.get_threshold())

if __name__ == '__main__':
    pid = os.getpid()
    print('------- No file descriptor ...')
    subprocess.call(['lsof -p %s' % pid], shell=True)
    x = open('/tmp/test', 'w')
    print('------- Reference to a file ...')
    subprocess.call(['lsof -p %s' % pid], shell=True)
    x = 2
    print('------- FD is freed automatically w/o GC')
    subprocess.call(['lsof -p %s' % pid], shell=True)

How to test a directory of files for gzip and uncompress gzipped files in Python using zcat?

I'm in my 2nd week of Python and I'm stuck on a directory of zipped/unzipped logfiles, which I need to parse and process.
Currently I'm doing this:
import os
import sys
import glob
import operator
import zipfile
import zlib
import gzip
import subprocess

if sys.version.startswith("3."):
    import io
    io_method = io.BytesIO
else:
    import cStringIO
    io_method = cStringIO.StringIO

for f in glob.glob('logs/*'):
    file = open(f, 'rb')
    new_file_name = f + "_unzipped"
    last_pos = file.tell()
    # test for the gzip magic bytes
    if (file.read(2) == b'\x1f\x8b'):
        file.seek(last_pos)
        # unzip to new file
        out = open(new_file_name, "wb")
        process = subprocess.Popen(["zcat", f], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        while True:
            if process.poll() != None:
                break
        output = io_method(process.communicate()[0])
        exitCode = process.returncode
        if (exitCode == 0):
            print "done"
            out.write(output)
            out.close()
        else:
            raise ProcessException(command, exitCode, output)
which I've "stitched" together using these SO answers (here) and blogposts (here)
However, it does not seem to work, because my test file is 2.5GB and the script has been chewing on it for 10+mins plus I'm not really sure if what I'm doing is correct anyway.
Question:
If I don't want to use the gzip module and need to decompress chunk-by-chunk (the actual files are >10 GB), how do I uncompress and save to a file using zcat and subprocess in Python?
Thanks!
This should read the first line of every file in the logs subdirectory, unzipping as required:
#!/usr/bin/env python
import glob
import gzip
import subprocess

for f in glob.glob('logs/*'):
    if f.endswith('.gz'):
        # Open a compressed file. Here is the easy way:
        # file = gzip.open(f, 'rb')
        # Or, here is the hard way:
        proc = subprocess.Popen(['zcat', f], stdout=subprocess.PIPE)
        file = proc.stdout
    else:
        # Otherwise, it must be a regular file
        file = open(f, 'rb')
    # Process file, for example:
    print f, file.readline()
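If you need to decompress chunk-by-chunk and save to a file, as the question asks, here is a minimal sketch along the same lines; the 1 MiB chunk size and the _unzipped output suffix are arbitrary choices, not anything zcat requires:
import subprocess

def zcat_to_file(f, chunk_size=1024 * 1024):
    # Stream zcat's stdout to disk without holding the whole file in memory.
    proc = subprocess.Popen(['zcat', f], stdout=subprocess.PIPE)
    out = open(f + '_unzipped', 'wb')
    while True:
        chunk = proc.stdout.read(chunk_size)
        if not chunk:
            break
        out.write(chunk)
    out.close()
    if proc.wait() != 0:
        raise RuntimeError('zcat failed on %s' % f)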
