I have a file with a special file ending (.dsx) that basically is a gzip'ed XML file. I am able to decompress that file using
$ gzip -S .dsx -d -c file.dsx
<prints my XML to the console>
I'm trying to do the same thing with Python:
import sys
import gzip
def main():
    """Load the gzip-compressed file named by the first command-line argument.

    The whole decompressed payload is read into memory as bytes.  If the
    file does not start with the gzip magic number, the read raises
    gzip.BadGzipFile (a plain OSError on older Python versions).
    """
    with gzip.open(sys.argv[1], 'rb') as compressed:
        file_content = compressed.read()
    # ...


if __name__ == "__main__":
    main()
But I get the following error:
Traceback (most recent call last):
File "<truncated>\decode-dsx.py", line 17, in <module>
main()
File "<truncated>\decode-dsx.py", line 13, in main
file_content = f.read()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.1520.0_x64__qbz5n2kfra8p0\lib\gzip.py", line 301, in read
return self._buffer.read(size)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.1520.0_x64__qbz5n2kfra8p0\lib\_compression.py", line 118, in readall
while data := self.read(sys.maxsize):
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.1520.0_x64__qbz5n2kfra8p0\lib\gzip.py", line 488, in read
if not self._read_gzip_header():
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.1520.0_x64__qbz5n2kfra8p0\lib\gzip.py", line 436, in _read_gzip_header
raise BadGzipFile('Not a gzipped file (%r)' % magic)
gzip.BadGzipFile: Not a gzipped file (b'PK')
I also tried calling the module directly, after renaming the file so that it has the .gz file extension, and running:
python -m gzip -d .\file.gz
It produces the same error.
Thanks to @jasonharper's hint: it was indeed just a regular zip file (the b'PK' magic bytes in the error message are the zip signature).
import sys
import zipfile
import os
def main():
    """Print the XML document stored inside a zip-packaged file.

    The archive path is taken from the first command-line argument.  The
    archive is expected to contain a member named ``<archive file name>.xml``
    (for example ``file.dsx.xml`` inside ``file.dsx``).

    Raises:
        zipfile.BadZipFile: if the file is not a zip archive.
        KeyError: if the expected member is missing from the archive.
    """
    file_path = sys.argv[1]
    file_name = os.path.basename(file_path)
    with zipfile.ZipFile(file_path, mode="r") as archive:
        xml_bytes = archive.read(f'{file_name}.xml')
    # The archive yields bytes; decode them so the XML itself is printed
    # instead of its b'...' representation.
    print(xml_bytes.decode("utf-8", errors="replace"))


if __name__ == "__main__":
    main()
Related
I have read through the Python documentation about zip files and watched a couple of videos, but nothing worked. I'm using Kali Linux, and the password has to be passed as bytes.
Here is my code, with which I have tried:
import zipfile
import string
import traceback
def try_function(zip, pwd):
    """Try to extract every member of *zip* with *pwd* as the password.

    The password string is encoded to bytes before it is handed to
    ``extractall``.  Prints "Yes" when extraction finishes and "No" when a
    TypeError is raised; any other exception propagates to the caller.
    """
    try:
        zip.extractall(pwd=pwd.encode())
        outcome = "Yes"
    except TypeError:
        outcome = "No"
    print(outcome)
# The archive is opened at module import time, before the __main__ guard runs.
z = zipfile.ZipFile("test.txt.zip")
# Single candidate password handed to try_function().
pwd_local = "abc"
if __name__ == '__main__':
    try_function(z, pwd_local)
But I always get the same error:
Traceback (most recent call last):
File "ZipWorker.py", line 22, in <module>
try_function(z, pwd_list)
File "ZipWorker.py", line 11, in crack
zip.extractall(pwd.encode())
File "/usr/lib/python3.9/zipfile.py", line 1633, in extractall
self._extract_member(zipinfo, path, pwd)
File "/usr/lib/python3.9/zipfile.py", line 1686, in _extract_member
with self.open(member, pwd=pwd) as source, \
File "/usr/lib/python3.9/zipfile.py", line 1559, in open
return ZipExtFile(zef_file, mode, zinfo, pwd, True)
File "/usr/lib/python3.9/zipfile.py", line 797, in __init__
self._decompressor = _get_decompressor(self._compress_type)
File "/usr/lib/python3.9/zipfile.py", line 698, in _get_decompressor
_check_compression(compress_type)
File "/usr/lib/python3.9/zipfile.py", line 678, in _check_compression
raise NotImplementedError("That compression method is not supported")
NotImplementedError: That compression method is not supported
Does anyone know how to do this? I'm using python3.9.
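So I finally found out why the code above doesn't work.
When you create a password-protected zip file with, for example, 7-Zip, its contents are encrypted.
Passing the password as bytes is not the problem; what matters is the encryption method, which is either AES-256 or ZipCrypto. Python's zipfile module can only decrypt ZipCrypto archives, so an archive encrypted with AES-256 fails with the NotImplementedError shown above.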
I'm trying to make a tokenizer. I have a file that I'm trying to read with gzip, but it gives the following error:
Traceback (most recent call last):
File "extract_sends.py", line 14, in <module>
main()
File "extract_sends.py", line 12, in main
file_content = f.read()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/gzip.py", line 276, in read
return self._buffer.read(size)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/gzip.py", line 463, in read
if not self._read_gzip_header():
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/gzip.py", line 411, in _read_gzip_header
raise OSError('Not a gzipped file (%r)' % magic)
OSError: Not a gzipped file (b'# ')
This is my code. I'm just getting started, but if Python can't read the file I won't get far.
import gzip
import sys
import re
def main():
    """Open the path stored in sys.argv[0] as gzip text and read all of it."""
    # sys.argv[0] is the path of the running script itself, not the first
    # argument typed after it on the command line.
    source_path = sys.argv[0]
    with gzip.open(source_path, 'rt') as stream:
        file_content = stream.read()
main()
The file is a .txt.gz file
You should try the simplest ever debugging technique: print the value you are trying to use.
Anyway if you did that you would see that sys.argv[0] is not the filename parameter you put on the commandline after the command to run your code - that is sys.argv[1]
So change:
file = sys.argv[0]
To:
# sys.argv[1] is the first argument given after the script name.
file = sys.argv[1]
# Plain ASCII quotes: typographic quotes are a syntax error in Python.
print("Reading from file", file)
I am trying to read a password protected word document on Python using zipfile.
The following code works with a non-password protected document, but gives an error when used with a password protected file.
try:
from xml.etree.cElementTree import XML
except ImportError:
from xml.etree.ElementTree import XML
import zipfile
psw = "1234"
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'
def get_docx_text(path):
    """Return the plain text of the .docx file at *path*.

    The document is opened as a zip archive and ``word/document.xml`` is
    parsed.  The text runs of each paragraph are concatenated, paragraphs
    without any text are skipped, and the remaining paragraphs are joined
    with a blank line between them.

    Raises:
        zipfile.BadZipFile: if *path* is not a zip archive.  This is raised
            while the archive is being opened, before the password is set.
    """
    # setpassword() requires bytes on Python 3; on Python 2 str already is bytes.
    password = psw if isinstance(psw, bytes) else psw.encode('utf-8')
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(path, "r") as document:
        document.setpassword(password)
        # Read the member straight from the archive; nothing is unpacked to disk.
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
When running get_docx_text() with a password protected file, I received the following error:
Traceback (most recent call last):
File "<ipython-input-15-d2783899bfe5>", line 1, in <module>
runfile('/Users/username/Workspace/Python/docx2txt.py', wdir='/Users/username/Workspace/Python')
File "/Applications/Spyder-Py2.app/Contents/Resources/lib/python2.7/spyderlib/widgets/externalshell/sitecustomize.py", line 680, in runfile
execfile(filename, namespace)
File "/Applications/Spyder-Py2.app/Contents/Resources/lib/python2.7/spyderlib/widgets/externalshell/sitecustomize.py", line 78, in execfile
builtins.execfile(filename, *where)
File "/Users/username/Workspace/Python/docx2txt.py", line 41, in <module>
x = get_docx_text("/Users/username/Desktop/file.docx")
File "/Users/username/Workspace/Python/docx2txt.py", line 23, in get_docx_text
document = zipfile.ZipFile(path, "r")
File "zipfile.pyc", line 770, in __init__
File "zipfile.pyc", line 811, in _RealGetContents
BadZipfile: File is not a zip file
Does anyone have any advice to get this code to work?
I don't think this is an encryption problem, for two reasons:
Decryption is not attempted when the ZipFile object is created. Methods like ZipFile.extractall, extract, open, and read take an optional pwd parameter containing the password, but the object constructor / initializer does not.
Your stack trace indicates that the BadZipFile is being raised when you create the ZipFile object, before you call setpassword:
document = zipfile.ZipFile(path, "r")
I'd look carefully for other differences between the two files you're testing: ownership, permissions, security context (if you have that on your OS), ... even filename differences can cause a framework to "not see" the file you're working on.
Also --- the obvious one --- try opening the encrypted zip file with your zip-compatible command of choice. See if it really is a zip file.
I tested this by opening an encrypted zip file in Python 3.1, while "forgetting" to provide a password. I could create the ZipFile object (the variable zfile below) without any error, but got a RuntimeError --- not a BadZipFile exception --- when I tried to read a file without providing a password:
Traceback (most recent call last):
File "./zf.py", line 35, in <module>
main()
File "./zf.py", line 29, in main
print_checksums(zipfile_name)
File "./zf.py", line 22, in print_checksums
for checksum in checksum_contents(zipfile_name):
File "./zf.py", line 13, in checksum_contents
inner_file = zfile.open(inner_filename, "r")
File "/usr/lib64/python3.1/zipfile.py", line 903, in open
"password required for extraction" % name)
RuntimeError: File apache.log is encrypted, password required for extraction
I was also able to raise a BadZipfile exception, once by trying to open an empty file and once by trying to open some random logfile text that I'd renamed to a ".zip" extension. The two test files produced identical stack traces, down to the line numbers.
Traceback (most recent call last):
File "./zf.py", line 35, in <module>
main()
File "./zf.py", line 29, in main
print_checksums(zipfile_name)
File "./zf.py", line 22, in print_checksums
for checksum in checksum_contents(zipfile_name):
File "./zf.py", line 10, in checksum_contents
zfile = zipfile.ZipFile(zipfile_name, "r")
File "/usr/lib64/python3.1/zipfile.py", line 706, in __init__
self._GetContents()
File "/usr/lib64/python3.1/zipfile.py", line 726, in _GetContents
self._RealGetContents()
File "/usr/lib64/python3.1/zipfile.py", line 738, in _RealGetContents
raise BadZipfile("File is not a zip file")
zipfile.BadZipfile: File is not a zip file
This stack trace isn't exactly the same as yours --- mine has a call to _GetContents, and the pre-3.2 "small f" spelling of BadZipfile --- but they're close enough that I think this is the kind of problem you're dealing with.
I have a simple task but cannot make my code work. I want to loop over the URLs listed in my text file and download each of them using the wget module in Python. Each URL is placed on a separate line in the text file.
Basically, this is the structure of the list in my textfile:
http://e4ftl01.cr.usgs.gov//MODIS_Composites/MOLT/MOD11C3.005/2000.03.01/MOD11C3.A2000061.005.2007177231646.hdf
http://e4ftl01.cr.usgs.gov//MODIS_Composites/MOLT/MOD11C3.005/2014.12.01/MOD11C3.A2014335.005.2015005235231.hdf
In total there are about 178 lines, one URL per line. Each downloaded file should then be saved in the current working directory.
Below is the initial code that I am working:
import os, fileinput, urllib2 as url, wget
# Switch to the target folder first; presumably the downloads are meant to land here.
os.chdir("E:/Test/dwnld")
# NOTE(review): every line yielded by FileInput still ends with the newline
# read from the file, and that raw string is what wget.download() receives.
for line in fileinput.FileInput("E:/Test/dwnld/data.txt"):
    print line
    openurl = wget.download(line)
The error message is:
Traceback (most recent call last): File "E:\Python_scripts\General_purpose\download_url_from_textfile.py", line 5, in <module>
openurl = wget.download(line) File "C:\Python278\lib\site-packages\wget.py", line 297, in download
(fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir=".") File "C:\Python278\lib\tempfile.py", line 308, in mkstemp
return _mkstemp_inner(dir, prefix, suffix, flags) File "C:\Python278\lib\tempfile.py", line 239, in _mkstemp_inner
fd = _os.open(file, flags, 0600) OSError: [Errno 22] Invalid argument: ".\\MOD11C3.A2000061.005.2007177231646.hdf'\n.frbfrp.tmp"
Try to use urllib.urlretrieve. Check the documentation here: https://docs.python.org/2/library/urllib.html#urllib.urlretrieve
I have a script that connects to server and makes a local copy of the whole directory.
An EOFError occurs after exactly 50 files have been downloaded, regardless of which files they are.
Can anyone please tell me, what is wrong with the script?
ERROR:
Traceback (most recent call last):
File "ftp.py", line 37, in <module>
ftp_walk(ftp)
File "ftp.py", line 17, in ftp_walk
currdir = ftp.pwd()[1:]
File "/usr/lib/python2.7/ftplib.py", line 574, in pwd
resp = self.sendcmd('PWD')
File "/usr/lib/python2.7/ftplib.py", line 244, in sendcmd
return self.getresp()
File "/usr/lib/python2.7/ftplib.py", line 210, in getresp
resp = self.getmultiline()
File "/usr/lib/python2.7/ftplib.py", line 196, in getmultiline
line = self.getline()
File "/usr/lib/python2.7/ftplib.py", line 186, in getline
if not line: raise EOFError
EOFError
SCRIPT:
#!/usr/bin/python
import ftplib
import sys
import os
import datetime
def ftp_walk(ftp):
    # Recursively mirror the server's current directory into a local tree.
    # `ftp` is a logged-in ftplib.FTP connection already positioned at the
    # directory to copy.
    dirs = ftp.nlst()
    # Skip the '.' and '..' entries so the walk never loops or climbs upwards.
    for item in (path for path in dirs if path not in ('.', '..')):
        try:
            # If cwd() succeeds the entry is a directory: descend, then step back up.
            ftp.cwd(item)
            print datetime.datetime.now().strftime("%Y-%m-%d %H:%M")+' DIR: ', ftp.pwd()
            ftp_walk(ftp)
            ftp.cwd('..')
        except Exception, e:
            # Any failure above is taken to mean the entry is a regular file.
            # Drop the first character of the remote path (presumably the
            # leading '/') so it can be used as a relative local path.
            currdir = ftp.pwd()[1:]
            if not os.path.exists(currdir): os.makedirs(currdir)
            try:
                with open(currdir+"/"+item, 'wb') as f:
                    # retrbinary() calls this once per received block of data.
                    def callback(data):
                        f.write(data)
                    ftp.retrbinary('RETR %s' % item, callback)
                    # Redundant: the with-statement closes the file as well.
                    f.close()
                print datetime.datetime.now().strftime("%Y-%m-%d %H:%M")+' RETR: '+ currdir+"/"+item
            except Exception, e:
                # Download errors are only printed; the walk continues with the next entry.
                print e
# Connect and authenticate (host and credentials are placeholders).
ftp = ftplib.FTP("hhhhhhhhhhhhhh")
ftp.login("aaaaaaaa", "bbbbbbbbbbb")
ftp.sendcmd("TYPE I") #binary mode
ftp.set_pasv(True) # Trying Passive mode
# Start the walk from the remote images directory.
ftp.cwd("public_html/eeeeeeee/rrrrrrrr/images")
ftp_walk(ftp)
ftp.quit()
Edit:
After manual update of ftplib for python 2.7:
Traceback (most recent call last):
File "ftp.py", line 29, in <module>
ftp = ftplib.FTP("something.com")
File "/usr/lib/python2.7/ftplib.py", line 114, in __init__
self.connect(host)
File "/usr/lib/python2.7/ftplib.py", line 150, in connect
self.file = self.sock.makefile('r', encoding=self.encoding)
TypeError: makefile() got an unexpected keyword argument 'encoding'
I have tried your script and it works without any problems. I just pulled 233 images from my server using it. Try setting the current dir to ftp.cwd("./public_html/eeeeeeee/rrrrrrrr/images") and see what happens...