I have to write a Python script that performs the following tasks:
1- Download the Movielens dataset from the URL 'http://files.grouplens.org/datasets/movielens/ml-25m.zip'
2- Download the Movielens checksum from the URL 'http://files.grouplens.org/datasets/movielens/ml-25m.zip.md5'
3- Check whether the checksum of the archive corresponds to the downloaded one
4- If the check passes, print the names of the files contained in the downloaded archive
This is what I wrote up to now:
from zipfile import ZipFile
from urllib import request
import hashlib
def md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

url_datasets = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip'
datasets = 'datasets.zip'
url_checksum = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip.md5'
request.urlretrieve(url_datasets, datasets)
request.urlretrieve(url_checksum, checksum)
checksum = 'datasets.zip.md5'

with ZipFile(datasets, 'r') as zipObj:
    listOfiles = zipObj.namelist()
    for elem in listOfiles:
        print(elem)
What I'm missing is a way to compare the checksum I computed with the one I downloaded; maybe I can create a function "printFiles" that checks the checksum and, if it matches, prints the list of files.
Is there anything else I can improve?
As posted, your code doesn't get through the downloads: checksum is referenced in the second urlretrieve call before it is defined, and the comparison with the downloaded md5 is still missing. Here is a version using requests that does the comparison:
from zipfile import ZipFile
import hashlib
import requests
def md5(fname):
    hash_md5 = hashlib.md5()
    hash_md5.update(open(fname, 'rb').read())
    return hash_md5.hexdigest()

url_datasets = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip'
datasets = 'datasets.zip'
url_checksum = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip.md5'
checksum = 'datasets.zip.md5'

ds = requests.get(url_datasets, allow_redirects=True)
cs = requests.get(url_checksum, allow_redirects=True)
open(datasets, 'wb').write(ds.content)

ds_md5 = md5(datasets)
cs_md5 = cs.content.decode('utf-8').split()[0]
print(ds_md5)
print(cs_md5)

if ds_md5 == cs_md5:
    print("MATCH")
    with ZipFile(datasets, 'r') as zipObj:
        listOfiles = zipObj.namelist()
        for elem in listOfiles:
            print(elem)
else:
    print("Checksum fail")
I am using the code below to get any free journal PDFs from PubMed. It does download something, but when I look at it, it just consists of the number 1. Any ideas on where I am going wrong? Thank you
import metapub
from urllib.request import urlretrieve
import textract
from pathlib import Path
another_path='/content/Articles/'
pmid_list=['35566889','33538053', '30848212']
for i in range(len(pmid_list)):
    query = pmid_list[i]
    #for ind in pmid_df.index:
    #    query = pmid_df['PMID'][ind]
    url = metapub.FindIt(query).url
    try:
        urlretrieve(url)
        file_name = query
        out_file = another_path + file_name
        with open(out_file, "w") as textfile:
            textfile.write(textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8"))
    except:
        continue
I see two mistakes.
First: urlretrieve(url) saves the data in a temporary file with a random filename, so you can't access it because you don't know that filename. You should use the second parameter to save it under your own filename:
urlretrieve(url, file_name)
Second: you use the same out_file both to process the file (process(out_file)) and to write the result (open(out_file, 'w')). open(..., 'w') truncates the file first, so you end up processing an empty file. Process the file first and open it for writing afterwards:
data = textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8")
with open(out_file, "wb") as textfile:  # save bytes
    textfile.write(data)
Or write the result under a different name (e.g. with the extension .txt).
Full working example with other small changes:
import os
from urllib.request import urlretrieve
import metapub
import textract
#another_path = '/content/Articles/'
another_path = './'
pmid_list = ['35566889','33538053', '30848212']
for query in pmid_list:
    print('query:', query)
    url = metapub.FindIt(query).url
    print('url:', url)
    if url:
        try:
            out_file = os.path.join(another_path, query)
            print('out_file:', out_file)

            print('... downloading')
            urlretrieve(url, out_file + '.pdf')

            print('... processing')
            data = textract.process(out_file + '.pdf', extension='pdf', method='pdftotext', encoding="utf_8")

            print('... saving')
            with open(out_file + '.txt', "wb") as textfile:  # save bytes
                textfile.write(data)

            print('... OK')
        except Exception as ex:
            print('Exception:', ex)
I'm translating some Linux log data to a CSV for data analytics. Some of the operations take some time, so I thought I would put in a progress bar for each file being translated. However, when I add a progress bar with either progressbar2 or tqdm, my pandas dataframes are null; there's no data at all. When I remove the progress bar, everything works as it should.
Here is my CSV translating function:
import pandas as pd
from dateutil import parser
from tqdm import trange
import os
import glob
import csv
import socket
def logsToCSV():
    print("[+] Translating log to CSV")
    log_file = open(CSV_FILE_PATH, "w", newline='')
    csv_w = csv.writer(log_file)
    for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')):  # Find all files in path with .txt
        data_file = open(filename, "r")
        file_length = len(data_file.readlines())
        for i in trange(file_length, desc='loop', leave=False):  # Progress Bar Via TQDM
            for line in data_file:
                new_line = line.strip().split(" ")
                date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
                date = parser.parse(date)
                ip = str(new_line[5]).partition("/")
                ip = str(ip[0]).strip()
                try:
                    url = str(new_line[7]).strip()
                except:
                    url = None
                csv_w.writerow([date, ip, url])
TQDM is breaking something, or I am implementing it incorrectly.
EDIT 1:
I figured it out. I was exhausting the file with readlines() when getting its length. This works:
def logsToCSV():
    print("[+] Translating log to CSV")
    log_file = open(CSV_FILE_PATH, "w", newline='')
    csv_w = csv.writer(log_file)
    path, dirs, files = next(os.walk(LOGS_FILE_PATH))
    log_num = len(files)
    print(log_num)
    for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')):  # Find all files in path with .txt
        data_file = open(filename, "r")
        with open(filename, "r") as f:
            file_length = len(f.readlines())
        pbar = tqdm(total=file_length)
        for line in data_file:
            new_line = line.strip().split(" ")
            date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
            date = parser.parse(date)
            ip = str(new_line[5]).partition("/")
            ip = str(ip[0]).strip()
            try:
                url = str(new_line[7]).strip()
            except:
                url = None
            csv_w.writerow([date, ip, url])
            pbar.update(1)
        pbar.close()
You can apply tqdm to your main loop:
from tqdm import tqdm
for line in tqdm(data_file):
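If you also want the count-based total from the edit, here is a hedged sketch of the same loop with tqdm wrapping the line iteration (logs_to_csv, logs_dir, and csv_path are hypothetical names standing in for the LOGS_FILE_PATH and CSV_FILE_PATH globals; the column positions just follow the question's format):

import csv
import glob
import os

from dateutil import parser
from tqdm import tqdm

def logs_to_csv(logs_dir, csv_path):  # hypothetical parameters, not the original globals
    with open(csv_path, "w", newline='') as log_file:
        csv_w = csv.writer(log_file)
        for filename in glob.glob(os.path.join(logs_dir, '*.txt')):
            # Count the lines once, then reopen so iteration starts from the beginning again.
            with open(filename, "r") as f:
                file_length = sum(1 for _ in f)
            with open(filename, "r") as data_file:
                # total= lets tqdm show a percentage; the bar advances once per parsed line.
                for line in tqdm(data_file, total=file_length, desc=os.path.basename(filename), leave=False):
                    new_line = line.strip().split(" ")
                    date = parser.parse("%s %s %s" % (new_line[0], new_line[1], new_line[2]))
                    ip = new_line[5].partition("/")[0].strip()
                    url = new_line[7].strip() if len(new_line) > 7 else None
                    csv_w.writerow([date, ip, url])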
I'm trying to write a Python script to search through my current directory, identify JPGs by their header, and then hash those files. I'm kind of all over the place. Any suggestions would be appreciated.
from os import listdir, getcwd
from os.path import isfile, join, normpath, basename
import hashlib
jpgHead = b'\xff\xd8\xff\xe0'
def get_files():
    current_path = normpath(getcwd())
    return [join(current_path, f) for f in listdir(current_path) if isfile(join(current_path, f))]

def checkJPG():
    checkJPG = checkJPG.read(4)
    if checkJPG == jpgHead
        get_hashes()

def get_hashes():
    files = checkJPG()
    list_of_hashes = []
    for each_file in files:
        hash_md5 = hashlib.md5()
        with open(each_file, "rb") as f:
            list_of_hashes.append('Filename: {}\tHash: {}\n'.format(basename(each_file), hash_md5.hexdigest()))
    return list_of_hashes

def write_jpgHashes():
    hashes = get_hashes()
    with open('list_of_hashes.txt', 'w') as f:
        for md5_hash in hashes:
            f.write(md5_hash)

if __name__ == '__main__':
    write_jpgHashes()
I modified some of your functions a bit, give it a try:
from os import listdir, getcwd
from os.path import isfile, join, normpath, basename
import hashlib
jpgHead = b'\xff\xd8\xff\xe0'
def get_files(path=getcwd()):
    current_path = normpath(path)
    return [join(current_path, f) for f in listdir(current_path) if isfile(join(current_path, f))]

def checkJPG(path):
    with open(path, 'rb') as f:
        header = f.read(4)
    return header == jpgHead

def get_hashes():
    list_of_hashes = []
    for each_file in get_files():
        if checkJPG(each_file):
            list_of_hashes.append('Filename: {}\tHash: {}\n'.format(each_file, md5hf(each_file)))
    return list_of_hashes

def md5hf(path):
    #return hashlib.md5(open(path, "rb").read()).hexdigest() ## you can use this line for small files ##
    hash_md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def write_jpgHashes():
    hashes = get_hashes()
    with open('list_of_hashes.txt', 'w') as f:
        for md5_hash in hashes:
            f.write(md5_hash)

if __name__ == '__main__':
    write_jpgHashes()
Notes:
Fixed some syntax and indentation errors
Turned checkJPG into a boolean check
Added the md5 hash of the files to list_of_hashes in get_hashes
Added the md5hf function to compute the md5 checksum
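For what it's worth, the helpers above can also be used on their own; a quick usage sketch (the directory path is a made-up example):

# Hash only the JPGs under a specific folder instead of the current directory
# (the path below is purely illustrative).
for each_file in get_files('/home/user/pictures'):
    if checkJPG(each_file):
        print('Filename: {}\tHash: {}'.format(each_file, md5hf(each_file)))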
I'm having a problem computing the checksums of all the files under the /bin/* directory.
I'm implementing a HIDS in Python, so I need to compute the checksum of each file and save it, say, in a list. My code here only returns the checksum of the first file under /bin/*.
import sys
import hashlib

path = sys.argv[1]  # PATH OF THE FILES, ex: /etc/shadow, /bin/*, etc.

with open(path, 'rb') as fh:
    md5 = hashlib.md5()
    while True:
        data = fh.read(8192)
        if not data:
            break
        md5.update(data)
    print md5.hexdigest()
Any suggestions?
import sys
from os import listdir
from os.path import isfile, join
import hashlib
path = sys.argv[1] #PATH OF THE FILES, ex: /etc/shadow, /bin/*, etc.
files = [ f for f in listdir(path) if isfile(join(path,f)) ]
my_files = {}
for fil in files:
    with open(join(path, fil), 'rb') as fh:
        md5 = hashlib.md5()
        while True:
            data = fh.read(8192)
            if not data:
                break
            md5.update(data)
        my_files[fil] = md5.hexdigest()

for k, v in my_files.iteritems():
    print 'file_name is {} | hash is {}'.format(k, v)
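One more note on how the original script was invoked: if you run it as python hids.py /bin/* (hids.py is just a placeholder name), the shell expands the glob before Python starts, so sys.argv[1] is only the first matching file, which would explain getting a single checksum. A small sketch that hashes every expanded argument instead, reusing the same chunked-read loop:

import sys
import hashlib

# Each path the shell expanded from /bin/* arrives as a separate argument.
for path in sys.argv[1:]:
    md5 = hashlib.md5()
    with open(path, 'rb') as fh:
        while True:
            data = fh.read(8192)
            if not data:
                break
            md5.update(data)
    print('file_name is {} | hash is {}'.format(path, md5.hexdigest()))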
Here is the situation:
I get gzipped xml documents from Amazon S3
import boto
from boto.s3.connection import S3Connection
from boto.s3.key import Key
conn = S3Connection('access Id', 'secret access key')
b = conn.get_bucket('mydev.myorg')
k = Key(b)
k.key = 'documents/document.xml.gz'
I read them into a file like this:
import gzip
f = open('/tmp/p', 'w')
k.get_file(f)
f.close()
r = gzip.open('/tmp/p', 'rb')
file_content = r.read()
r.close()
Question
How can I ungzip the streams directly and read the contents?
I do not want to create temp files, they don't look good.
Yes, you can use the zlib module to decompress byte streams:
import zlib
def stream_gzip_decompress(stream):
    dec = zlib.decompressobj(32 + zlib.MAX_WBITS)  # offset 32 to skip the header
    for chunk in stream:
        rv = dec.decompress(chunk)
        if rv:
            yield rv
The offset of 32 signals to zlib that a gzip header is expected and should be skipped.
The S3 key object is an iterator, so you can do:
for data in stream_gzip_decompress(k):
    # do something with the decompressed data
I had to do the same thing and this is how I did it:
import gzip
import StringIO

f = StringIO.StringIO()
k.get_file(f)
f.seek(0)  # This is crucial
gzf = gzip.GzipFile(fileobj=f)
file_content = gzf.read()
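For a Python 3 / boto3 setup, the same in-memory idea would look roughly like this (a sketch only, reusing the bucket and key names from the question and assuming default credentials):

import gzip
import io

import boto3

s3 = boto3.client('s3')
# Bucket and key taken from the question; adjust to your own names.
obj = s3.get_object(Bucket='mydev.myorg', Key='documents/document.xml.gz')

# Pull the compressed body into memory and decompress it without a temp file.
buf = io.BytesIO(obj['Body'].read())
with gzip.GzipFile(fileobj=buf) as gzf:
    file_content = gzf.read()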
For Python 3.x and boto3:
I used BytesIO to read the compressed file into a buffer object, then zipfile to open the stream as uncompressed data, and I was able to read it line by line.
import io
import zipfile
import boto3
import sys
s3 = boto3.resource('s3', 'us-east-1')
def stream_zip_file():
    count = 0
    obj = s3.Object(
        bucket_name='MonkeyBusiness',
        key='/Daily/Business/Banana/{current-date}/banana.zip'
    )
    buffer = io.BytesIO(obj.get()["Body"].read())
    print(buffer)
    z = zipfile.ZipFile(buffer)
    foo2 = z.open(z.infolist()[0])
    print(sys.getsizeof(foo2))
    line_counter = 0
    for _ in foo2:
        line_counter += 1
    print(line_counter)
    z.close()

if __name__ == '__main__':
    stream_zip_file()
You can try a PIPE and read the contents without downloading the file:
import subprocess

c = subprocess.Popen('zcat -c <gzip file name>', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
for row in c.stdout:
    print row
In addition "/dev/fd/" + str(c.stdout.fileno()) will provide you FIFO file name (Named pipe) which can be passed to other program.