Unpack bz2 URL without temporary file in Python

I want to unpack data from a bz2 URL directly to a target file. Here is the code:
filename = 'temp.file'
req = urllib2.urlopen('http://example.com/file.bz2')
CHUNK = 16 * 1024
with open(filename, 'wb') as fp:
    while True:
        chunk = req.read(CHUNK)
        if not chunk: break
        fp.write(bz2.decompress(chunk))
    fp.close()
It errors on bz2.decompress(chunk) with: ValueError: couldn't find end of stream

Use bz2.BZ2Decompressor to do sequential decompression:
import bz2
import urllib2
filename = 'temp.file'
req = urllib2.urlopen('http://example.com/file.bz2')
CHUNK = 16 * 1024
decompressor = bz2.BZ2Decompressor()
with open(filename, 'wb') as fp:
    while True:
        chunk = req.read(CHUNK)
        if not chunk:
            break
        fp.write(decompressor.decompress(chunk))
req.close()
BTW, you don't need to call fp.close() as long as you use the with statement.

Here's a more direct and efficient way using requests in streaming mode:
import requests
import shutil
filename = 'temp.file'
req = requests.get('http://example.com/file.bz2', stream=True)
with open(filename, 'wb') as fp:
    shutil.copyfileobj(req.raw, fp)
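Note that copying req.raw writes the compressed bytes to disk as-is; if you want the decompressed data on disk, a minimal sketch (using the same placeholder URL and filename from the question) is to feed the streamed chunks through bz2.BZ2Decompressor:
import bz2
import requests

# Sketch: stream the compressed response and decompress it on the fly.
# The URL and filename are the placeholders from the question.
decompressor = bz2.BZ2Decompressor()
with requests.get('http://example.com/file.bz2', stream=True) as r:
    r.raise_for_status()
    with open('temp.file', 'wb') as fp:
        for chunk in r.iter_content(chunk_size=16 * 1024):
            data = decompressor.decompress(chunk)
            if data:
                fp.write(data)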

You should use BZ2Decompressor, which supports incremental decompression; see https://docs.python.org/2/library/bz2.html#bz2.BZ2Decompressor
I haven't debugged this, but it should work like this:
filename = 'temp.file'
req = urllib2.urlopen('http://example.com/file.bz2')
CHUNK = 16 * 1024
decompressor = bz2.BZ2Decompressor()
with open(filename, 'wb') as fp:
    while True:
        chunk = req.read(CHUNK)
        if not chunk: break
        decomp = decompressor.decompress(chunk)
        if decomp:
            fp.write(decomp)

Related

How can I work around a server where large downloads sometimes hang with Python requests?

I need to download the monthly Open Library Data Dump files; they are big files:
https://openlibrary.org/data/ol_dump_authors_latest.txt.gz
https://openlibrary.org/data/ol_dump_works_latest.txt.gz
https://openlibrary.org/data/ol_dump_editions_latest.txt.gz
The download hangs on the works and editions files because they are big. The problem is that I don't get any exception that the connection failed; it just stops downloading. I know this because the file size won't change for hours.
First Try
dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"
session = requests.Session()
with session.get(dump_url, stream=True) as r:
    r.raise_for_status()
    with open(dump_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024*1024):
            f.write(chunk)
Second Try
dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"
session = requests.Session()
with session.get(dump_url, stream=True) as r:
    r.raise_for_status()
    with open(dump_path, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
This answer's core content was written by #Edin.A and is taken from a GitHub ticket they wrote, with their permission. Formatting and prose have been slightly modified, but other than reducing log verbosity, the code is unchanged.
This can be solved by passing requests a timeout= argument, and then making a new request after a ConnectionError caused by that timeout. Note the max_d_r_c limit used to prevent an endless loop:
import requests
from requests.exceptions import ConnectionError
import os

def resume_download_ol_dump_editions_latest(dump_url, dump_path, max_d_r_c):
    max_download_resumes = 30
    if max_d_r_c < max_download_resumes:
        max_d_r_c += 1
        with open(dump_path, 'ab') as f:
            position = f.tell()
        pos_header = {"Range": f"bytes={position}-"}
        with requests.Session() as s:
            try:
                with s.get(dump_url, headers=pos_header, stream=True, allow_redirects=True, timeout=300) as r:
                    r.raise_for_status()
                    with open(dump_path, 'ab') as f:
                        for chunk in r.iter_content(chunk_size=1024*1024):
                            f.write(chunk)
                            f.flush()
                            os.fsync(f.fileno())
            except ConnectionError as to:
                resume_download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path, max_d_r_c=max_d_r_c)

def download_ol_dump_editions_latest(dump_url, dump_path):
    max_download_resumes_count = 0
    with requests.Session() as s:
        try:
            with s.get(dump_url, stream=True, allow_redirects=True, timeout=300) as r:
                r.raise_for_status()
                with open(dump_path, 'wb') as f:
                    last_file_size = None
                    for chunk in r.iter_content(chunk_size=1024*1024):
                        f.write(chunk)
                        f.flush()
                        os.fsync(f.fileno())
        except ConnectionError as to:
            resume_download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path, max_d_r_c=max_download_resumes_count)

dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"
download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path)
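If a download still appears to stall, one extra sanity check (a sketch, not part of the original answer) is to compare the local file size against the server-reported Content-Length from a HEAD request before deciding whether to resume:
import os
import requests

def needs_resume(dump_url, dump_path):
    # Sketch: compare the local size against the size the server reports.
    # Assumes the server returns a Content-Length header for the dump.
    head = requests.head(dump_url, allow_redirects=True, timeout=60)
    head.raise_for_status()
    remote_size = int(head.headers["Content-Length"])
    local_size = os.path.getsize(dump_path) if os.path.exists(dump_path) else 0
    return local_size < remote_size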

Python script failing to read CSV file with error - StopIteration

I am working on a script which downloads a large audit log CSV file from Azure DevOps and filters the data according to a given condition. This works for a small CSV file, but for a file with a lot of data it fails with:
fields = next(reader)
StopIteration
Can someone help with the changes required in the script? I am using Python 3.7.9 on macOS.
def getproject(url,pat):
    response = requests.get(url, auth=HTTPBasicAuth(username='', password=pat))
    if response.status_code == 200:
        url_data = response.content
        tempfile = open("temp.csv","wb")
        tempfile.write(url_data)
        tempfile.close()
        return url_data
    else:
        print("\nERROR : Unable to connect to the server...")

def FilterData():
    lists = []
    pro_name = []
    RepoId = []
    RepoName = []
    new_file = open("temp_new.csv", 'w', newline='')
    writer = csv.writer(new_file)
    with open("temp.csv", 'r') as readFile:
        reader = csv.reader(readFile)
        fields = next(reader)
        lists.append(fields)
        for row in reader:
            for field in row:
                if field == "Git.RepositoryCreated":
                    lists.append(row)
        writer.writerows(lists)
    readFile.close()
    new_file.close()
    os.remove("temp.csv")
    timestamp = (datetime.datetime.now())
    timestamp = timestamp.strftime("%d%B%Y_%H%M%S")
    file_name = "Data2_"+str(timestamp)+".csv"
    file1 = open("temp_new.csv",'r')
    df = pd.read_csv(file1)
    for i in df["Data"]:
        res = json.loads(i)
        pro_name.append(res['ProjectName'])
        RepoId.append(res['RepoId'])
        RepoName.append(res['RepoName'])
    Disp_Name = df["ActorDisplayName"]
    ActionId = df["ActionId"]
    TimeStamp = df["Timestamp"]
    file1.close()
    os.remove("temp_new.csv")
    Header = ["Actor Display Name","Project Name","RepoName","RepoId","ActionId","Timestamp"]
    d = [Disp_Name,pro_name,RepoName,RepoId,ActionId,TimeStamp]
    export_data = zip_longest(*d, fillvalue='')
    with open(file_name, 'w', newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(Header)
        wr.writerows(export_data)
    myfile.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser("This is used for getting list of the projects")
    parser.add_argument("-o", dest="org", help="org name")
    parser.add_argument("-p", dest="pat", help="pat value")
    parser.add_argument("-sd", dest="sdate", help="Start Date")
    parser.add_argument("-ed", dest="edate", help="End Date")
    args = parser.parse_args()
    org = args.org
    token = args.pat
    startdate = args.sdate
    enddate = args.edate
    url = "https://auditservice.dev.azure.com/{org_name}/_apis/audit/downloadlog?format=csv&startTime={startdt}&endTime={enddt}&api-version=6.1-preview.1".format(org_name=org, startdt=startdate, enddt=enddate)
    # call "getproject" function to check url and token to further create required csv
    getproject(url, token)
    FilterData()
[+] In your getproject function, you should use a try/except block to handle HTTP errors etc.
[+] If the CSV file you're trying to download is quite large, it may be best to write the data in chunks.
As for the fields = next(reader) StopIteration error, I'm not sure. ¯\_(ツ)_/¯
Try throwing your code in the debugger and stepping through it.
See: download large file in python with requests
def getproject(url,pat):
    try:
        # NOTE the stream=True parameter below
        with requests.get(url, auth=HTTPBasicAuth(username='', password=pat), stream=True) as r:
            r.raise_for_status()
            with open('tmp.csv', 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    # If you have chunk encoded response uncomment if
                    # and set chunk_size parameter to None.
                    #if chunk:
                    f.write(chunk)
    except requests.exceptions.ConnectionError as c_error:
        print(f"[-] Connection Error: {c_error}")
    except requests.exceptions.Timeout as t_error:
        print(f"[-] Connection Timeout Error: {t_error}")
    except requests.exceptions.RequestException as req_error:
        print(f"[-] Some Ambiguous Exception: {req_error}")
# This way seems faster based upon the comments of the link I shared
import requests
import shutil

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
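For reference, a call would look something like this (the URL is just a placeholder):
# Hypothetical usage; any directly downloadable URL works here.
local_path = download_file("https://example.com/audit_log.csv")
print(local_path)  # audit_log.csv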

Python HTTP download using requests.get always missing a chunk

I am trying to define a function that resumes a download if the connection is broken. However, the following does not work as expected.
On line 8 (position = f.tell()-1024), I have to manually subtract one chunk size for it to work; otherwise, the final file is missing exactly one chunk size for each time I resume it.
if os.path.exists(fileName):
    header = requests.head(url)
    fileLength = int(header.headers['Content-Length'])
    if fileLength == os.path.getsize(fileName):
        return True
    else:
        with open(fileName, 'ab') as f:
            position = f.tell()-1024
            pos_header = {}
            print(position)
            pos_header['Range'] = f'bytes={position}-'
        with requests.get(url, headers=pos_header, stream=True) as r:
            with open(fileName, 'ab') as f:
                #some validation should be here
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(r.content)
                        f.flush()
        print(os.path.getsize(fileName))
else:
    with requests.get(url, allow_redirects=True, stream=True) as r:
        with open(fileName, 'wb') as f:
            iter = 0
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
                    iter += 1
                    if iter > 2000:
                        break
Interestingly, the missing part is in between the two parts of the download. Is there a more elegant way of resolving this than what I did?
You have a bug in the code that downloads the 'rest' of the file if it's the second attempt. The bug is in the following line:
f.write(r.content)
It should be
f.write(chunk)
Basically, you're iterating over chunks but writing the entire content, and that messes things up.
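For clarity, here is a minimal sketch of that resume branch with the fix applied (fileName and url are assumed to be defined as in the question; with the chunk written correctly, the manual -1024 offset should no longer be needed):
import requests

# Sketch: the resume branch from the question, writing each chunk instead of r.content.
with open(fileName, 'ab') as f:
    position = f.tell()  # no manual -1024 adjustment once the write bug is fixed
pos_header = {'Range': f'bytes={position}-'}
with requests.get(url, headers=pos_header, stream=True) as r:
    with open(fileName, 'ab') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)  # write the chunk, not r.content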

How to read and write binary data as plist or zip?

f.pk is basically a container of base64 + zip which I need to import in Python and extract. The zip file is p.plist, so f.pk = path + name + data of p.plist.
I can't find any working encoding for open() or codecs.open() to open it as a str and save the output. I always end up with a generated output.plist that is different from the original.
Encodings I have already tried include ASCII, UTF-x, Latin_1, and ISO-x.
import codecs, os
with open('f.pk', 'r', encoding='Latin_1') as f:
    f_open = f.read()
with codecs.open('f.pk', 'r', encoding='zip') as f:
    f_open = f.read()
f2 = f_open[3:] #SKIP DUMMY PART
f3 = f2.split('-DATA-')
f4 = f3[1].split('-COMMENT-')
with open('output.plist', 'w') as f:
    print(f_out, file=f)
original.plist = 5e03964972def5b83880397b7377e6d1aea33e2b
output.plist = 6473aea0ae8bc75a04859effe1ee366de4cdd2d2
I have deeply analyzed both files, with no success.
import os
import pickle
with open('file.pk', 'rb') as f:
    f_open = f.read()
# do something with bytes here
with open('p.temp', 'wb') as f:
    pickle.dump(f_open, f)  # temp file
# reload temp file
with open("p.temp", 'rb') as f:
    data = f.read()
# skip encodings and skip unwanted bytes
data = data[4:-3]
# save it
with open('p.plist', 'wb') as f:
    f.write(data)
os.remove('p.temp')
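The same idea can be done entirely in memory, without the pickle round-trip. This is only a sketch: the -DATA- and -COMMENT- markers come from the question's own splitting code, and the exact offsets may need adjusting for the real container format:
# Sketch: slice the embedded plist payload out of f.pk as raw bytes.
with open('f.pk', 'rb') as f:
    raw = f.read()
# Markers taken from the question; adjust if the container differs.
payload = raw.split(b'-DATA-')[1].split(b'-COMMENT-')[0]
with open('p.plist', 'wb') as out:
    out.write(payload)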

How to divide a file into several files using Python

I have a video file and I need to divide it into several smaller files of 256 KB each, save all the file names in a text file, and then read all the small files and merge them back into the original file.
Is it possible to do this in Python, and how?
First stab at splitting:
input_file = open(input_filename, 'rb')
blocksize = 4096
chunksize = 1024 * 256
buf = None
chunk_num = 0
current_read = 0
output_filename = 'output-chunk-{:04d}'.format(chunk_num)
output_file = open(output_filename, 'wb')
while buf is None or len(buf) > 0:
    buf = input_file.read(blocksize)
    current_read += len(buf)
    output_file.write(buf)
    if chunksize <= current_read:
        output_file.close()
        current_read = 0
        chunk_num += 1
        output_filename = 'output-chunk-{:04d}'.format(chunk_num)
        output_file = open(output_filename, 'wb')
output_file.close()
input_file.close()
This might get you partway there; adapt as needed.
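The question also asks to save the chunk file names in a text file; a small addition after the splitting loop (a sketch reusing the chunk_num counter and naming scheme above) could write a manifest:
# Sketch: record every generated chunk name in a manifest file.
# Assumes the splitting loop above has finished and chunk_num is the last chunk index.
with open('chunks.txt', 'w') as manifest:
    for i in range(chunk_num + 1):
        manifest.write('output-chunk-{:04d}\n'.format(i))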
Merging:
blocksize = 4096
chunk_num = 0
input_filename = 'output-chunk-{:04d}'.format(chunk_num)
output_filename = 'reconstructed.bin'
output_file = open(output_filename, 'wb')
while True:
    try:
        input_file = open(input_filename, 'rb')
    except IOError:
        break
    buf = None
    while buf is None or len(buf) > 0:
        buf = input_file.read(blocksize)
        output_file.write(buf)
    input_file.close()
    chunk_num += 1
    input_filename = 'output-chunk-{:04d}'.format(chunk_num)
output_file.close()
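To confirm the merge reproduced the original file, a quick check (a sketch using hashlib; original_path is a placeholder for the source video's path) is to compare checksums:
import hashlib

def sha256_of(path, blocksize=4096):
    # Hash the file in blocks so large videos don't have to fit in memory.
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            h.update(block)
    return h.hexdigest()

# original_path is a placeholder for the original video file's path.
print(sha256_of(original_path) == sha256_of('reconstructed.bin'))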
