Python HTTP download using requests.get always missing a chunk

I am trying to define a function that resumes a download if the connection is broken. However, the following does not work as expected.
On the position = f.tell()-1024 line below, I have to manually subtract one chunk size for it to work; otherwise the final file is missing exactly one chunk size for every time I resume it.
if os.path.exists(fileName):
    header = requests.head(url)
    fileLength = int(header.headers['Content-Length'])
    if fileLength == os.path.getsize(fileName):
        return True
    else:
        with open(fileName, 'ab') as f:
            position = f.tell()-1024
        pos_header = {}
        print(position)
        pos_header['Range'] = f'bytes={position}-'
        with requests.get(url, headers=pos_header, stream=True) as r:
            with open(fileName, 'ab') as f:
                # some validation should be here
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(r.content)
                        f.flush()
        print(os.path.getsize(fileName))
else:
    with requests.get(url, allow_redirects=True, stream=True) as r:
        with open(fileName, 'wb') as f:
            iter = 0
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
                    iter += 1
                    if iter > 2000:
                        break
Interestingly, the part that goes missing is the boundary between the two parts of the download. Is there a more elegant way of resolving this than what I did?

You have a bug in the code that downloads the 'rest' of the file on a second attempt. The bug is in the following line:
f.write(r.content)
It should be
f.write(chunk)
Basically, you're iterating over the chunks but writing the entire response content each time, and that messes things up.
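For clarity, here is a sketch of that resume branch with the one-line fix applied (same fileName, url, and imports as in the question). With chunk being written instead of r.content, the manual 1024-byte offset on position should no longer be needed:
with open(fileName, 'ab') as f:
    position = f.tell()
pos_header = {'Range': f'bytes={position}-'}
with requests.get(url, headers=pos_header, stream=True) as r:
    with open(fileName, 'ab') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)  # write the current chunk, not r.content
print(os.path.getsize(fileName))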

Related

How can I work around a server where large downloads sometimes hang with Python requests?

I need to download the monthly Open Library Data Dump files. They are some big files:
https://openlibrary.org/data/ol_dump_authors_latest.txt.gz
https://openlibrary.org/data/ol_dump_works_latest.txt.gz
https://openlibrary.org/data/ol_dump_editions_latest.txt.gz
It hangs while downloading the works and editions files because they are big. The problem is that I don't get any exception saying the connection failed; it just stops downloading. I know that because the file size doesn't change for hours.
First Try
dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"
session = requests.Session()
with session.get(dump_url, stream=True) as r:
    r.raise_for_status()
    with open(dump_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024*1024):
            f.write(chunk)
Second Try
dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"
session = requests.Session()
with session.get(dump_url, stream=True) as r:
    r.raise_for_status()
    with open(dump_path, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
This answer's core content was written by #Edin.A and is taken from a GitHub ticket they wrote, with their permission. Formatting and prose have been slightly modified, but other than reducing log verbosity, the code is unchanged.
This can be solved by passing requests a timeout= argument and then making a new request after a ConnectionError caused by that timeout. Note the max_d_r_c counter used to prevent an endless loop of retries:
import requests
from requests.exceptions import ConnectionError
import os

def resume_download_ol_dump_editions_latest(dump_url, dump_path, max_d_r_c):
    max_download_resumes = 30
    if max_d_r_c < max_download_resumes:
        max_d_r_c += 1
        with open(dump_path, 'ab') as f:
            position = f.tell()
        pos_header = {"Range": f"bytes={position}-"}
        with requests.Session() as s:
            try:
                with s.get(dump_url, headers=pos_header, stream=True, allow_redirects=True, timeout=300) as r:
                    r.raise_for_status()
                    with open(dump_path, 'ab') as f:
                        for chunk in r.iter_content(chunk_size=1024*1024):
                            f.write(chunk)
                            f.flush()
                            os.fsync(f.fileno())
            except ConnectionError as to:
                resume_download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path, max_d_r_c=max_d_r_c)

def download_ol_dump_editions_latest(dump_url, dump_path):
    max_download_resumes_count = 0
    with requests.Session() as s:
        try:
            with s.get(dump_url, stream=True, allow_redirects=True, timeout=300) as r:
                r.raise_for_status()
                with open(dump_path, 'wb') as f:
                    last_file_size = None
                    for chunk in r.iter_content(chunk_size=1024*1024):
                        f.write(chunk)
                        f.flush()
                        os.fsync(f.fileno())
        except ConnectionError as to:
            resume_download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path, max_d_r_c=max_download_resumes_count)

dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"
download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path)
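One way to sanity-check the result afterwards (a sketch, not part of the original answer, and assuming the server reports a Content-Length) is to compare the local file size against the server's Content-Length, as the first question above does:
import os
import requests

def download_is_complete(dump_url, dump_path):
    # Compare the local size with the Content-Length reported by a HEAD request.
    head = requests.head(dump_url, allow_redirects=True, timeout=60)
    expected = int(head.headers['Content-Length'])
    return os.path.exists(dump_path) and os.path.getsize(dump_path) == expected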

Python script failing to read CSV file with error - StopIteration

I am working on a script which downloads a large audit-log CSV file from Azure DevOps and filters the data according to a given condition. This works for a small CSV file, but for a file with a lot of data it fails with
fields = next(reader)
StopIteration
Can someone help with the changes required in the script? I am using Python 3.7.9 on macOS.
def getproject(url, pat):
    response = requests.get(url, auth=HTTPBasicAuth(username='', password=pat))
    if response.status_code == 200:
        url_data = response.content
        tempfile = open("temp.csv", "wb")
        tempfile.write(url_data)
        tempfile.close()
        return url_data
    else:
        print("\nERROR : Unable to conect The server...")

def FilterData():
    lists = []
    pro_name = []
    RepoId = []
    RepoName = []
    new_file = open("temp_new.csv", 'w', newline='')
    writer = csv.writer(new_file)
    with open("temp.csv", 'r') as readFile:
        reader = csv.reader(readFile)
        fields = next(reader)
        lists.append(fields)
        for row in reader:
            for field in row:
                if field == "Git.RepositoryCreated":
                    lists.append(row)
        writer.writerows(lists)
    readFile.close()
    new_file.close()
    os.remove("temp.csv")

    timestamp = (datetime.datetime.now())
    timestamp = timestamp.strftime("%d%B%Y_%H%M%S")
    file_name = "Data2_" + str(timestamp) + ".csv"
    file1 = open("temp_new.csv", 'r')
    df = pd.read_csv(file1)
    for i in df["Data"]:
        res = json.loads(i)
        pro_name.append(res['ProjectName'])
        RepoId.append(res['RepoId'])
        RepoName.append(res['RepoName'])
    Disp_Name = df["ActorDisplayName"]
    ActionId = df["ActionId"]
    TimeStamp = df["Timestamp"]
    file1.close()
    os.remove("temp_new.csv")
    Header = ["Actor Display Name", "Project Name", "RepoName", "RepoId", "ActionId", "Timestamp"]
    d = [Disp_Name, pro_name, RepoName, RepoId, ActionId, TimeStamp]
    export_data = zip_longest(*d, fillvalue='')
    with open(file_name, 'w', newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(Header)
        wr.writerows(export_data)
    myfile.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser("This is used for getting list of the projects")
    parser.add_argument("-o", dest="org", help="org name")
    parser.add_argument("-p", dest="pat", help="pat value")
    parser.add_argument("-sd", dest="sdate", help="Start Date")
    parser.add_argument("-ed", dest="edate", help="End Date")
    args = parser.parse_args()
    org = args.org
    token = args.pat
    startdate = args.sdate
    enddate = args.edate
    url = "https://auditservice.dev.azure.com/{org_name}/_apis/audit/downloadlog?format=csv&startTime={startdt}&endTime={enddt}&api-version=6.1-preview.1".format(org_name=org, startdt=startdate, enddt=enddate)
    # call "getproject" function to check url and token to further create required csv
    getproject(url, token)
    FilterData()
[+] In your getproject function, you should use a try/except block to handle HTTP errors etc.
[+] If the CSV file you're trying to download is quite large, it may be best to write the data in chunks.
As for the fields = next(reader) StopIteration error, I'm not sure. ¯\_(ツ)_/¯
Try throwing your code in the debugger and stepping through it.
See: download large file in python with requests
def getproject(url, pat):
    try:
        # NOTE the stream=True parameter below
        with requests.get(url, auth=HTTPBasicAuth(username='', password=pat), stream=True) as r:
            r.raise_for_status()
            with open('tmp.csv', 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    # If you have a chunk-encoded response, uncomment the if
                    # below and set the chunk_size parameter to None.
                    #if chunk:
                    f.write(chunk)
    except requests.exceptions.ConnectionError as c_error:
        print(f"[-] Connection Error: {c_error}")
    except requests.exceptions.Timeout as t_error:
        print(f"[-] Connection Timeout Error: {t_error}")
    except requests.exceptions.RequestException as req_error:
        print(f"[-] Some Ambiguous Exception: {req_error}")
# This way seems faster, based on the comments in the link I shared
import requests
import shutil

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
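On the StopIteration itself (an assumption, not something stated in the answer above): next(reader) raises StopIteration when the CSV has no rows at all, which is exactly what happens if getproject ends up writing an empty temp.csv after a bad response. Passing a default to next() makes that failure explicit instead of a crash:
import csv

with open("temp.csv", 'r', newline='') as readFile:
    reader = csv.reader(readFile)
    fields = next(reader, None)  # None instead of raising StopIteration on an empty file
    if fields is None:
        raise SystemExit("temp.csv is empty - the audit log download probably failed")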

Editing a downloaded CSV in memory before writing

Forewarning: I am very new to Python and programming in general. I am trying to use Python 3 to get some CSV data and make some changes to it before writing it to a file. My problem lies in accessing the CSV data from a variable, like so:
import csv
import requests

csvfile = session.get(url)
reader = csv.reader(csvfile.content)
for row in reader:
    do(something)
This returns:
_csv.Error: iterator should return strings, not int (did you open the file in text mode?)
Googling revealed that I should be feeding the reader text instead of bytes, so I also attempted:
reader = csv.reader(csvfile.text)
This also does not work, as the loop goes through the data letter by letter instead of line by line. I also experimented with TextIOWrapper and similar options with no success. The only way I have managed to get this to work is by writing the data to a file, reading it, and then making changes, like so:
csvfile = session.get(url)
with open("temp.txt", 'wb') as f:
    f.write(csvfile.content)
with open("temp.txt", 'rU', encoding="utf8") as data:
    reader = csv.reader(data)
    for row in reader:
        do(something)
I feel like this is far from the most optimal way of doing this, even if it works. What is the proper way to read and edit the CSV data directly from memory, without having to save it to a temporary file?
You don't have to write to a temp file. Here is what I would do, using the csv and requests modules:
import csv
import requests

__csvfilepathname__ = r'c:\test\test.csv'
__url__ = 'https://server.domain.com/test.csv'

def csv_reader(filename, enc='utf_8'):
    with open(filename, 'r', encoding=enc) as openfileobject:
        reader = csv.reader(openfileobject)
        for row in reader:
            # do something
            print(row)
    return

def csv_from_url(url):
    line = ''
    datalist = []
    s = requests.Session()
    r = s.get(url)
    for x in r.text.replace('\r', ''):
        if not x[0] == '\n':
            line = line + str(x[0])
        else:
            datalist.append(line)
            line = ''
    datalist.append(line)
    # at this point you already have a data list 'datalist'
    # no need really to use the csv.reader object, but here goes:
    reader = csv.reader(datalist)
    for row in reader:
        # do something
        print(row)
    return

def main():
    csv_reader(__csvfilepathname__)
    csv_from_url(__url__)
    return

if __name__ == '__main__':
    main()
Not very pretty, and probably not very good with regards to memory/performance, depending on how "big" your CSV data is.
HTH, Edwin.
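For comparison, a shorter sketch of the same in-memory idea (not part of the original answer): csv.reader accepts any iterable of strings, so the response text can simply be split into lines, or wrapped in io.StringIO if a file-like object is preferred:
import csv
import io
import requests

resp = requests.get(url)  # same url as in the question
resp.raise_for_status()

# Option 1: csv.reader just needs an iterable of text lines.
for row in csv.reader(resp.text.splitlines()):
    print(row)

# Option 2: wrap the text in an in-memory file-like object.
for row in csv.reader(io.StringIO(resp.text)):
    print(row)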

Append JSON to file

I am trying to append values to a JSON file. How can I append the data? I have been trying many ways, but none are working.
Code:
def all(title, author, body, type):
    title = "hello"
    author = "njas"
    body = "vgbhn"
    data = {
        "id": id,
        "author": author,
        "body": body,
        "title": title,
        "type": type
    }
    data_json = json.dumps(data)
    #data = ast.literal_eval(data)
    #print data_json
    if(os.path.isfile("offline_post.json")):
        with open('offline_post.json', 'a') as f:
            new = json.loads(f)
            new.update(a_dict)
            json.dump(new, f)
    else:
        open('offline_post.json', 'a')
        with open('offline_post.json', 'a') as f:
            new = json.loads(f)
            new.update(a_dict)
            json.dump(new, f)
How can I append data to json file when this function is called?
I suspect you left out that you're getting a TypeError in the blocks where you're trying to write the file. Here's where you're trying to write:
with open('offline_post.json', 'a') as f:
    new = json.loads(f)
    new.update(a_dict)
    json.dump(new, f)
There are a couple of problems here. First, you're passing a file object to json.loads, which expects a string. You probably meant to use json.load.
Second, you're opening the file in append mode, which places the pointer at the end of the file. When you run json.load, you're not going to get anything because it's reading at the end of the file. You would need to seek to 0 before loading (edit: this would fail anyway, as append mode is not readable).
Third, when you json.dump the new data to the file, it's going to append it to the file in addition to the old data. From the structure, it appears you want to replace the contents of the file (as the new data contains the old data already).
You probably want to use r+ mode, seeking back to the start of the file between the read and write, and truncating at the end just in case the size of the data structure ever shrinks.
with open('offline_post.json', 'r+') as f:
    new = json.load(f)
    new.update(a_dict)
    f.seek(0)
    json.dump(new, f)
    f.truncate()
Alternatively, you can open the file twice:
with open('offline_post.json', 'r') as f:
    new = json.load(f)
new.update(a_dict)
with open('offline_post.json', 'w') as f:
    json.dump(new, f)
This is a different approach: I just wanted to append without reloading all the data. It runs on a Raspberry Pi, so I want to look after memory. The test code:
import os

json_file_exists = 0
filename = "/home/pi/scratch_pad/test.json"

# remove the last run json data
try:
    os.remove(filename)
except OSError:
    pass

count = 0
boiler = 90
tower = 78

while count < 10:
    if json_file_exists == 0:
        # create the json file
        with open(filename, mode='w') as fw:
            json_string = "[\n\t{'boiler':" + str(boiler) + ",'tower':" + str(tower) + "}\n]"
            fw.write(json_string)
        json_file_exists = 1
    else:
        # append to the json file
        char = ""
        boiler = boiler + .01
        tower = tower + .02
        while char <> "}":
            with open(filename, mode='rb+') as f:
                f.seek(-1, 2)
                size = f.tell()
                char = f.read()
                if char == "}":
                    break
                f.truncate(size - 1)
        with open(filename, mode='a') as fw:
            json_string = "\n\t,{'boiler':" + str(boiler) + ",'tower':" + str(tower) + "}\n]"
            fw.seek(-1, os.SEEK_END)
            fw.write(json_string)
    count = count + 1
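A sketch of the same truncate-and-append idea, written with json.dumps so that the file stays valid JSON (the single-quoted entries above are not); the function name and record layout here are only illustrative:
import json
import os

def append_record(filename, record):
    # First call: create the file as a one-element JSON array.
    if not os.path.exists(filename):
        with open(filename, 'w') as f:
            json.dump([record], f)
        return
    # Later calls: scan back to the closing ']', chop it off, then append ',<record>]'.
    with open(filename, 'rb+') as f:
        f.seek(-1, os.SEEK_END)
        while f.read(1) != b']':
            f.seek(-2, os.SEEK_CUR)
        f.seek(-1, os.SEEK_CUR)
        f.truncate()
    with open(filename, 'a') as f:
        f.write(',' + json.dumps(record) + ']')

append_record("/home/pi/scratch_pad/test.json", {"boiler": 90, "tower": 78})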

unpack bz2 url without temporary file in python

I want to unpack data from a bz2 URL directly to the target file. Here is the code:
filename = 'temp.file'
req = urllib2.urlopen('http://example.com/file.bz2')
CHUNK = 16 * 1024
with open(filename, 'wb') as fp:
    while True:
        chunk = req.read(CHUNK)
        if not chunk: break
        fp.write(bz2.decompress(chunk))
fp.close()
Error on bz2.decompress(chunk) - ValueError: couldn't find end of stream
Use bz2.BZ2Decompressor to do sequential decompression:
filename = 'temp.file'
req = urllib2.urlopen('http://example.com/file.bz2')
CHUNK = 16 * 1024
decompressor = bz2.BZ2Decompressor()
with open(filename, 'wb') as fp:
    while True:
        chunk = req.read(CHUNK)
        if not chunk:
            break
        fp.write(decompressor.decompress(chunk))
req.close()
BTW, you don't need to call fp.close() as long as you use the with statement.
If you only want to stream the compressed file to disk (without decompressing it on the fly), requests in streaming mode keeps it short:
import shutil
import requests

req = requests.get('http://example.com/file.bz2', stream=True)
with open(filename, 'wb') as fp:
    shutil.copyfileobj(req.raw, fp)
You should use BZ2Decompressor, which supports incremental decompression; see https://docs.python.org/2/library/bz2.html#bz2.BZ2Decompressor
I haven't debugged this, but it should work like this:
filename = 'temp.file'
req = urllib2.urlopen('http://example.com/file.bz2')
CHUNK = 16 * 1024
decompressor = bz2.BZ2Decompressor()
with open(filename, 'wb') as fp:
    while True:
        chunk = req.read(CHUNK)
        if not chunk: break
        decomp = decompressor.decompress(chunk)
        if decomp:
            fp.write(decomp)
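The same incremental decompression also works with requests in streaming mode; here is a sketch combining the approaches above (the URL and filename are the same placeholders used throughout this question):
import bz2
import requests

filename = 'temp.file'
decompressor = bz2.BZ2Decompressor()
with requests.get('http://example.com/file.bz2', stream=True) as r:
    r.raise_for_status()
    with open(filename, 'wb') as fp:
        for chunk in r.iter_content(chunk_size=16 * 1024):
            data = decompressor.decompress(chunk)
            if data:
                fp.write(data)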
