I am working on a script which downloads a large audit log CSV file from Azure DevOps and filters the data according to a given condition. This works for a small CSV file, but for a file with a lot of data it fails with:

fields = next(reader)
StopIteration

Can someone help with the changes required in the script? I am using Python 3.7.9 on macOS.
def getproject(url, pat):
    response = requests.get(url, auth=HTTPBasicAuth(username='', password=pat))
    if response.status_code == 200:
        url_data = response.content
        tempfile = open("temp.csv", "wb")
        tempfile.write(url_data)
        tempfile.close()
        return url_data
    else:
        print("\nERROR : Unable to connect to the server...")
def FilterData():
    lists = []
    pro_name = []
    RepoId = []
    RepoName = []
    new_file = open("temp_new.csv", 'w', newline='')
    writer = csv.writer(new_file)
    with open("temp.csv", 'r') as readFile:
        reader = csv.reader(readFile)
        fields = next(reader)
        lists.append(fields)
        for row in reader:
            for field in row:
                if field == "Git.RepositoryCreated":
                    lists.append(row)
        writer.writerows(lists)
    readFile.close()
    new_file.close()
    os.remove("temp.csv")
    timestamp = datetime.datetime.now()
    timestamp = timestamp.strftime("%d%B%Y_%H%M%S")
    file_name = "Data2_" + str(timestamp) + ".csv"
    file1 = open("temp_new.csv", 'r')
    df = pd.read_csv(file1)
    for i in df["Data"]:
        res = json.loads(i)
        pro_name.append(res['ProjectName'])
        RepoId.append(res['RepoId'])
        RepoName.append(res['RepoName'])
    Disp_Name = df["ActorDisplayName"]
    ActionId = df["ActionId"]
    TimeStamp = df["Timestamp"]
    file1.close()
    os.remove("temp_new.csv")
    Header = ["Actor Display Name", "Project Name", "RepoName", "RepoId", "ActionId", "Timestamp"]
    d = [Disp_Name, pro_name, RepoName, RepoId, ActionId, TimeStamp]
    export_data = zip_longest(*d, fillvalue='')
    with open(file_name, 'w', newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(Header)
        wr.writerows(export_data)
    myfile.close()
if __name__ == '__main__':
    parser = argparse.ArgumentParser("This is used for getting list of the projects")
    parser.add_argument("-o", dest="org", help="org name")
    parser.add_argument("-p", dest="pat", help="pat value")
    parser.add_argument("-sd", dest="sdate", help="Start Date")
    parser.add_argument("-ed", dest="edate", help="End Date")
    args = parser.parse_args()
    org = args.org
    token = args.pat
    startdate = args.sdate
    enddate = args.edate
    url = "https://auditservice.dev.azure.com/{org_name}/_apis/audit/downloadlog?format=csv&startTime={startdt}&endTime={enddt}&api-version=6.1-preview.1".format(org_name=org, startdt=startdate, enddt=enddate)
    # call the "getproject" function to check the url and token, and to create the required csv
    getproject(url, token)
    FilterData()
[+] In your getproject function, you should use a try/except block to handle HTTP errors etc.
[+] If the CSV file you're trying to download is quite large, it may be best to write the data in chunks.
As for the fields = next(reader) StopIteration error, I'm not sure. ¯\_(ツ)_/¯ Try throwing your code in the debugger and stepping through it.
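One hedged observation, though: next() raises StopIteration when the reader has no rows at all, i.e. when temp.csv ended up empty, which would fit a large download silently failing. A small guard along these lines (a sketch; the read_header_or_fail helper is made up for illustration, only the temp.csv name comes from the question) at least turns the crash into a readable message:

import csv

def read_header_or_fail(path="temp.csv"):
    # Return the header row, or None if the CSV is empty (instead of raising StopIteration).
    with open(path, 'r') as readFile:
        reader = csv.reader(readFile)
        fields = next(reader, None)
        if fields is None:
            print(f"{path} is empty - the download probably failed or returned no data")
        return fields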
See: download large file in python with requests
def getproject(url, pat):
    try:
        # NOTE the stream=True parameter below
        with requests.get(url, auth=HTTPBasicAuth(username='', password=pat), stream=True) as r:
            r.raise_for_status()
            with open('tmp.csv', 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    # If you have a chunk-encoded response, uncomment the if
                    # and set the chunk_size parameter to None.
                    # if chunk:
                    f.write(chunk)
    except requests.exceptions.ConnectionError as c_error:
        print(f"[-] Connection Error: {c_error}")
    except requests.exceptions.Timeout as t_error:
        print(f"[-] Connection Timeout Error: {t_error}")
    except requests.exceptions.RequestException as req_error:
        print(f"[-] Some Ambiguous Exception: {req_error}")
# This way seems faster, based upon the comments of the link I shared
import requests
import shutil

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
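A small caveat with the shutil.copyfileobj(r.raw, f) variant: r.raw is the undecoded socket stream, so if the server sends the body gzip- or deflate-compressed you end up with the compressed bytes on disk, whereas iter_content decodes them for you. Usage is simply local_filename = download_file(url).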
Related
I need to download the monthly Open Library data dump files, which are big files:
https://openlibrary.org/data/ol_dump_authors_latest.txt.gz
https://openlibrary.org/data/ol_dump_works_latest.txt.gz
https://openlibrary.org/data/ol_dump_editions_latest.txt.gz
It hangs while downloading the works and editions files because they are big; the problem is that I don't get any exception saying the connection failed. It just stops downloading, and I know that because the file size won't change for hours.
First Try
dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"

session = requests.Session()
with session.get(dump_url, stream=True) as r:
    r.raise_for_status()
    with open(dump_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024*1024):
            f.write(chunk)
Second Try
dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"

session = requests.Session()
with session.get(dump_url, stream=True) as r:
    r.raise_for_status()
    with open(dump_path, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
This answer's core content was written by #Edin.A and is taken from a GitHub ticket they wrote, with their permission. Formatting and prose have been slightly modified, but other than reducing log verbosity, the code is unchanged.
This can be solved by passing requests a timeout= argument and making a new request after a ConnectionError caused by that timeout. Note the max_d_r_c counter used to prevent an endless loop of retries:
import requests
from requests.exceptions import ConnectionError
import os

def resume_download_ol_dump_editions_latest(dump_url, dump_path, max_d_r_c):
    max_download_resumes = 30
    if max_d_r_c < max_download_resumes:
        max_d_r_c += 1
        with open(dump_path, 'ab') as f:
            position = f.tell()
        pos_header = {"Range": f"bytes={position}-"}
        with requests.Session() as s:
            try:
                with s.get(dump_url, headers=pos_header, stream=True, allow_redirects=True, timeout=300) as r:
                    r.raise_for_status()
                    with open(dump_path, 'ab') as f:
                        for chunk in r.iter_content(chunk_size=1024*1024):
                            f.write(chunk)
                            f.flush()
                            os.fsync(f.fileno())
            except ConnectionError as to:
                resume_download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path, max_d_r_c=max_d_r_c)

def download_ol_dump_editions_latest(dump_url, dump_path):
    max_download_resumes_count = 0
    with requests.Session() as s:
        try:
            with s.get(dump_url, stream=True, allow_redirects=True, timeout=300) as r:
                r.raise_for_status()
                with open(dump_path, 'wb') as f:
                    last_file_size = None
                    for chunk in r.iter_content(chunk_size=1024*1024):
                        f.write(chunk)
                        f.flush()
                        os.fsync(f.fileno())
        except ConnectionError as to:
            resume_download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path, max_d_r_c=max_download_resumes_count)

dump_url = "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz"
dump_path = "temp_file/ol_dump_editions_latest.txt.gz"

download_ol_dump_editions_latest(dump_url=dump_url, dump_path=dump_path)
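A note on the design: opening dump_path in 'ab' mode and calling f.tell() yields the number of bytes already on disk, which is exactly the offset to send in the Range: bytes=<position>- header, and because the file is then written in append mode the resumed bytes land right after what was already saved. This only works when the server answers the range request with 206 Partial Content; a server that ignores Range would resend the whole file and the append would duplicate data.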
How do I request and export an HTTP file for reading if the file is not present in my directory?
My code:
def data():
    try:
        with open('sample.json', 'r') as openfile:
            json_object = json.load(openfile)
    except FileNotFoundError as e:
        print(e)
    else:
        print('Downloading NOW...')
        url = 'https://margincalculator.angelbroking.com/OpenAPI_File/files/OpenAPIScripMaster.json'
        d = requests.get(url).json()
        with open("sample.json", "w") as outfile:
            json.dump(d, outfile)
        print('sym downloaded')
    finally:
        with open('sample.json', 'r') as openfile:
            json_object = json.load(openfile)
        print(json_object)
What am I trying to do?
Step 1: Try to read the file from the directory.
Step 2: If the file is not found in the directory, then get it from the URL and export it.
Step 3: Then read it again.
Step 4: If there is still an error, then print('Error in code Please Check'); else print the file that was read.
Thank you for taking the time to respond to my question.
Use isfile(<file>) instead; it is a better option in this case. isfile('sample.json') checks whether the file exists or not.
Code:
import json
import requests
from os.path import isfile

def data():
    file = 'sample.json'
    if isfile(file):
        with open(file, 'r') as openfile:
            json_object = json.load(openfile)
    else:
        print('Downloading NOW...')
        url = 'https://margincalculator.angelbroking.com/OpenAPI_File/files/OpenAPIScripMaster.json'
        d = requests.get(url).json()
        with open("sample.json", "w") as outfile:
            json.dump(d, outfile)
        print('sym downloaded')
        with open('sample.json', 'r') as openfile:
            json_object = json.load(openfile)
    print(json_object)
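As a usage note (hedged, not part of the original answer): you simply call data(), and if you prefer pathlib the same existence check is available there:

from pathlib import Path

if Path('sample.json').is_file():  # equivalent to isfile('sample.json')
    print('cached copy found')

data()  # reads the cached file, or downloads it first and then reads it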
I am trying to define a function that resumes a download if the connection is broken. However, the following does not work as expected.
In line 8 of the code (position = f.tell()-1024), I have to manually deduct one chunk size in order for it to work; otherwise, the final file will be missing exactly one chunk size for each time I resume it.
if os.path.exists(fileName):
    header = requests.head(url)
    fileLength = int(header.headers['Content-Length'])
    if fileLength == os.path.getsize(fileName):
        return True
    else:
        with open(fileName, 'ab') as f:
            position = f.tell()-1024
            pos_header = {}
            print(position)
            pos_header['Range'] = f'bytes={position}-'
        with requests.get(url, headers=pos_header, stream=True) as r:
            with open(fileName, 'ab') as f:
                # some validation should be here
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(r.content)
                        f.flush()
                        print(os.path.getsize(fileName))
else:
    with requests.get(url, allow_redirects=True, stream=True) as r:
        with open(fileName, 'wb') as f:
            iter = 0
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
                    iter += 1
                    if iter > 2000:
                        break
Interestingly, the part that goes missing is the boundary between the two parts of the download. Is there a more elegant way of resolving this than what I did?
You have a bug in the code that downloads the 'rest' of the file on the second attempt. The bug is in the following line:
f.write(r.content)
It should be:
f.write(chunk)
Basically, you're iterating over chunks but writing the entire response content on every iteration, and that messes things up.
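For illustration only (a sketch, not the asker's exact code): once you write chunk instead of r.content, the resume branch can also drop the manual -1024 adjustment, because the bytes already on disk tell you exactly where to restart. The variable names url and fileName come from the question; everything else here is an assumption.

import os
import requests

def resume_download(url, fileName, chunk_size=1024):
    # Sketch: resume from the bytes already on disk; assumes the server honors Range requests.
    position = os.path.getsize(fileName) if os.path.exists(fileName) else 0
    headers = {'Range': f'bytes={position}-'} if position else {}
    mode = 'ab' if position else 'wb'
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(fileName, mode) as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)  # write the chunk, not r.content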
When I open the sdata.csv file it will not iterate; no error is shown, it simply does not print. Why could this be? I even did print(g) and it shows it is reading properly. I am also trying to write data to the same file, and the same blank file occurs with only the heading in it.
import urllib.request as request
import json
from urllib.request import urlopen, Request
import requests
import demjson
import csv
import time

req = Request('https://api.gameslabs.net/1.0.0/exchange', headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req) as response:
    if response.getcode() == 200:
        source = response.read()
        data = json.loads(source)
    else:
        print('An error occurred while attempting to retrieve data from the API.')

y = json.dumps(data)
x = json.loads(y)

f = csv.writer(open("item-com.csv", "w+"))
# Write CSV Header, If you dont need that, remove this line
f.writerow(["symbol", "buy_game", "buy_item", "buy_name", "sell_game", "sell_item", "sell_name"])
for x in x:
    f.writerow([x["symbol"],
                x["buy"]["game"],
                x["buy"]["item"],
                x["buy"]["name"],
                x["sell"]["game"],
                x["sell"]["item"],
                x["sell"]["name"]])

o = csv.DictReader(open("item-com.csv"))
for row in o:
    print(row['buy_name'])

req2 = Request('https://api.gameslabs.net/1.0.0/exchange/symbol/MS_IRON_PICKAXE/candles?timeFrame=day',
               headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req2) as response:
    if response.getcode() == 200:
        source2 = response.read()
        data2 = json.loads(source2)
    else:
        print('An error occurred while attempting to retrieve data from the API.')

xdum = json.dumps(data2)
bdum = json.loads(xdum)

ged = csv.writer(open("sdata.csv", "w+"))
ged.writerow(["timestamp", "low", "open", "close", "high", "volume"])
for bdum in bdum:
    ged.writerow([bdum["timestamp"],
                  bdum["low"],
                  bdum["open"],
                  bdum["close"],
                  bdum["high"]])

g = csv.DictReader(open("sdata.csv"))
for row in g:
    print(row['timestamp'])
You are writing to and reading from the same files, but you don't ensure the files are closed in between. If you use a context manager, it will take care of that for you; I notice you are already using context managers for the URL responses.
I've modified your code slightly to use context managers for file management:
...

with open("item-com.csv", "w+") as csv_file:
    f = csv.writer(csv_file)
    # Write CSV Header, If you dont need that, remove this line
    f.writerow(["symbol", "buy_game", "buy_item", "buy_name", "sell_game", "sell_item", "sell_name"])
    for x in x:
        f.writerow([x["symbol"],
                    x["buy"]["game"],
                    x["buy"]["item"],
                    x["buy"]["name"],
                    x["sell"]["game"],
                    x["sell"]["item"],
                    x["sell"]["name"]])

with open("item-com.csv") as csv_file:
    o = csv.DictReader(csv_file)
    for row in o:
        print(row['buy_name'])

req2 = Request('https://api.gameslabs.net/1.0.0/exchange/symbol/MS_IRON_PICKAXE/candles?timeFrame=day',
               headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req2) as response:
    if response.getcode() == 200:
        source2 = response.read()
        data2 = json.loads(source2)
    else:
        print('An error occurred while attempting to retrieve data from the API.')

xdum = json.dumps(data2)
bdum = json.loads(xdum)

with open("sdata.csv", "w+") as csv_file:
    ged = csv.writer(csv_file)
    ged.writerow(["timestamp", "low", "open", "close", "high", "volume"])
    for bdum in bdum:
        ged.writerow([bdum["timestamp"],
                      bdum["low"],
                      bdum["open"],
                      bdum["close"],
                      bdum["high"]])

with open("sdata.csv") as csv_file:
    g = csv.DictReader(csv_file)
    for row in g:
        print(row['timestamp'])
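Why this fixes the "blank file" symptom: rows written through csv.writer can sit in the file object's buffer until the file is closed or flushed, so a reader that reopens the same path right away may see only part of what was written, in your case little more than the header row. Exiting each with block closes the writer's file and flushes its buffer before the reader opens it.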
Instead of writing line by line to the text file, try this approach. It reduces repetitive I/O and doesn't have to keep the file open for a long time.
import pandas as pd

lst = []
for x in x:
    tmpTuple = ([x["symbol"],
                 x["buy"]["game"],
                 x["buy"]["item"],
                 x["buy"]["name"],
                 x["sell"]["game"],
                 x["sell"]["item"],
                 x["sell"]["name"]])
    lst.append(tmpTuple)

# outside the loop, create a pandas dataframe
df = pd.DataFrame(lst)

# there are several options to save
df.to_csv('filename.csv')
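A hedged refinement of the snippet above (the column names are copied from the header row the original code wrote; index=False simply keeps pandas from adding an extra index column):

df = pd.DataFrame(lst, columns=["symbol", "buy_game", "buy_item", "buy_name",
                                "sell_game", "sell_item", "sell_name"])
df.to_csv('filename.csv', index=False)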
I have a Python 3 script that downloads images from the web, but it stops after a few hours and I can't figure out why.
I have URLs in a .csv file, and the script saves each image under the name provided in column 1 of that CSV file.
I already tried switching off the print(url), because I thought it might be consuming too much memory at some point, but that didn't do the trick.
This is my script:
import csv
import requests

print('download.py map 3_new')

with open('3.csv') as csvfile:
    csvrows = csv.reader(csvfile, delimiter=';', quotechar='"')
    for row in csvrows:
        filename = row[0]
        url = row[1]
        #print (url)
        result = requests.get(url, stream=True)
        if result.status_code == 200:
            image = result.raw.read()
            open(filename, "wb").write(image)
There is a good chance that it's caused by not closing the files after saving the images to your hard drive. Try doing it this way:
import csv
import requests

print('download.py map 3_new')

with open('3.csv') as csvfile:
    csvrows = csv.reader(csvfile, delimiter=';', quotechar='"')
    for row in csvrows:
        filename = row[0]
        url = row[1]
        #print (url)
        result = requests.get(url, stream=True)
        if result.status_code == 200:
            image = result.raw.read()
            with open(filename, "wb") as f:
                f.write(image)
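A further hedged variant, not part of the original answer: since stream=True is already being passed, each image can be written in chunks so the whole file never has to sit in memory, and a with block closes the response as well:

import csv
import requests

with open('3.csv') as csvfile:
    for row in csv.reader(csvfile, delimiter=';', quotechar='"'):
        filename, url = row[0], row[1]
        with requests.get(url, stream=True) as result:  # response is closed automatically
            if result.status_code == 200:
                with open(filename, "wb") as f:
                    for chunk in result.iter_content(chunk_size=8192):
                        f.write(chunk)  # stream straight to disk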