Python script stops after a few hours

I have a Python 3 script that downloads images from the web, but it stops after a few hours and I can't figure out why.
The URLs are in a .csv file, and the script saves each image under the name given in column 1 of that file.
I already tried commenting out the print(url), because I thought that it might be consuming too much memory at some point, but that didn't do the trick.
This is my script:
import csv
import requests

print('download.py map 3_new')
with open('3.csv') as csvfile:
    csvrows = csv.reader(csvfile, delimiter=';', quotechar='"')
    for row in csvrows:
        filename = row[0]
        url = row[1]
        #print(url)
        result = requests.get(url, stream=True)
        if result.status_code == 200:
            image = result.raw.read()
            open(filename, "wb").write(image)

There is a good chance that it's caused by not closing the files after saving the images to your hard drive. Try it this way:
import csv
import requests

print('download.py map 3_new')
with open('3.csv') as csvfile:
    csvrows = csv.reader(csvfile, delimiter=';', quotechar='"')
    for row in csvrows:
        filename = row[0]
        url = row[1]
        #print(url)
        result = requests.get(url, stream=True)
        if result.status_code == 200:
            image = result.raw.read()
            with open(filename, "wb") as f:
                f.write(image)
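If that alone doesn't stop the leak, note that the requests Response objects also hold open connections; they support the context-manager protocol, so a variant of the same loop (a sketch along the same lines, not the original answer's code) can close both the response and the file:

import csv
import requests

with open('3.csv') as csvfile:
    csvrows = csv.reader(csvfile, delimiter=';', quotechar='"')
    for row in csvrows:
        filename = row[0]
        url = row[1]
        # Closing the response returns its connection to the pool and
        # frees its buffers, even if an exception interrupts the download.
        with requests.get(url, stream=True) as result:
            if result.status_code == 200:
                with open(filename, "wb") as f:
                    f.write(result.raw.read())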


python script failing to read csv file with error - StopIteration

I am working on a script that downloads a large audit-log CSV file from Azure DevOps and filters the data according to a given condition. This works for a small CSV file, but for a file with a lot of data it fails with:
fields = next(reader)
StopIteration
Can someone help with the changes required in the script? I am using Python 3.7.9 on macOS.
import argparse
import csv
import datetime
import json
import os
from itertools import zip_longest

import pandas as pd
import requests
from requests.auth import HTTPBasicAuth

def getproject(url, pat):
    response = requests.get(url, auth=HTTPBasicAuth(username='', password=pat))
    if response.status_code == 200:
        url_data = response.content
        tempfile = open("temp.csv", "wb")
        tempfile.write(url_data)
        tempfile.close()
        return url_data
    else:
        print("\nERROR : Unable to connect to the server...")

def FilterData():
    lists = []
    pro_name = []
    RepoId = []
    RepoName = []
    new_file = open("temp_new.csv", 'w', newline='')
    writer = csv.writer(new_file)
    with open("temp.csv", 'r') as readFile:
        reader = csv.reader(readFile)
        fields = next(reader)
        lists.append(fields)
        for row in reader:
            for field in row:
                if field == "Git.RepositoryCreated":
                    lists.append(row)
        writer.writerows(lists)
    new_file.close()
    os.remove("temp.csv")
    timestamp = datetime.datetime.now()
    timestamp = timestamp.strftime("%d%B%Y_%H%M%S")
    file_name = "Data2_" + str(timestamp) + ".csv"
    file1 = open("temp_new.csv", 'r')
    df = pd.read_csv(file1)
    for i in df["Data"]:
        res = json.loads(i)
        pro_name.append(res['ProjectName'])
        RepoId.append(res['RepoId'])
        RepoName.append(res['RepoName'])
    Disp_Name = df["ActorDisplayName"]
    ActionId = df["ActionId"]
    TimeStamp = df["Timestamp"]
    file1.close()
    os.remove("temp_new.csv")
    Header = ["Actor Display Name", "Project Name", "RepoName", "RepoId", "ActionId", "Timestamp"]
    d = [Disp_Name, pro_name, RepoName, RepoId, ActionId, TimeStamp]
    export_data = zip_longest(*d, fillvalue='')
    with open(file_name, 'w', newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(Header)
        wr.writerows(export_data)

if __name__ == '__main__':
    parser = argparse.ArgumentParser("This is used for getting list of the projects")
    parser.add_argument("-o", dest="org", help="org name")
    parser.add_argument("-p", dest="pat", help="pat value")
    parser.add_argument("-sd", dest="sdate", help="Start Date")
    parser.add_argument("-ed", dest="edate", help="End Date")
    args = parser.parse_args()
    org = args.org
    token = args.pat
    startdate = args.sdate
    enddate = args.edate
    url = ("https://auditservice.dev.azure.com/{org_name}/_apis/audit/downloadlog"
           "?format=csv&startTime={startdt}&endTime={enddt}&api-version=6.1-preview.1"
           ).format(org_name=org, startdt=startdate, enddt=enddate)
    # call "getproject" function to check url and token to further create required csv
    getproject(url, token)
    FilterData()
[+] In your getproject function, you should use a try/except block to handle HTTP errors etc.
[+] If the CSV file you're trying to download is quite large, it may be best to write the data in chunks.
As for the fields = next(reader) StopIteration error, I'm not sure. ¯\_(ツ)_/¯
Try throwing your code in the debugger and stepping through it.
See: download large file in python with requests
def getproject(url, pat):
    try:
        # NOTE the stream=True parameter below
        with requests.get(url, auth=HTTPBasicAuth(username='', password=pat), stream=True) as r:
            r.raise_for_status()
            with open('tmp.csv', 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    # If you have a chunk-encoded response, uncomment the if
                    # and set the chunk_size parameter to None.
                    #if chunk:
                    f.write(chunk)
    except requests.exceptions.ConnectionError as c_error:
        print(f"[-] Connection Error: {c_error}")
    except requests.exceptions.Timeout as t_error:
        print(f"[-] Connection Timeout Error: {t_error}")
    except requests.exceptions.RequestException as req_error:
        print(f"[-] Some Ambiguous Exception: {req_error}")
# This way seems faster based upon the comments of the link I shared
import requests
import shutil

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
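One thing worth checking on the StopIteration itself: next(reader) raises StopIteration whenever the file it reads is empty, which can happen here if the download step wrote an error page or nothing at all to temp.csv. A small defensive guard, just a sketch against that possibility:

import csv

with open("temp.csv", "r") as readFile:
    reader = csv.reader(readFile)
    try:
        fields = next(reader)  # header row
    except StopIteration:
        # next() raises StopIteration on an exhausted iterator,
        # i.e. temp.csv came back empty.
        raise SystemExit("temp.csv is empty; check the download step")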

Iterating over URLs loaded from CSV in Python

Please help me.
I have URLs in a CSV file with 100 rows and 1 column, and I want to load and process line 1 through line 100 of that file using Python.
However, the loop only works once: it stops at one of the rows instead of reaching the end of the URL list, and does not continue to the next URL. It fails with:
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
AttributeError: 'NoneType' object has no attribute 'text'
How do I keep going when this error occurs because the element is not found in the HTML?
The following is the Python code I use; please help me get the scraping loop to run to the end of the URL list.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

with open('Url Torch.csv', 'rt') as f:
    data = csv.reader(f, delimiter=',')
    for row in data:
        URL_GO = row[2]

def variable_Scrape(url):
    try:
        cookies = dict(cookie="............")
        request = requests.get(url, cookies=cookies)
        html = BeautifulSoup(request.content, 'html.parser')
        title = html.find('div', class_='title').text.strip().strip('\n')
        desc = html.find('div', class_='content').text
        link = html.find_all('img', class_='lazyload slide-item owl-lazy')
        normal_price = html.find('div', class_='amount public').text.strip().strip('\n')
        disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
    except AttributeError as e:
        print(e)
        #ConnectionAbortedError
        return False
    else:
        print(title)
        #print(desc)
        #print(link)
    finally:
        print(title)
        print(desc)
        print(link)
        print('Finally.....')

variable_Scrape(URL_GO)
It's hard to give you the exact answer without seeing your CSV file, but try this:
import csv

f = open('you_file.csv')
csv_f = csv.reader(f)
for row in csv_f:
    print(row[0])
This is the code:
import csv

data = []  # create an empty list to store rows in
with open('emails.csv') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        data.append(row)  # add each row to the list
Based on your comments about skipping a loop iteration when the URL is not OK:
for url in data:  # data is the list where the urls are stored
    try:
        # do your code here (requests, beautifulsoup):
        r = requests.get(url)
    except:
        pass  # will go to the next loop (next url) if an error happens
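Putting the pieces together for this question, a minimal sketch (assuming the variable_Scrape function from the question, and that the URL sits in the third column as in the original code) calls the scraper inside the reading loop and skips any row that fails:

import csv

with open('Url Torch.csv', 'rt') as f:
    data = csv.reader(f, delimiter=',')
    for row in data:
        url = row[2]  # URL column, as in the question's code
        try:
            variable_Scrape(url)  # the question's scraping function
        except Exception as e:
            print(f"skipping {url}: {e}")
            continue  # move on to the next URL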

Iterate over files in a directory with Python - code not working

The following code works on one specified file:
sec_urls = []
filename = './index/2000Q1.csv'
with open(filename, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for line in reader:
        url = line[4].strip()
        sec_urls.append(url)
        print(url, 'downloaded and added to list')
I would like to perform this on several files contained in the directory INDEX_DIR; however, I am not getting any results with the following code:
sec_urls = []
for filename in os.listdir(INDEX_DIR):
    if filename.endswith('.csv'):
        with open(filename, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for line in reader:
                url = line[4].strip()
                sec_urls.append(url)
                print(url, 'downloaded and added to list')
I am not receiving any error messages; I am just not getting any output.
Any help would be greatly appreciated (I am an absolute beginner).
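One common pitfall with this pattern: os.listdir returns bare file names without their directory, so open(filename) looks in the current working directory rather than in INDEX_DIR. A sketch of the usual fix, assuming INDEX_DIR is defined as in the question:

import csv
import os

sec_urls = []
for filename in os.listdir(INDEX_DIR):
    if filename.endswith('.csv'):
        # os.listdir yields bare names, so join the directory back on.
        path = os.path.join(INDEX_DIR, filename)
        with open(path, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for line in reader:
                url = line[4].strip()
                sec_urls.append(url)
                print(url, 'downloaded and added to list')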

Combine two python scripts for web search

I'm trying to download files from a site and, due to search result limitations (max 300), I need to search for each item individually. I have a CSV file with the complete list, and I've written some basic code to return the ID# column.
With some help, I've got another script that iterates through each search result and downloads a file. What I need to do now is combine the two so that it searches for each individual ID# and downloads the file.
I know my loop is messed up here; I just can't figure out where, or whether I'm even looping in the right order.
import requests, json, csv

faciltiyList = []
with open('Facility List.csv', 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for searchterm in csv_reader:
        faciltiyList.append(searchterm[0])
        url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
        r = requests.get(url + "?term=" + str(searchterm))
        searchresults = json.loads(r.content.decode('utf-8'))
        for report in searchresults:
            rpt_id = report['RPT_ID']
            reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
            r = requests.get(reporturl)
            a = r.headers['Content-Disposition']
            filename = a[a.find("filename=")+9:len(a)]
            file = open(filename, "wb")
            file.write(r.content)
            r.close()
The original code I have is here:
import requests, json

searchterm = "ALAMEDA (COUNTY)"
url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
r = requests.get(url + "?term=" + searchterm)
searchresults = json.loads(r.content.decode('utf-8'))
for report in searchresults:
    rpt_id = report['RPT_ID']
    reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
    r = requests.get(reporturl)
    a = r.headers['Content-Disposition']
    filename = a[a.find("filename=")+9:len(a)]
    file = open(filename, "wb")
    file.write(r.content)
    r.close()
The search term "ALAMEDA (COUNTY)" returns more than 300 results, so I'm trying to replace it with a list that runs through each name (ID# in this case), so that each search returns just one result, and then runs again for the next item on the list.
CSV - just 1 line
Tested with a CSV file with just 1 line:
406014324,"HOLISTIC PALLIATIVE CARE, INC.",550004188,Parent Facility,5707 REDWOOD RD,OAKLAND,94619,1,ALAMEDA,Not Applicable,,Open,1/1/2018,Home Health Agency/Hospice,Hospice,37.79996,-122.17075
Python code
This script reads the IDs from the CSV file, fetches the results from the URL, and finally writes the desired contents to disk.
import requests, json, csv

# read IDs from csv
facilityIds = []
with open('Facility List.csv', 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for searchterm in csv_reader:
        facilityIds.append(searchterm[0])

# fetch and write file contents
url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
for facilityId in facilityIds:
    r = requests.get(url + "?term=" + str(facilityId))
    reports = json.loads(r.content.decode('utf-8'))
    # print(f"reports = {reports}")
    for report in reports:
        rpt_id = report['RPT_ID']
        reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
        r = requests.get(reporturl)
        a = r.headers['Content-Disposition']
        filename = a[a.find("filename=")+9:len(a)]
        # print(f"filename = {filename}")
        with open(filename, "wb") as o:
            o.write(r.content)
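A small optional refinement, not part of the answer above: this loop makes two HTTP requests per facility, so a requests.Session can reuse the underlying connections. A sketch of the change:

import requests

session = requests.Session()  # reuses TCP connections across requests to the same host

url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
for facilityId in facilityIds:  # facilityIds read from the CSV as above
    r = session.get(url + "?term=" + str(facilityId))
    # ... same report-fetching body as above, using session.get(reporturl)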

Editing a downloaded CSV in memory before writing

Forewarning: I am very new to Python and programming in general. I am trying to use Python 3 to get some CSV data and make some changes to it before writing it to a file. My problem lies in accessing the CSV data from a variable, like so:
import csv
import requests

csvfile = session.get(url)
reader = csv.reader(csvfile.content)
for row in reader:
    do(something)
This returns:
_csv.Error: iterator should return strings, not int (did you open the file in text mode?)
Googling revealed that I should be feeding the reader text instead of bytes, so I also attempted:
reader = csv.reader(csvfile.text)
This also does not work as the loop works through it letter by letter instead of line by line. I also experimented with TextIOWrapper and similar options with no success. The only way I have managed to get this to work is by writing the data to a file, reading it, and then making changes, like so:
csvfile = session.get(url)
with open("temp.txt", 'wb') as f:
    f.write(csvfile.content)

with open("temp.txt", 'rU', encoding="utf8") as data:
    reader = csv.reader(data)
    for row in reader:
        do(something)
I feel like this is far from the most optimal way of doing this, even if it works. What is the proper way to read and edit the CSV data directly from memory, without having to save it to a temporary file?
You don't have to write to a temp file. Here is what I would do, using the csv and requests modules:
import csv
import requests

__csvfilepathname__ = r'c:\test\test.csv'
__url__ = 'https://server.domain.com/test.csv'

def csv_reader(filename, enc='utf_8'):
    with open(filename, 'r', encoding=enc) as openfileobject:
        reader = csv.reader(openfileobject)
        for row in reader:
            # do something
            print(row)
    return

def csv_from_url(url):
    line = ''
    datalist = []
    s = requests.Session()
    r = s.get(url)
    for x in r.text.replace('\r', ''):
        if not x[0] == '\n':
            line = line + str(x[0])
        else:
            datalist.append(line)
            line = ''
    datalist.append(line)
    # at this point you already have a data list 'datalist'
    # no need really to use the csv.reader object, but here goes:
    reader = csv.reader(datalist)
    for row in reader:
        # do something
        print(row)
    return

def main():
    csv_reader(__csvfilepathname__)
    csv_from_url(__url__)
    return

if __name__ == '__main__':
    main()
Not very pretty, and probably not very good in regards to memory/performance, depending on how big your CSV data is.
HTH, Edwin.
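For comparison, csv.reader accepts any iterable of strings, so the response text can also be handed to it directly, without the character-by-character splitting above; a more compact sketch using the same placeholder URL:

import csv
import requests

r = requests.get('https://server.domain.com/test.csv')
# csv.reader accepts any iterable of strings, so the response text
# can be split into lines and parsed without touching the disk.
reader = csv.reader(r.text.splitlines())
for row in reader:
    print(row)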
