Wondering if there is a better way to update files? - python

I currently have a Python program that is both a web scraper and a file writer; it updates databases on my desktop using the Windows 10 Task Scheduler. The problem is that, for some reason, Task Scheduler doesn't run the Python files at the specified time 100% of the time. I was wondering if there is a better approach to make sure the files get updated at their specified times, as long as the computer is on.
I've tried changing the Task Scheduler settings, but I still have this problem.
import requests
from bs4 import BeautifulSoup
from datetime import datetime

#Updates Everyday.
#Fantasy5-WebScraper
response = requests.get('https://www.lotteryusa.com/michigan/fantasy-5/')
soup = BeautifulSoup(response.text, 'html.parser')
date = soup.find(class_='date')
results = soup.find(class_='draw-result list-unstyled list-inline')
d = datetime.strptime(date.time['datetime'], '%Y-%m-%d')
Fantasy5 = (d.strftime("%Y-%m-%d")+(',')+results.get_text().strip().replace('\n',','))
print(Fantasy5)

#Writing to DataBase
with open("Filename.txt", "r") as f:
    data = f.read()
with open("Filename.txt", "w") as f:
    f.write('{}{}{}'.format(Fantasy5, '\n' if data else '', data))
    f.close()

#Writing to DataFrame
with open("Filename.txt", "r") as f:
    data = f.read()
with open("Filename.txt", "w") as f:
    f.write('{}{}{}'.format(Fantasy5, '\n' if data else '', data))
    f.close()

You can use the schedule module to do this task, then add the Python file to startup so it gets executed every time you start the computer.
This program will do the job every day at 06:00.
import schedule
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def job(t):
    response = requests.get('https://www.lotteryusa.com/michigan/fantasy-5/')
    soup = BeautifulSoup(response.text, 'html.parser')
    date = soup.find(class_='date')
    results = soup.find(class_='draw-result list-unstyled list-inline')
    d = datetime.strptime(date.time['datetime'], '%Y-%m-%d')
    Fantasy5 = (d.strftime("%Y-%m-%d")+(',')+results.get_text().strip().replace('\n',','))
    print(Fantasy5)

    #Writing to DataBase
    with open("Filename.txt", "r") as f:
        data = f.read()
    with open("Filename.txt", "w") as f:
        f.write('{}{}{}'.format(Fantasy5, '\n' if data else '', data))
        f.close()

    #Writing to DataFrame
    with open("Filename.txt", "r") as f:
        data = f.read()
    with open("Filename.txt", "w") as f:
        f.write('{}{}{}'.format(Fantasy5, '\n' if data else '', data))
        f.close()
    return

schedule.every().day.at("06:00").do(job, 'It is 06:00')

while True:
    schedule.run_pending()
    time.sleep(60)
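One caveat with this approach: if the request or the parsing raises an exception (for example when the site is temporarily unreachable), the while loop will crash and nothing runs again until the next reboot. Below is a minimal sketch of a more defensive wrapper that could be registered in place of job (the scraper_errors.log filename is just an example):

import traceback

def safe_job(t):
    # Never let a single failed run kill the scheduler loop.
    try:
        job(t)
    except Exception:
        # Record the error for later inspection; the loop keeps running.
        with open("scraper_errors.log", "a") as log:
            log.write(traceback.format_exc() + "\n")

schedule.every().day.at("06:00").do(safe_job, 'It is 06:00')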

Related

Saving Multiple files in Python

I am trying to create a new file each time the following runs. At the moment it creates one file and just overwrites it. Is there a way to make it not overwrite and instead create a new file for each loop?
import xml.etree.ElementTree as ET
import time
import csv

with open('OrderCSV.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        orders_data = ET.Element('orders_data')
        orders = ET.SubElement(orders_data, 'orders')

        ##Order Details
        order_reference = ET.SubElement(orders, 'order reference')
        order_reference.set('', "12345")
        order_date = ET.SubElement(order_reference, 'order_date')
        order_priority = ET.SubElement(order_reference, 'order_priority')
        order_category = ET.SubElement(order_reference, 'order_category')
        delivery_service = ET.SubElement(order_reference, 'delivery_service')
        delivery_service.text = row['delivery_service']

        timestr = time.strftime("%Y%m%d%H%M%S")
        mydata = ET.tostring(orders_data)
        myfile = open(timestr, "wb")
        myfile.write(mydata)
You could check whether the file already exists and wait a bit (note that this needs import os):

while True:
    timestr = time.strftime("%Y%m%d%H%M%S")
    if not os.path.exists(timestr):
        break
    time.sleep(.1)

with open(timestr, "wb") as myfile:
    mydata = ET.tostring(orders_data)
    myfile.write(mydata)
Instead of waiting you could just add a second to the candidate timestamp. This will cause the file names to drift forward in time if you process a lot of rows per second.

mytime = time.time()
while True:
    timestr = time.strftime("%Y%m%d%H%M%S", time.localtime(mytime))
    if not os.path.exists(timestr):
        break
    mytime += 1

with open(timestr, "wb") as myfile:
    mydata = ET.tostring(orders_data)
    myfile.write(mydata)
Another option is to get a single timestamp before the loop and append the row index to it as you go.

mytime = time.strftime("%Y%m%d%H%M%S")
for index, row in enumerate(reader):
    ....
    timestr = f"{mytime}-{index}"
    ....
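Putting that together, a minimal sketch of the last option (reusing the orders_data element built in the question's loop; the .xml extension is just a suggestion) could look like this:

mytime = time.strftime("%Y%m%d%H%M%S")
for index, row in enumerate(reader):
    # ... build orders_data for this row exactly as in the question ...
    mydata = ET.tostring(orders_data)
    # one timestamp per run plus one index per row, so names never collide
    with open(f"{mytime}-{index}.xml", "wb") as myfile:
        myfile.write(mydata)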
Change the variable name each time you run the loop. I would also suggest using a with statement for opening the file, since you otherwise have to close it after you open it:

with open(timestr, 'wb') as myfile:
    myfile.write(mydata)

Edit: the only flaw I can see in your code is that the file is not closed after it is opened.

Combine two python scripts for web search

I'm trying to download files from a site and, due to search result limitations (max 300), I need to search each item individually. I have a CSV file with a complete list, and I've written some basic code to return the ID# column.
With some help, I've got another script that iterates through each search result and downloads a file. What I need to do now is combine the two so that it searches each individual ID# and downloads the file.
I know my loop is messed up here; I just can't figure out where, or whether I'm even looping in the right order.
import requests, json, csv

faciltiyList = []
with open('Facility List.csv', 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for searchterm in csv_reader:
        faciltiyList.append(searchterm[0])

url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
r = requests.get(url+"?term="+str(searchterm))
searchresults = json.loads(r.content.decode('utf-8'))
for report in searchresults:
    rpt_id = report['RPT_ID']
    reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
    r = requests.get(reporturl)
    a = r.headers['Content-Disposition']
    filename = a[a.find("filename=")+9:len(a)]
    file = open(filename, "wb")
    file.write(r.content)
    r.close()
The original code I have is here:
import requests, json

searchterm = "ALAMEDA (COUNTY)"
url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
r = requests.get(url+"?term="+searchterm)
searchresults = json.loads(r.content.decode('utf-8'))
for report in searchresults:
    rpt_id = report['RPT_ID']
    reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
    r = requests.get(reporturl)
    a = r.headers['Content-Disposition']
    filename = a[a.find("filename=")+9:len(a)]
    file = open(filename, "wb")
    file.write(r.content)
    r.close()
The searchterm = "ALAMEDA (COUNTY)" returns more than 300 results, so I'm trying to replace "ALAMEDA (COUNTY)" with a list that runs through each name (ID# in this case) so that I get just one result, then runs again for the next one on the list.
CSV - just 1 line
Tested with a CSV file with just 1 line:
406014324,"HOLISTIC PALLIATIVE CARE, INC.",550004188,Parent Facility,5707 REDWOOD RD,OAKLAND,94619,1,ALAMEDA,Not Applicable,,Open,1/1/2018,Home Health Agency/Hospice,Hospice,37.79996,-122.17075
Python code
This script reads the IDs from the CSV file. Then it fetches the results from the URL and finally writes the desired contents to disk.
import requests, json, csv

# read IDs from csv
facilityIds = []
with open('Facility List.csv', 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for searchterm in csv_reader:
        facilityIds.append(searchterm[0])

# fetch and write file contents
url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
for facilityId in facilityIds:
    r = requests.get(url+"?term="+str(facilityId))
    reports = json.loads(r.content.decode('utf-8'))
    # print(f"reports = {reports}")
    for report in reports:
        rpt_id = report['RPT_ID']
        reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
        r = requests.get(reporturl)
        a = r.headers['Content-Disposition']
        filename = a[a.find("filename=")+9:len(a)]
        # print(f"filename = {filename}")
        with open(filename, "wb") as o:
            o.write(r.content)
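If the server omits the Content-Disposition header for some report, r.headers['Content-Disposition'] raises a KeyError and the whole run stops. Here is a small defensive variant of the inner loop, offered as a sketch rather than part of the original answer:

for report in reports:
    rpt_id = report['RPT_ID']
    reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
    r = requests.get(reporturl)
    # skip responses that do not carry a downloadable file
    a = r.headers.get('Content-Disposition')
    if not a or "filename=" not in a:
        continue
    filename = a[a.find("filename=") + 9:]
    with open(filename, "wb") as o:
        o.write(r.content)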

Python script stops after a few hours

I have a Python 3 script that downloads images from the web, but it stops after a few hours and I can't figure out why.
I have URLs in a .csv file, and the script saves each image under the name provided in column 1 of the CSV file.
I already tried switching off the "print (url)", because I thought it might be using too much memory at some point, but that didn't do the trick.
This is my script:
import csv
import requests

print ('download.py map 3_new')

with open('3.csv') as csvfile:
    csvrows = csv.reader(csvfile, delimiter=';', quotechar='"')
    for row in csvrows:
        filename = row[0]
        url = row[1]
        #print (url)
        result = requests.get(url, stream = True)
        if result.status_code == 200:
            image = result.raw.read()
            open(filename,"wb").write(image)
There is a good chance that it's caused by not closing the files after saving images to your hard drive. Try to do it this way:
import csv
import requests

print ('download.py map 3_new')

with open('3.csv') as csvfile:
    csvrows = csv.reader(csvfile, delimiter=';', quotechar='"')
    for row in csvrows:
        filename = row[0]
        url = row[1]
        #print (url)
        result = requests.get(url, stream = True)
        if result.status_code == 200:
            image = result.raw.read()
            with open(filename,"wb") as f:
                f.write(image)
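Unclosed file handles are one likely culprit, but a script that runs for hours can also die on a single network error: requests.get() raises on connection failures and, without a timeout, can hang indefinitely. A sketch of a more defensive loop body (the 30-second timeout and the choice to skip failed rows are assumptions, not part of the original question):

try:
    # fail fast instead of hanging forever on a dead connection
    result = requests.get(url, stream=True, timeout=30)
except requests.RequestException as e:
    print(f"skipping {url}: {e}")
    continue  # move on to the next row instead of crashing the whole run
if result.status_code == 200:
    image = result.raw.read()
    with open(filename, "wb") as f:
        f.write(image)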

Having problems writing to text files. Text files being overwritten/cut

I would like to write new data to the beginning of my text file, with the previous data shifting down one line each time new data is imported. I would like everything to stay organized, but every time I import something, data gets deleted.
Code:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

response = requests.get('https://www.lotteryusa.com/michigan/lucky-4-life/')
soup = BeautifulSoup(response.text, 'html.parser')
date = soup.find(class_='date')
results = soup.find(class_='draw-result list-unstyled list-inline')
d = datetime.strptime(date.time['datetime'], '%Y-%m-%d')
Lucky = (d.strftime("%m%d%Y")+(',')+results.get_text()[:-20].strip().replace('\n',','))
print(Lucky)

with open("webscraper2noteppad++", "r+") as f:
    file = f.readlines()
    f.seek(0,0)
    f.write(Lucky)
I also tried doing this:

with open("webscraper2noteppad++", "r+") as f:
    file = f.read()
    f.seek(0,0)
    f.write(Lucky + '\n')

but then I have to put 10 lines between the already existing data and the new data so it can be imported on top without deleting anything.
You can first read the content of your file, then prepend the new data to it and write everything back to the file:

with open("webscraper2noteppad++", "r") as f:
    data = f.read()
with open("webscraper2noteppad++", "w") as f:
    f.write('{}{}{}'.format(Lucky, '\n' if data else '', data))
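One caveat: if the script is interrupted between reading and rewriting, the file can be left truncated. A safer sketch (the .tmp suffix is just an example) writes to a temporary file first and then atomically swaps it into place:

import os

with open("webscraper2noteppad++", "r") as f:
    data = f.read()

tmp_name = "webscraper2noteppad++.tmp"
with open(tmp_name, "w") as f:
    f.write('{}{}{}'.format(Lucky, '\n' if data else '', data))

# os.replace is atomic on both Windows and POSIX, so the original file
# is never left half-written
os.replace(tmp_name, "webscraper2noteppad++")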

Parse data with BeautifulSoup using threads

I have thousands of URLs in a text file, and I want to extract the title and price from each product link. I tried to implement threads to do it faster, but it doesn't seem to work correctly: it produces duplicate data and the script takes too long to run. Without threads, the script works as expected.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
import lxml
import threading

def runner(fname):
    global lck
    lck.acquire()
    with open(fname, 'r') as f:
        for line in f:
            r = requests.get(line)
            soup = BeautifulSoup(r.content, 'lxml')
            try:
                title = soup.find('h1', id='itemTitle').text.trim().encode('utf-8')
                price = soup.find('span', itemprop='price').text.trim().encode('utf-8')
            except:
                price = "No price"
            with open("Data.csv", 'a', newline='',) as file:
                writer = csv.writer(file)
                writer.writerow([title, price])
    lck.release()

lck = threading.Lock()
fname = "ProductLinks.txt"
threads = []
for i in range(0, 3):
    t = threading.Thread(target = runner, args = (fname, ))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
Can someone please guide me on how to do this correctly, so that it extracts and saves the data in parallel?
It is producing duplicate results because when you create the threads you call the same function three times.
t = threading.Thread(target = runner, args = (fname, ))
When you execute the above line, the argument is always fname, which as far as I understand is always "ProductLinks.txt". Therefore your program goes into runner, and there I see that you loop over all the lines of the text file.
I suspect that what you want to "parallelise" is exactly that loop over the text lines. In that case you would need to write a function parse_line and pass that into the threading environment.
I would also suggest storing the values in a dict and exporting to CSV at the end, because I am not sure whether the open environment is thread-safe.
def parse_line(line, result_dict):
    r = requests.get(line)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        # note: Python strings have no .trim(); .strip() is the equivalent
        title = soup.find('h1', id='itemTitle').text.strip().encode('utf-8')
        price = soup.find('span', itemprop='price').text.strip().encode('utf-8')
        result_dict[title] = price
    except:
        result_dict['No title'] = "No price"
Now, say that you have a list with all the lines in your file as strings. You can achieve that by doing the following
file_lines = []
with open(fname, 'r') as f:
    for line in f:
        file_lines.append(line)
Then you can call this function using threading over the list of all lines in your file (note that the threads list has to be initialised, and the threads joined before you export the results):

my_dict = {}
threads = []
for input_line in file_lines:
    t = threading.Thread(target = parse_line, args = (input_line, my_dict))
    threads.append(t)
    t.start()

for t in threads:
    t.join()
Finally, once all threads have finished, you can export your dict to CSV using pandas (pd.DataFrame needs rows rather than a plain scalar-valued dict, hence the list of items):

import pandas as pd
pd.DataFrame(list(my_dict.items()), columns=['title', 'price']).to_csv("Data.csv", index=False)
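As an alternative sketch (not what the original answer used), the standard library's concurrent.futures.ThreadPoolExecutor handles the thread bookkeeping and caps the number of concurrent requests; the pool size of 8 is an arbitrary choice:

import csv
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def parse_line(url):
    # Fetch one product page and return (title, price); scraping failures
    # come back as placeholder values instead of raising.
    try:
        r = requests.get(url, timeout=30)
        soup = BeautifulSoup(r.content, 'lxml')
        title = soup.find('h1', id='itemTitle').text.strip()
        price = soup.find('span', itemprop='price').text.strip()
        return title, price
    except Exception:
        return 'No title', 'No price'

with open("ProductLinks.txt") as f:
    urls = [line.strip() for line in f if line.strip()]

with ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(parse_line, urls))

# write all rows at once from the main thread, so no locking is needed
with open("Data.csv", "w", newline="") as f:
    csv.writer(f).writerows(results)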
