How do I solve this str error in datetime? - Python

The goal is to use datetime to iterate over
http://www.harness.org.au/racing/results/?firstDate=01-01-2019
http://www.harness.org.au/racing/results/?firstDate=02-01-2019 ... up to yesterday's date
(this should happen in new_url = base_url + str(enddate1)).
Then, once on that page, I want to loop over the meetingListFull table to get the name and href for each track, and then get the results data from each track that day.
My current error is '<=' not supported between instances of 'datetime.timedelta' and 'str', which comes from my while loop. Why is this? I have never used datetime before.
from datetime import datetime, date, timedelta
import requests
import re
from bs4 import BeautifulSoup

base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"

webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = BeautifulSoup(webpage_response.content, "html.parser")

format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
yesterday1 = yesterday.strftime(format)

enddate = datetime(2019, 1, 1)
enddate1 = enddate.strftime(format)

while enddate1 <= yesterday1:
    enddate1 =+ timedelta(days=1)
    new_url = base_url + str(enddate1)
    soup12 = requests.get(new_url)
    soup1 = BeautifulSoup(soup12.content, "html.parser")
    table1 = soup1.find('table', class_='meetingListFull')
    for tr in table1.find_all('tr'):
        all_cells = tr.find_all('td')
        track = all_cells.a.href.get_text()
        href = all_cells.get('href')
        trackresults = base1_url + href

This:
yesterday1 = yesterday.strftime(format)
is a string: strftime always returns a string. That's why you are getting that error. Inside the loop, enddate1 =+ timedelta(days=1) (note =+, not +=) also rebinds enddate1 to a timedelta, so the while condition ends up comparing a timedelta with a string. Keep the loop variable as a datetime, compare datetimes with datetimes, and only call strftime when you build the URL.
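For example, here is a minimal sketch of the loop along those lines, assuming the results page really takes a dd-mm-YYYY firstDate parameter as in the URLs above:

from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup

base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"
date_format = "%d-%m-%Y"  # matches 01-01-2019 in the example URLs

current = datetime(2019, 1, 1)
yesterday = datetime.today() - timedelta(days=1)

while current <= yesterday:  # datetime compared with datetime, no TypeError
    new_url = base_url + current.strftime(date_format)
    response = requests.get(new_url)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find('table', class_='meetingListFull')
    if table is not None:
        for a in table.find_all('a', href=True):  # one link per track that day
            track = a.get_text(strip=True)
            trackresults = base1_url + a['href']
    current += timedelta(days=1)  # advance the date object, not the string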

Related

Python: Webdriver get list of the URLs, duplicated data

I'm trying to pull the historical data from a URL. The date (as epoch time) is part of the URL.
import pandas as pd
import numpy as np
from selenium import webdriver
import chromedriver_binary
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import time
from datetime import datetime
options = Options()
options.headless = True
lastDate = '2021-07-01'
firstDate = '2010-01-01'
time_object = time.strptime(lastDate, '%Y-%m-%d')
period2 = int(time.mktime(time_object))
period1 = int(period2 - 86400*200)
time_object = time.strptime(firstDate, '%Y-%m-%d')
period0 = time.mktime(time_object)
count = 1
url=f"https://finance.yahoo.com/quote/%5EGSPC/history?period1={period1}&period2={period2}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
#url=r'https://finance.yahoo.com/quote/%5EGSPC/history?period1=1262304000&period2=1625097600&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true'
while period2 >= period0:
    ed = datetime.fromtimestamp(period2)
    sd = datetime.fromtimestamp(period1)
    print(f"Working on {sd} {ed}, current count {count}")
    print(f"URL is {url}")
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(20)
    driver.get(url)
    js = "var q=document.documentElement.scrollTop=100000"
    driver.execute_script(js)
    # NOTE: 't' is not defined in the snippet as posted; presumably it is the
    # history table parsed from driver.page_source with BeautifulSoup.
    for row in t.tbody.findAll('tr'):
        date = row.findAll('td')[0].text
        date = datetime.strptime(date, "%b %d, %Y")
        date = date.strftime("%Y-%m-%d")
        open = row.findAll('td')[1].text.replace(',','')
        high = row.findAll('td')[2].text.replace(',','')
        low = row.findAll('td')[3].text.replace(',','')
        close = row.findAll('td')[4].text.replace(',','')
        adjclose = row.findAll('td')[5].text.replace(',','')
        volume = row.findAll('td')[6].text.replace(',','')
        hist = pd.DataFrame([[date,open,high,low,close,adjclose,volume]], columns=['Date', 'Open','High','Low','Close', 'Adj Close', 'Volumn'])
        if count == 1:
            hist.to_csv('hist.csv', index=False, header=True)
        else:
            hist.to_csv('hist.csv', index=False, mode='a', header=False)
        count = count + 1
    period2 = int(period1)
    period1 = int(period2 - 86400*200)
    url = f"https://finance.yahoo.com/quote/%5EGSPC/history?period1={period1}&period2={period2}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
    driver.close()
I printed the URL, and it updated properly with the newly refreshed period. However, what's being written to my hist.csv is duplicated. It seems the driver only respected my first URL and completely ignored the rest, so I got the first period of dates/prices etc. repeatedly in my hist.csv.
Appreciate it if you have any comments.
Thanks
Please disregard - I just realized I didn't refresh the variable while using Jupyter. I found the problem just 2 minutes after I posted the question. Thanks for the great Stack Overflow!
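For reference, here is a minimal sketch of the epoch-window arithmetic the loop relies on: step period2 back in 200-day chunks of epoch seconds and rebuild the URL on each pass. Variable names mirror the question, and the URL template is the one posted above.

import time

lastDate, firstDate = '2021-07-01', '2010-01-01'
period2 = int(time.mktime(time.strptime(lastDate, '%Y-%m-%d')))
period0 = int(time.mktime(time.strptime(firstDate, '%Y-%m-%d')))

while period2 >= period0:
    period1 = period2 - 86400 * 200  # 200 days earlier, in seconds
    url = (f"https://finance.yahoo.com/quote/%5EGSPC/history"
           f"?period1={period1}&period2={period2}&interval=1d"
           f"&filter=history&frequency=1d&includeAdjustedClose=true")
    print(url)  # each window gets a freshly built URL
    period2 = period1  # step the window back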

Creating a while loop for a non-simple piece of code

I'm new to Python (and programming in general) and I'm having trouble creating a while loop. All the examples I've been able to find are pretty simple, along the lines of var=10, subtract 1 until you get to 0. I have a larger piece of code that I need to re-run as long as a condition is met, but what I've tried hasn't worked. I'm using Python 3 in Anaconda.
I've tried def main() and the code below, and now I'm trying while True (in my code sample, I have while True and the lines below it indented, but I can't seem to get it to indent here). What I'm trying to get it to do is check whether datey is less than endDate and, if it is, start the whole thing again.
Here's what I have:
from openpyxl import load_workbook

while True:
    WB = load_workbook('File.xlsx', data_only=True)
    SH = WB['02474']
    import datetime
    from datetime import timedelta
    endDate = datetime.datetime.strptime('2018-02-09', '%Y-%m-%d')
    MXrow = SH.max_row
    NXrow = MXrow+1
    datex = SH.cell(row=MXrow, column=2)
    datex2 = datex.value
    datey = datetime.datetime.strptime(datex2, '%Y-%m-%d') + datetime.timedelta(days=1)
    datey2 = datey.strftime('%Y-%m-%d')
    URL = 'https://www.example.com' + str(datey2)
    import requests
    page = requests.get(URL)
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(page.text, 'lxml')
    mintemp = soup.find("tr",{"class":"weatherhistory_results_datavalue temp_mn"})
    for i in mintemp:
        mintemp2 = i.find("span",{"class": "value"})
    maxtemp = soup.find("tr",{"class":"weatherhistory_results_datavalue temp_mx"})
    for i in maxtemp:
        maxtemp2 = i.find("span",{"class": "value"})
    mintemp3 = mintemp2.text
    maxtemp3 = maxtemp2.text
    while(datey <= endDate):
        SH.cell(row=NXrow, column=1).value = "ZIP"
        SH.cell(row=NXrow, column=2).value = datey2
        SH.cell(row=NXrow, column=3).value = URL
        SH.cell(row=NXrow, column=4).value = mintemp3
        SH.cell(row=NXrow, column=5).value = maxtemp3
        WB.save('File.xlsx')
Would love any pointers you can give me to fix this!
If you want to check if
datey is less than endDate
then simply use that condition at the top of your while loop instead of a boolean:
import requests
import datetime
from datetime import timedelta
from openpyxl import load_workbook
from bs4 import BeautifulSoup

# NOTE: datey and endDate need initial values before this condition is first evaluated
while(datey <= endDate):
    WB = load_workbook('File.xlsx', data_only=True)
    SH = WB['02474']
    endDate = datetime.datetime.strptime('2018-02-09', '%Y-%m-%d')
    MXrow = SH.max_row
    NXrow = MXrow+1
    datex = SH.cell(row=MXrow, column=2)
    datex2 = datex.value
    datey = datetime.datetime.strptime(datex2, '%Y-%m-%d') + datetime.timedelta(days=1)
    datey2 = datey.strftime('%Y-%m-%d')
    URL = 'https://www.example.com' + str(datey2)
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, 'lxml')
    mintemp = soup.find("tr",{"class":"weatherhistory_results_datavalue temp_mn"})
    for i in mintemp:
        mintemp2 = i.find("span",{"class": "value"})
    maxtemp = soup.find("tr",{"class":"weatherhistory_results_datavalue temp_mx"})
    for i in maxtemp:
        maxtemp2 = i.find("span",{"class": "value"})
    mintemp3 = mintemp2.text
    maxtemp3 = maxtemp2.text
    SH.cell(row=NXrow, column=1).value = "ZIP"
    SH.cell(row=NXrow, column=2).value = datey2
    SH.cell(row=NXrow, column=3).value = URL
    SH.cell(row=NXrow, column=4).value = mintemp3
    SH.cell(row=NXrow, column=5).value = maxtemp3
    WB.save('File.xlsx')
This way, when 'datey' is less than or equal to 'endDate', the body of the while loop will keep executing until 'datey' is greater than 'endDate'.
Also, import statements go at the top of your file; they do not belong inside a loop.
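As a rough sketch of that structure, assuming example.com really expects a YYYY-MM-DD date appended to the URL and reusing the variable names from the question: set the date and end date once before the loop, advance the date on each pass, and leave the scraping of mintemp3/maxtemp3 exactly as in the question (elided here).

import datetime
from openpyxl import load_workbook

WB = load_workbook('File.xlsx', data_only=True)
SH = WB['02474']
endDate = datetime.datetime.strptime('2018-02-09', '%Y-%m-%d')

# start from the last date already in column 2 of the sheet
datex2 = SH.cell(row=SH.max_row, column=2).value
datey = datetime.datetime.strptime(datex2, '%Y-%m-%d') + datetime.timedelta(days=1)

while datey <= endDate:
    NXrow = SH.max_row + 1
    datey2 = datey.strftime('%Y-%m-%d')
    URL = 'https://www.example.com' + datey2
    # ... fetch and parse mintemp3 / maxtemp3 for datey2 as in the question ...
    SH.cell(row=NXrow, column=1).value = "ZIP"
    SH.cell(row=NXrow, column=2).value = datey2
    SH.cell(row=NXrow, column=3).value = URL
    datey = datey + datetime.timedelta(days=1)  # move to the next day

WB.save('File.xlsx')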

HTTP requests freeze after several requests

Okay, here is my code:
from lxml import html
from lxml import etree
from selenium import webdriver
import calendar
import math
import urllib
import progressbar
import requests
Using Selenium:
path_to_driver = '/home/vladislav/Shit/geckodriver'
browser = webdriver.Firefox(executable_path = path_to_driver)
Create a dict where I store data, and create progress bars:
DataDict = {}
barY = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barM = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barW = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
Form parameters in a loop, construct a URL from them, and send a browser.get request:
for year in (range(2014,2016)):
    barY.update(year)
    for month in range(1,13):
        barM.update(month)
        weeks = math.ceil(calendar.monthrange(year,month)[1]/4)
        for week in range(weeks):
            barW.update(week)
            if (week > 2):
                start_day = 22
                end_day = calendar.monthrange(year,month)[1]
            else:
                start_day = 7*week + 1
                end_day = 7*(week + 1)
            start_date = str(year) + '-' + str(month).zfill(2) + '-' + str(start_day).zfill(2)
            end_date = str(year) + '-' + str(month).zfill(2) + '-' + str(end_day).zfill(2)
            params = {'end-date': end_date, 'start-date': start_date}
            url = 'http://www.finam.ru/profile/moex-akcii/aeroflot/news'
            url = url + ('&' if urllib.parse.urlparse(url).query else '?') + urllib.parse.urlencode(params)
The request itself:
browser.get(url)
try:
    news_list = browser.find_element_by_class_name('news-list')
    news_list_text = news_list.text
    news_list_text = news_list_text.split('\n')
    for i in range(int(len(news_list_text)/2)):
        DataDict.update({news_list_text[2*i]:news_list_text[2*i+1]})
    print("Found! Adding news to the dictionary!")
except:
    pass
But after 2-4 requests it just freezes :(
What's the problem?
Okay, the problem was an advertising banner which appeared after several requests. The solution is just to wait (time.sleep) until the banner disappears, and then send the request again:
try:
    browser.get(url)
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text)/2)):
            DataDict.update({news_list_text[2*i]:news_list_text[2*i+1]})
        #print("Found! Adding news to the dictionary!")
    except:
        pass
    time.sleep(10)
except:
    print("perchaps this shitty AD?")
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text)/2)):
            DataDict.update({news_list_text[2*i]:news_list_text[2*i+1]})
        #print("Found! Adding news to the dictionary!")
    except:
        pass
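An alternative sketch, not from the answer above: rather than a fixed time.sleep(10), Selenium's explicit waits can block until the news list is actually present, which also covers the time the banner is on screen. The helper name get_news_lines is hypothetical.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def get_news_lines(browser, url, timeout=30):
    # load the page and wait until the news list exists, then return its text lines
    browser.get(url)
    news_list = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'news-list'))
    )
    return news_list.text.split('\n')

The returned lines can then feed the same DataDict.update loop as above.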

handling pagination BeautifulSoup -Selenium

I have been struggling with this for over a week now. I am trying to learn Python and build something that will be useful to me at the same time - something to help me find a new home to rent.
I have all the code working the way I want it, except I can not get all 550 properties; I can only get the first 25 on page 1. I have tried several methods but nothing seems to work.
If I use urlopen and take the main URL + the regex of the page number, "2_p/" for example, I get a urlerror: unknown url 'h'.
If I use webdriver, Firefox tries to look up www.h.com. I really could use a little help. Attached is my code - sorry it is kinda messy and probably a little long - I am just learning, so don't be too cruel.
from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import datetime
from datetime import timedelta
import time
import re
pages1 = set()
next_page = ()
csv_output = [ ]
def getLinks(url):
    global pages
    driver = webdriver.Firefox()
    driver.get(url)
    time.sleep(3)
    pageSource = driver.page_source
    bsObj = BeautifulSoup(pageSource)
    for addr_link in bsObj.findAll("a", href=re.compile("^/homedetails/*")):
        if 'href' in addr_link.attrs:
            if addr_link['href'] not in pages1:
                newPage = addr_link.attrs['href']
                pages1.add(newPage)
                #print(newPage)
    print(len(pages1))
    for link in pages1:
        property_url = link
        getData(property_url)

def getData(url):
    base_url = 'http://www.zillow.com'
    final_url = base_url+url
    html = urlopen(final_url)
    bsObj = BeautifulSoup(html)
    try:
        # Gets Property Address
        address = bsObj.find("header", {"class":"zsg-content-header addr"}).find("h1")
        s_address = address.get_text()
        print(address)
        # Gets number of bedrooms
        beds = bsObj.find("span", {"class":"addr_bbs"})
        s_beds = beds.get_text()
        # Gets number of bathrooms
        baths = beds.find_next("span", {"class":"addr_bbs"})
        s_baths = baths.get_text()
        # Gets sqft
        sqft = baths.find_next("span", {"class":"addr_bbs"})
        s_sqft = sqft.get_text()
        # Gets rent_amount
        rent_amount = bsObj.find("div", {"class":"main-row home-summary-row"}).span
        s_rent_amount = rent_amount.get_text()
        # Gets rent_zestiment
        zestiment_holder = bsObj.find("span", {"class":"zsg-tooltip-launch zsg-tooltip-launch_keyword"})
        rent_zestiment = zestiment_holder.find_next("span")
        s_rent_zestiment = rent_zestiment.get_text()
        # Gets Date posted on Zillow
        for time_posted in bsObj(text=re.compile("Posted")):
            posted = time_posted.parent.get_text()
            if 'hours' in posted:
                date_on_zillow = datetime.date.today()
                posted_date = date_on_zillow
            else:
                days_subtracted = int(re.search(r'\d+', posted).group())
                posted_date = datetime.date.today()-datetime.timedelta(days=+days_subtracted)
        # Gets Property subdivision
        subdivision = bsObj.find(id="hdp-neighborhood").h2
        s_subdivision = subdivision.get_text()
        # Gets property_manager_name
        property_manager_name = bsObj.find("span", {"class":"snl company-name"})
        s_property_manager_name = property_manager_name.get_text()
        # Gets property_manager_phone
        property_manager_phone = bsObj.find("span", {"class":"snl phone"})
        s_property_manager_phone = property_manager_phone.get_text()
        # Gets disc_of_property
        disc_of_property = (bsObj.find('div', {'class': "notranslate"}).text)
        s_disc_of_property = disc_of_property.encode("utf-8")
        # Gets url_of_listing so I can see Photos if interested
        main_url = 'http://www.zillow.com'
        url2 = url
        listing_url = main_url+url2
        s_listing_url = listing_url
    except AttributeError as e:
        return None
    csv_data = [s_address, s_beds, s_baths, s_sqft, s_rent_amount, s_rent_zestiment, posted_date, s_subdivision, s_property_manager_name, s_property_manager_phone, s_disc_of_property, s_listing_url]
    csv_output.append(csv_data)
    resultFile = open("output.csv",'w+')
    wr = csv.writer(resultFile)
    wr.writerows(csv_output)
    resultFile.close()
header = ['Address', 'Beds', 'Baths', 'Sqft', 'Rent Amount', 'rent Zestiment', 'Posted Date', 'Subdivision', 'Property Manager Name', 'Property Manager Phone', 'Disc of Property', 'URL']
csv_output.append(header)
getLinks("http://www.zillow.com/homes/for_rent/Jackson-County-MO/house,mobile_type/1804_rid/6m_days/39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")
Edit:
The 'while' that has been commented out was my last attempt at handling the pagination.
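For what it's worth, here is a rough sketch of one way to walk the result pages, based on the "2_p/" path segment mentioned in the question and reusing the getLinks function above. The page count of 23 is an assumption for illustration (roughly 550 listings at 25 per page).

search_url = ("http://www.zillow.com/homes/for_rent/Jackson-County-MO/"
              "house,mobile_type/1804_rid/6m_days/"
              "39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")

for page_num in range(1, 24):  # assumed page count for illustration
    if page_num == 1:
        page_url = search_url  # the first page has no page suffix
    else:
        page_url = search_url + str(page_num) + '_p/'  # e.g. ".../2_p/" for page 2
    getLinks(page_url)  # reuse the function from the question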

how to automate this beautifulsoup import

I am importing links to boxscores from this webpage
http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html
This is how I am doing it now. I get the links from the first page.
url = 'http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html'
boxurl = urllib2.urlopen(url).read()
soup = BeautifulSoup(boxurl)
boxscores = soup.findAll('a', href=re.compile('boxscore'))
basepath = "http://www.covers.com"
pages = []
# This grabs the links from the page
for a in boxscores:
    pages.append(urllib2.urlopen(basepath + a['href']).read())
Then in a new window I do this.
newsoup = pages[1] # I am manually changing this every time
soup = BeautifulSoup(newsoup)
def _unpack(row, kind='td'):
    return [val.text for val in row.findAll(kind)]
tables = soup('table')
linescore = tables[1]
linescore_rows = linescore.findAll('tr')
roadteamQ1 = float(_unpack(linescore_rows[1])[1])
roadteamQ2 = float(_unpack(linescore_rows[1])[2])
roadteamQ3 = float(_unpack(linescore_rows[1])[3])
roadteamQ4 = float(_unpack(linescore_rows[1])[4]) # add OT rows if ???
roadteamFinal = float(_unpack(linescore_rows[1])[-3])
hometeamQ1 = float(_unpack(linescore_rows[2])[1])
hometeamQ2 = float(_unpack(linescore_rows[2])[2])
hometeamQ3 = float(_unpack(linescore_rows[2])[3])
hometeamQ4 = float(_unpack(linescore_rows[2])[4]) # add OT rows if ???
hometeamFinal = float(_unpack(linescore_rows[2])[-3])
misc_stats = tables[5]
misc_stats_rows = misc_stats.findAll('tr')
roadteam = str(_unpack(misc_stats_rows[0])[0]).strip()
hometeam = str(_unpack(misc_stats_rows[0])[1]).strip()
datefinder = tables[6]
datefinder_rows = datefinder.findAll('tr')
date = str(_unpack(datefinder_rows[0])[0]).strip()
year = 2012
from dateutil.parser import parse
parsedDate = parse(date)
date = parsedDate.replace(year)
month = parsedDate.month
day = parsedDate.day
modDate = str(day)+str(month)+str(year)
gameid = modDate + roadteam + hometeam
data = {'roadteam': [roadteam],
        'hometeam': [hometeam],
        'roadQ1': [roadteamQ1],
        'roadQ2': [roadteamQ2],
        'roadQ3': [roadteamQ3],
        'roadQ4': [roadteamQ4],
        'homeQ1': [hometeamQ1],
        'homeQ2': [hometeamQ2],
        'homeQ3': [hometeamQ3],
        'homeQ4': [hometeamQ4]}
globals()["%s" % gameid] = pd.DataFrame(data)
df = pd.DataFrame.load('df')
df = pd.concat([df, globals()["%s" % gameid]])
df.save('df')
How can I automate this so I don't have to change newsoup = pages[1] by hand, and instead scrape all of the boxscores linked from the first URL in one go? I am pretty new to Python and lacking some understanding of the basics.
So in the first code box you collect the pages.
In the second code box, if I understood correctly, you have to loop over them:
for page in pages:
    soup = BeautifulSoup(page)
    # rest of the code here
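As a rough sketch of how that could look applied to the question's code: wrap the per-boxscore parsing in a function (parse_boxscore here is a hypothetical name) and call it for every fetched page, collecting one DataFrame per game instead of relying on globals(). Only a couple of the quarter scores are kept for brevity, and the result is written with to_csv.

import pandas as pd
from bs4 import BeautifulSoup

def parse_boxscore(page_html):
    # parse one boxscore page and return a single-row DataFrame
    soup = BeautifulSoup(page_html)
    tables = soup('table')
    linescore_rows = tables[1].findAll('tr')
    road = [val.text for val in linescore_rows[1].findAll('td')]
    home = [val.text for val in linescore_rows[2].findAll('td')]
    return pd.DataFrame({'roadQ1': [float(road[1])], 'homeQ1': [float(home[1])]})

frames = [parse_boxscore(page) for page in pages]  # 'pages' from the first code box
df = pd.concat(frames, ignore_index=True)
df.to_csv('df.csv', index=False)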
