Creating a while loop for a non-simple piece of code - python

I'm new to Python (and programming in general) and I'm having trouble creating a while loop. All the examples I've been able to find are pretty simple, along the lines of var=10, subtract 1 until you get to 0. I have a larger piece of code that I need to re-run as long as a condition is met, but what I've tried hasn't worked. I'm using Python 3 in Anaconda.
I've tried wrapping the code in def main(), and now I'm trying while True: with everything below it indented. What I'm trying to get it to do is check whether datey is less than endDate and, if it is, start the whole thing again.
Here's what I have:
from openpyxl import load_workbook

while True:
    WB = load_workbook('File.xlsx', data_only=True)
    SH = WB['02474']
    import datetime
    from datetime import timedelta
    endDate = datetime.datetime.strptime('2018-02-09', '%Y-%m-%d')
    MXrow = SH.max_row
    NXrow = MXrow + 1
    datex = SH.cell(row=MXrow, column=2)
    datex2 = datex.value
    datey = datetime.datetime.strptime(datex2, '%Y-%m-%d') + datetime.timedelta(days=1)
    datey2 = datey.strftime('%Y-%m-%d')
    URL = 'https://www.example.com' + str(datey2)
    import requests
    page = requests.get(URL)
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(page.text, 'lxml')
    mintemp = soup.find("tr", {"class": "weatherhistory_results_datavalue temp_mn"})
    for i in mintemp:
        mintemp2 = i.find("span", {"class": "value"})
    maxtemp = soup.find("tr", {"class": "weatherhistory_results_datavalue temp_mx"})
    for i in maxtemp:
        maxtemp2 = i.find("span", {"class": "value"})
    mintemp3 = mintemp2.text
    maxtemp3 = maxtemp2.text
    while(datey <= endDate):
        SH.cell(row=NXrow, column=1).value = "ZIP"
        SH.cell(row=NXrow, column=2).value = datey2
        SH.cell(row=NXrow, column=3).value = URL
        SH.cell(row=NXrow, column=4).value = mintemp3
        SH.cell(row=NXrow, column=5).value = maxtemp3
        WB.save('File.xlsx')
Would love any pointers you can give me to fix this!

If you want to keep running the whole block while datey is less than or equal to endDate, then simply put that condition in your outer while statement instead of a boolean (while True):
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import requests
import datetime
from datetime import timedelta

# Note: datey and endDate must already hold values the first time the
# condition is evaluated (e.g. read them once before entering the loop).
while datey <= endDate:
    WB = load_workbook('File.xlsx', data_only=True)
    SH = WB['02474']
    endDate = datetime.datetime.strptime('2018-02-09', '%Y-%m-%d')
    MXrow = SH.max_row
    NXrow = MXrow + 1
    datex = SH.cell(row=MXrow, column=2)
    datex2 = datex.value
    datey = datetime.datetime.strptime(datex2, '%Y-%m-%d') + datetime.timedelta(days=1)
    datey2 = datey.strftime('%Y-%m-%d')
    URL = 'https://www.example.com' + str(datey2)
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, 'lxml')
    mintemp = soup.find("tr", {"class": "weatherhistory_results_datavalue temp_mn"})
    for i in mintemp:
        mintemp2 = i.find("span", {"class": "value"})
    maxtemp = soup.find("tr", {"class": "weatherhistory_results_datavalue temp_mx"})
    for i in maxtemp:
        maxtemp2 = i.find("span", {"class": "value"})
    mintemp3 = mintemp2.text
    maxtemp3 = maxtemp2.text
    SH.cell(row=NXrow, column=1).value = "ZIP"
    SH.cell(row=NXrow, column=2).value = datey2
    SH.cell(row=NXrow, column=3).value = URL
    SH.cell(row=NXrow, column=4).value = mintemp3
    SH.cell(row=NXrow, column=5).value = maxtemp3
    WB.save('File.xlsx')
This way, as long as datey is less than or equal to endDate, the body of the while loop keeps running; it stops once datey is greater than endDate.
Also, import statements go at the top of your file and do not belong inside a loop.
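To show how the pieces fit together, here is a minimal sketch (assuming, as in the question, that column 2 of the sheet holds the date as 'YYYY-MM-DD' text): seed datey from the last row once, then let the condition drive the loop.
import datetime
import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook

endDate = datetime.datetime.strptime('2018-02-09', '%Y-%m-%d')
WB = load_workbook('File.xlsx', data_only=True)
SH = WB['02474']

# Seed datey from the last date already in the sheet so the condition
# can be evaluated on the first pass.
datey = datetime.datetime.strptime(SH.cell(row=SH.max_row, column=2).value, '%Y-%m-%d')

while datey <= endDate:
    datey = datey + datetime.timedelta(days=1)   # move forward one day per pass
    datey2 = datey.strftime('%Y-%m-%d')
    URL = 'https://www.example.com' + str(datey2)
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, 'lxml')
    # ... pull mintemp3 / maxtemp3 out of soup exactly as in the question ...
    NXrow = SH.max_row + 1
    SH.cell(row=NXrow, column=2).value = datey2
    SH.cell(row=NXrow, column=3).value = URL
    # columns 1, 4 and 5 (ZIP / mintemp3 / maxtemp3) follow the same pattern
    WB.save('File.xlsx')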

Related

Issues Scraping multiple webpages with BeautifulSoup

I am scraping a URL (example: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-4.html) and the number on the end of the URL is the page number. I am trying to scrape multiple pages, so I used the following code to loop through the multiple pages:
for page in range(4, 7): #Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
    print(page)
When I run the code in my script and print the page, it returns 4, 5 and 6, so the loop itself seems to be working. However, when I run the full code, it only gives me results for the 6th page.
What I think may be happening is that the code is settling on the last number and formatting only that into the URL, when it should be formatting each number into the URL in turn.
I have tried looking at other posts with similar issues but haven't been able to find a solution. I believe this may be a code formatting error, but I am not exactly sure. Any advice is greatly appreciated. Thank you.
Here is the remainder of my code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl
# define 1-1-2020 as a datetime object
after_date = datetime(2021, 1, 1)
with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    for page in range(4, 7): #Range designates the page numbers for the URL
        r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
        print(page)
    soup = bs(r.content, 'lxml')
    # select all tr elements (minus the first one, which is the header)
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        children = element.contents # get children of table element
        url = children[1].a['href']
        last_out_str = children[8].text
        if last_out_str != "": # check to make sure the date field isn't empty
            last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z") # load date into datetime object for comparison
            if last_out > after_date: # if check to see if the date is after last_out
                address_links.append(url + '-full') #add address_links to the list, -full makes the link show all data
    print(address_links)
    for url in address_links: #loop through the urls in address_links list
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        ad2 = (soup.title.string) #grab the web title which is used for the filename
        ad2 = ad2.replace('Dogecoin', '')
        ad2 = ad2.replace('Address', '')
        ad2 = ad2.replace('-', '')
        filename = ad2.replace(' ', '')
        sections = soup.find_all(class_='table-striped')
        for section in sections: #This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
            oldprofit = section.find_all('td')[11].text #Get the profit
            removetext = oldprofit.replace('USD', '')
            removetext = removetext.replace(' ', '')
            removetext = removetext.replace(',', '')
            profit = float(removetext)
            balance = section.find_all('td')[0].text #Get the wallet balance
            amount_recieved = section.find_all('td')[3].text #Get amount recieved
            ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')] #Filter out text from amount recieved
            ins = ins.replace('ins', '')
            ins = ins.replace(' ', '')
            ins = float(ins)
            first_recieved = section.find_all('td')[4].text #Get the data of the first incoming transaction
            fr = first_recieved.replace('first', '')
            fr = fr.replace(':', '')
            fr = fr.replace(' ', '')
            last_recieved = section.find_all('td')[5].text #Get the date of the last incoming transaction
            lr = last_recieved.replace('last', '')
            lr = lr.replace(':', '')
            lr = lr.replace(' ', '')
            amount_sent = section.find_all('td')[7].text #Get the amount sent
            outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')] #Filter out the text
            outs = outs.replace('outs', '')
            outs = outs.replace(' ', '')
            outs = float(outs)
            first_sent = section.find_all('td')[8].text #Get the first outgoing transaction date
            fs = first_sent.replace('first', '') #clean up first outgoing transaction date
            fs = fs.replace(':', '')
            fs = fs.replace(' ', '')
            last_sent = section.find_all('td')[9].text #Get the last outgoing transaction date
            ls = last_sent.replace('last', '') #Clean up last outgoing transaction date
            ls = ls.replace(':', '')
            ls = ls.replace(' ', '')
            dbalance = section.find_all('td')[0].select('b') #get the balance of doge
            dusd = section.find_all('td')[0].select('span')[1] #get balance of USD
            for data in dbalance: #used to clean the text up
                balance = data.text
            for data1 in dusd: #used to clean the text up
                usd = data1.text
            # Compare profit to goal, if profit doesn't meet the goal, the URL is not scraped
            goal = float(30000)
            if profit < goal:
                continue
            #Select wallets with under 2000 transactions
            trans = float(ins + outs) #adds the amount of incoming and outgoing transactions
            trans_limit = float(2000)
            if trans > trans_limit:
                continue
            # Create Info Dataframe using the data from above
            info = {
                'Balance': [balance],
                'USD Value': [usd],
                'Wallet Profit': [profit],
                'Amount Recieved': [amount_recieved],
                'First Recieved': [fr],
                'Last Recieved': [lr],
                'Amount Sent': [amount_sent],
                'First Sent': [fs],
                'Last Sent': [ls],
            }
            gf = pd.DataFrame(info)
            a = 'a'
            if a:
                df = \
                    pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
                                 index_col=None, header=[0])[0] #uses pandas to read the dataframe and save it
                directory = '/Users/chris/Desktop/Files' #directory for the file to go to
                file = f'{filename}.xlsx'
                writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
                with pd.ExcelWriter(writer) as writer:
                    df.to_excel(writer, sheet_name='transactions')
                    gf.to_excel(writer, sheet_name='info')
Check your indentation. In your question the two loops are on the same level, so the loop that makes the requests iterates over all the pages, but the results are never processed until the iterating is done; that is why you only get results for the last page.
Move the loops that handle the response and extract the elements into your first loop:
...
for page in range(4, 7): #Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
    print(page)
    soup = bs(r.content, 'lxml')
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        ...
    for url in address_links:
        ...
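As a stripped-down illustration of the original problem (using hypothetical stand-in strings instead of real requests): anything placed after the for loop only ever sees the value r held on the final iteration.
results = []
for page in range(4, 7):
    r = f"response for page {page}"  # stand-in for s.get(...)

# This runs once, after the loop has finished, so r is the page-6 response.
results.append(r)
print(results)  # ['response for page 6']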

Python: Webdriver get list of the URLs, duplicated data

I'm trying to pull the historical data from a URL. The date (as epoch time) is part of the URL.
import pandas as pd
import numpy as np
from selenium import webdriver
import chromedriver_binary
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import time
from datetime import datetime

options = Options()
options.headless = True

lastDate = '2021-07-01'
firstDate = '2010-01-01'
time_object = time.strptime(lastDate, '%Y-%m-%d')
period2 = int(time.mktime(time_object))
period1 = int(period2 - 86400*200)
time_object = time.strptime(firstDate, '%Y-%m-%d')
period0 = time.mktime(time_object)
count = 1

url = f"https://finance.yahoo.com/quote/%5EGSPC/history?period1={period1}&period2={period2}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
#url = r'https://finance.yahoo.com/quote/%5EGSPC/history?period1=1262304000&period2=1625097600&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true'

while period2 >= period0:
    ed = datetime.fromtimestamp(period2)
    sd = datetime.fromtimestamp(period1)
    print(f"Working on {sd} {ed}, current count {count}")
    print(f"URL is {url}")
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(20)
    driver.get(url)
    js = "var q=document.documentElement.scrollTop=100000"
    driver.execute_script(js)
    for row in t.tbody.findAll('tr'):  # note: 't' (the parsed history table) is not defined in the posted snippet
        date = row.findAll('td')[0].text
        date = datetime.strptime(date, "%b %d, %Y")
        date = date.strftime("%Y-%m-%d")
        open = row.findAll('td')[1].text.replace(',', '')
        high = row.findAll('td')[2].text.replace(',', '')
        low = row.findAll('td')[3].text.replace(',', '')
        close = row.findAll('td')[4].text.replace(',', '')
        adjclose = row.findAll('td')[5].text.replace(',', '')
        volume = row.findAll('td')[6].text.replace(',', '')
        hist = pd.DataFrame([[date, open, high, low, close, adjclose, volume]], columns=['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volumn'])
        if count == 1:
            hist.to_csv('hist.csv', index=False, header=True)
        else:
            hist.to_csv('hist.csv', index=False, mode='a', header=False)
        count = count + 1
    period2 = int(period1)
    period1 = int(period2 - 86400*200)
    url = f"https://finance.yahoo.com/quote/%5EGSPC/history?period1={period1}&period2={period2}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
    driver.close()
I printed the URL, and it updated properly with the newly refreshed period. However, what's being written to my hist.csv is duplicated. It seems the driver only respected my first URL and completely ignored the rest, so I got the first period of dates/prices repeated in my hist.csv.
I'd appreciate any comments. Thanks.
Please disregard - I just realized I didn't refresh the variable while using Jupyter. I found the problem two minutes after posting the question. Thanks for the great Stack Overflow!
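For anyone who hits the same thing in a notebook: one way to keep a stale url from surviving between runs is to rebuild it from the current period values on every pass, for example inside a small helper (a rough sketch reusing the poster's variable names, which are assumed to be set up as in the question):
def history_url(period1, period2):
    # Rebuild the URL from the current periods each iteration, so an old
    # value cannot linger from a previous notebook run.
    return (f"https://finance.yahoo.com/quote/%5EGSPC/history"
            f"?period1={period1}&period2={period2}"
            f"&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true")

while period2 >= period0:
    url = history_url(period1, period2)
    # ... fetch and parse as above ...
    period2 = int(period1)
    period1 = int(period2 - 86400*200)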

How do I solve for error str in datetime?

The goal is to use datetime to iterate over
http://www.harness.org.au/racing/results/?firstDate=01-01-2019
http://www.harness.org.au/racing/results/?firstDate=02-01-2019 ... up to yesterday's date
(this should happen in new_url = base_url + str(enddate1)).
Then, once in that href, I want to loop over the meetingListFull table to get the name and href, and then get the results data from each track that day.
My current error is '<=' not supported between instances of 'datetime.timedelta' and 'str', which comes from my while loop. Why is this? I've never used datetime before.
from datetime import datetime, date, timedelta
import requests
import re
from bs4 import BeautifulSoup

base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = BeautifulSoup(webpage_response.content, "html.parser")

format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
yesterday1 = yesterday.strftime(format)
enddate = datetime(2019, 1, 1)
enddate1 = enddate.strftime(format)

while enddate1 <= yesterday1:
    enddate1 =+ timedelta(days=1)
    new_url = base_url + str(enddate1)
    soup12 = requests.get(new_url)
    soup1 = BeautifulSoup(soup12.content, "html.parser")
    table1 = soup1.find('table', class_='meetingListFull')
    for tr in table1.find_all('tr'):
        all_cells = tr.find_all('td')
        track = all_cells.a.href.get_text()
        href = all_cells.get('href')
        trackresults = base1_url + href
This line:
yesterday1 = yesterday.strftime(format)
produces a string. That's why you are getting that error.
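One way around it, as a sketch, is to keep the loop variable as a datetime object and only format it when building the URL (names reused from the question; the %d-%m-%Y format matches the date pattern shown in the question's URLs):
from datetime import datetime, timedelta

base_url = "http://www.harness.org.au/racing/results/?firstDate="

current = datetime(2019, 1, 1)
yesterday = datetime.today() - timedelta(days=1)

while current <= yesterday:                 # datetime compared with datetime
    new_url = base_url + current.strftime("%d-%m-%Y")
    # ... request and parse new_url here ...
    current = current + timedelta(days=1)   # advance one day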

handling pagination BeautifulSoup -Selenium

I have been struggling with this for over a week now. I am trying to learn Python and build something that will be useful to me at the same time - something to help me find a new home to rent.
I have all the code working the way I want it, except I can not get all 550 properties; I can only get the first 25 on page 1. I have tried several methods but nothing seems to work.
If I use urlopen and take the main URL plus the regex of the page number ("2_p/" for example), I get a URL error, unknown url "h".
If I use webdriver, Firefox tries to look up www.h.com. I really could use a little help. Attached is my code - sorry it is kind of messy and probably a little long; I am just learning, so don't be too cruel.
from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import datetime
from datetime import timedelta
import time
import re

pages1 = set()
next_page = ()
csv_output = []

def getLinks(url):
    global pages
    driver = webdriver.Firefox()
    driver.get(url)
    time.sleep(3)
    pageSource = driver.page_source
    bsObj = BeautifulSoup(pageSource)
    for addr_link in bsObj.findAll("a", href=re.compile("^/homedetails/*")):
        if 'href' in addr_link.attrs:
            if addr_link['href'] not in pages1:
                newPage = addr_link.attrs['href']
                pages1.add(newPage)
                #print(newPage)
    print(len(pages1))
    for link in pages1:
        property_url = link
        getData(property_url)

def getData(url):
    base_url = 'http://www.zillow.com'
    final_url = base_url + url
    html = urlopen(final_url)
    bsObj = BeautifulSoup(html)
    try:
        # Gets Property Address
        address = bsObj.find("header", {"class":"zsg-content-header addr"}).find("h1")
        s_address = address.get_text()
        print(address)
        # Gets number of bedrooms
        beds = bsObj.find("span", {"class":"addr_bbs"})
        s_beds = beds.get_text()
        # Gets number of bathrooms
        baths = beds.find_next("span", {"class":"addr_bbs"})
        s_baths = baths.get_text()
        # Gets sqft
        sqft = baths.find_next("span", {"class":"addr_bbs"})
        s_sqft = sqft.get_text()
        # Gets rent_amount
        rent_amount = bsObj.find("div", {"class":"main-row home-summary-row"}).span
        s_rent_amount = rent_amount.get_text()
        # Gets rent_zestiment
        zestiment_holder = bsObj.find("span", {"class":"zsg-tooltip-launch zsg-tooltip-launch_keyword"})
        rent_zestiment = zestiment_holder.find_next("span")
        s_rent_zestiment = rent_zestiment.get_text()
        # Gets Date posted on Zillow
        for time_posted in bsObj(text=re.compile("Posted")):
            posted = time_posted.parent.get_text()
            if 'hours' in posted:
                date_on_zillow = datetime.date.today()
                posted_date = date_on_zillow
            else:
                days_subtracted = int(re.search(r'\d+', posted).group())
                posted_date = datetime.date.today()-datetime.timedelta(days=+days_subtracted)
        # Gets Property subdivision
        subdivision = bsObj.find(id="hdp-neighborhood").h2
        s_subdivision = subdivision.get_text()
        # Gets property_manager_name
        property_manager_name = bsObj.find("span", {"class":"snl company-name"})
        s_property_manager_name = property_manager_name.get_text()
        # Gets property_manager_phone
        property_manager_phone = bsObj.find("span", {"class":"snl phone"})
        s_property_manager_phone = property_manager_phone.get_text()
        # Gets disc_of_property
        disc_of_property = (bsObj.find('div', {'class': "notranslate"}).text)
        s_disc_of_property = disc_of_property.encode("utf-8")
        # Gets url_of_listing so I can see Photos if interested
        main_url = 'http://www.zillow.com'
        url2 = url
        listing_url = main_url + url2
        s_listing_url = listing_url
    except AttributeError as e:
        return None
    csv_data = [s_address, s_beds, s_baths, s_sqft, s_rent_amount, s_rent_zestiment, posted_date, s_subdivision, s_property_manager_name, s_property_manager_phone, s_disc_of_property, s_listing_url]
    csv_output.append(csv_data)
    resultFile = open("output.csv", 'w+')
    wr = csv.writer(resultFile)
    wr.writerows(csv_output)
    resultFile.close()

header = ['Address', 'Beds', 'Baths', 'Sqft', 'Rent Amount', 'rent Zestiment', 'Posted Date', 'Subdivision', 'Property Manager Name', 'Property Manager Phone', 'Disc of Property', 'URL']
csv_output.append(header)

getLinks("http://www.zillow.com/homes/for_rent/Jackson-County-MO/house,mobile_type/1804_rid/6m_days/39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")
Edit:
The 'while' that has been commented out was my last attempt at handling the pagination.

how to automate this beautifulsoup import

I am importing links to boxscores from this webpage
http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html
This is how I am doing it now. I get the links from the first page.
url = 'http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html'
boxurl = urllib2.urlopen(url).read()
soup = BeautifulSoup(boxurl)
boxscores = soup.findAll('a', href=re.compile('boxscore'))
basepath = "http://www.covers.com"
pages = []  # This grabs the links from the page
for a in boxscores:
    pages.append(urllib2.urlopen(basepath + a['href']).read())
Then in a new window I do this.
newsoup = pages[1] # I am manually changing this every time
soup = BeautifulSoup(newsoup)
def _unpack(row, kind='td'):
    return [val.text for val in row.findAll(kind)]
tables = soup('table')
linescore = tables[1]
linescore_rows = linescore.findAll('tr')
roadteamQ1 = float(_unpack(linescore_rows[1])[1])
roadteamQ2 = float(_unpack(linescore_rows[1])[2])
roadteamQ3 = float(_unpack(linescore_rows[1])[3])
roadteamQ4 = float(_unpack(linescore_rows[1])[4]) # add OT rows if ???
roadteamFinal = float(_unpack(linescore_rows[1])[-3])
hometeamQ1 = float(_unpack(linescore_rows[2])[1])
hometeamQ2 = float(_unpack(linescore_rows[2])[2])
hometeamQ3 = float(_unpack(linescore_rows[2])[3])
hometeamQ4 = float(_unpack(linescore_rows[2])[4]) # add OT rows if ???
hometeamFinal = float(_unpack(linescore_rows[2])[-3])
misc_stats = tables[5]
misc_stats_rows = misc_stats.findAll('tr')
roadteam = str(_unpack(misc_stats_rows[0])[0]).strip()
hometeam = str(_unpack(misc_stats_rows[0])[1]).strip()
datefinder = tables[6]
datefinder_rows = datefinder.findAll('tr')
date = str(_unpack(datefinder_rows[0])[0]).strip()
year = 2012
from dateutil.parser import parse
parsedDate = parse(date)
date = parsedDate.replace(year)
month = parsedDate.month
day = parsedDate.day
modDate = str(day)+str(month)+str(year)
gameid = modDate + roadteam + hometeam
data = {'roadteam': [roadteam],
        'hometeam': [hometeam],
        'roadQ1': [roadteamQ1],
        'roadQ2': [roadteamQ2],
        'roadQ3': [roadteamQ3],
        'roadQ4': [roadteamQ4],
        'homeQ1': [hometeamQ1],
        'homeQ2': [hometeamQ2],
        'homeQ3': [hometeamQ3],
        'homeQ4': [hometeamQ4]}
globals()["%s" % gameid] = pd.DataFrame(data)
df = pd.DataFrame.load('df')
df = pd.concat([df, globals()["%s" % gameid]])
df.save('df')
How can I automate this so I don't have to change newsoup = pages[1] by hand, and instead scrape all of the boxscores linked from the first URL in one go? I am pretty new to Python and lacking some understanding of the basics.
In the first code box you collect the pages. So in the second code box, if I understood it correctly, you just have to loop over them:
for page in pages:
    soup = BeautifulSoup(page)
    # rest of the code here
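Concretely, one way to do that (a sketch, reusing the question's variable names) is to move the second code box into a function and call it once per page, instead of editing newsoup by hand:
def parse_boxscore(page_html):
    soup = BeautifulSoup(page_html)
    tables = soup('table')
    linescore = tables[1]
    # ... rest of the second code box, unchanged, working on this soup ...
    # build and save the per-game DataFrame here

for page in pages:
    parse_boxscore(page)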
