I want to raise an exception if any mismatch is found, but the loop should still continue.
If there's any mismatch/exception, the entire test case should fail.
Can y'all check the code below and help me out here?
def test01_check_urls(self, test_setup):
    # reading the file
    Total_entries = len(old_urls)  # Total_entries = 5
    print("Total entries in the sheet: " + str(Total_entries))
    col_count = 0
    # opening urls
    while col_count < Total_entries:
        Webpage = old_urls[col_count]  # fetching data from 1st cell in the excel
        Newpage = new_urls[col_count]  # fetching data from 1st cell in the excel
        driver.get(Webpage)
        print("The old page url is: " + Webpage)
        page_title = driver.title
        print(page_title)
        Redr_page = driver.current_url
        print("The new url is: " + Redr_page)
        print("New_url from sheet: " + Newpage)
        try:
            if Redr_page == Newpage:
                print("Correct url")
        except:
            raise Exception("Url mismatch")
        col_count += 1
Have a variable url_mismatch, initially False. Instead of immediately raising an exception when there is a URL mismatch, just set this variable to True. Then, when the loop ends, check the value of this variable and raise an exception if it is True.
However, it's not clear how your try block results in an exception. Did you possibly mean (no try block necessary):
if Redr_page == Newpage:
    print("Correct url")
else:
    raise Exception("Url mismatch")
For now I am leaving that part of the code unmodified:
url_mismatch = False

while col_count < Total_entries:
    Webpage = old_urls[col_count]  # fetching data from 1st cell in the excel
    Newpage = new_urls[col_count]  # fetching data from 1st cell in the excel
    driver.get(Webpage)
    print("The old page url is: " + Webpage)
    page_title = driver.title
    print(page_title)
    Redr_page = driver.current_url
    print("The new url is: " + Redr_page)
    print("New_url from sheet: " + Newpage)
    try:
        if Redr_page == Newpage:
            print("Correct url")
    except:
        print('Mismatch url')
        url_mismatch = True  # show we have had a mismatch
    col_count += 1

# now check for a mismatch and raise an exception if there has been one:
if url_mismatch:
    raise Exception("Url mismatch")
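A small variation on the same idea, in case it helps: instead of a bare True/False flag, you could collect the mismatching entries in a list and report them all at the end, so the failure message tells you which URLs were wrong. This is only a sketch built on the variables from the question (old_urls, new_urls, Total_entries, driver):

mismatches = []
col_count = 0
while col_count < Total_entries:
    driver.get(old_urls[col_count])
    redirected = driver.current_url
    if redirected != new_urls[col_count]:
        # remember the failing entry instead of raising straight away
        mismatches.append((old_urls[col_count], redirected, new_urls[col_count]))
    col_count += 1

# fail the test case once, after every URL has been checked
if mismatches:
    raise Exception("Url mismatch for entries: " + str(mismatches))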
Related
I have the following code to get some data using Selenium. It goes through a list of ids with a for loop and stores them in my lists (titulos = [] and ids = []). It was working fine until I added the try/except. The code looks like this:
for item in registros:
    found = False
    ids = []
    titulos = []
    try:
        while True:
            #code to request data
            try:
                error = False
                error = #error message
                if error is True:
                    break
            except:
                continue
    except:
        continue
    try:
        found = #if id has data
        if found.is_displayed:
            titulo = #locator
            ids.append(item)
            titulos.append(titulo)
    except NoSuchElementException:
        input.clear()
The first inner try block needs to be indented. Also, the error variable will always be set to the message text, so it will always be truthy. Try formatting your code correctly and then identifying the problem.
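To make that point concrete, here is one way the logic could be structured once the indentation is fixed. The locators (error_xpath, id_xpath, titulo_xpath) are placeholders for the values elided in the question, driver stands for your WebDriver instance, and the old find_element_by_xpath-style API is kept to match the original code:

from selenium.common.exceptions import NoSuchElementException

for item in registros:
    ids = []
    titulos = []
    while True:
        # ... code to request data ...
        # error is True only when the error-message element really exists on the page;
        # find_elements returns an empty list (falsy) instead of raising
        error = bool(driver.find_elements_by_xpath(error_xpath))
        if error:
            break
    try:
        found = driver.find_element_by_xpath(id_xpath)
        if found.is_displayed():          # is_displayed is a method, so it needs ()
            titulo = driver.find_element_by_xpath(titulo_xpath).text
            ids.append(item)
            titulos.append(titulo)
    except NoSuchElementException:
        input.clear()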
I am performing web scraping via Python \ Selenium \ Chrome headless driver. I am reading the results from JSON - here is my code:
CustId = 500
while (CustId <= 510):
    print(CustId)
    # Part 1: Customer REST call:
    urlg = f'https://mywebsite/customerRest/show/?id={CustId}'
    driver.get(urlg)
    soup = BeautifulSoup(driver.page_source, "lxml")
    dict_from_json = json.loads(soup.find("body").text)
    # print(dict_from_json)
    #try:
    CustID = (dict_from_json['customerAddressCreateCommand']['customerId'])
    # Addr = (dict_from_json['customerShowCommand']['customerAddressShowCommandSet'][0]['addressDisplayName'])
    writefunction()
    CustId = CustId + 1
The issue is that sometimes 'addressDisplayName' will be present in the result set and sometimes not. If it's not, it fails with the error:
IndexError: list index out of range
Which makes sense, as it doesn't exist. How do I ignore this though, so that if 'addressDisplayName' doesn't exist the loop just continues? I've tried using a try, but the code still stops executing.
A try..except block should resolve your issue.
CustId = 500
while (CustId <= 510):
    print(CustId)
    # Part 1: Customer REST call:
    urlg = f'https://mywebsite/customerRest/show/?id={CustId}'
    driver.get(urlg)
    soup = BeautifulSoup(driver.page_source, "lxml")
    dict_from_json = json.loads(soup.find("body").text)
    # print(dict_from_json)
    CustID = (dict_from_json['customerAddressCreateCommand']['customerId'])
    try:
        Addr = (dict_from_json['customerShowCommand']['customerAddressShowCommandSet'][0]['addressDisplayName'])
    except:
        Addr = "NaN"
    CustId = CustId + 1
If you get an IndexError (with an index of '0') it means that your list is empty. So the problem is one step earlier in the path (otherwise you'd get a KeyError if 'addressDisplayName' were missing from the dict).
You can check if the list has elements:
if dict_from_json['customerShowCommand']['customerAddressShowCommandSet']:
    # get the data
Otherwise you can indeed use try..except:
try:
    # get the data
except (IndexError, KeyError):
    # handle missing data
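Applied to the dictionary from the question, that check could look like this (keeping "NaN" as the fallback used in the other answer):

address_set = dict_from_json['customerShowCommand']['customerAddressShowCommandSet']
if address_set:
    Addr = address_set[0]['addressDisplayName']
else:
    Addr = "NaN"  # the command set list is empty, so there is no address to read

The try..except version is the same idea, just with except (IndexError, KeyError): Addr = "NaN" as the handler.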
I want to find the title, address, and price of some items in an online mall.
But sometimes the address is empty and my code breaks (below is only the Selenium part).
num = 1
while 1:
    try:
        title = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/span').text
        datas_title.append(title)
        address = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/div/p[2]').text
        datas_address.append(address)
        price = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/p').text
        datas_price.append(price)
        print('crowling....num = ' + str(num))
        num = num + 1
    except Exception as e:
        print("finish get data...")
        break

print(datas_title)
print(datas_address)
print(datas_price)
What should I do if the address is empty? Just ignore it and move on to the next items?
Use this so you can skip the entries with missing information:
num = 1
while 1:
    try:
        title = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/span').text
        datas_title.append(title)
        address = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/div/p[2]').text
        datas_address.append(address)
        price = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/p').text
        datas_price.append(price)
        print('crowling....num = ' + str(num))
        num = num + 1
    except:
        print("an error was encountered")
        continue
print(datas_title)
print(datas_address)
print(datas_price)
address = browser.find_elements_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/div/p[2]')
if not address:
    address = "None"
else:
    address = address[0].text
datas_address.append(address)
You could use find_elements to check whether the result is empty and then proceed with either value. You can then encapsulate this into a function, pass it the xpath and the target data list, and your code becomes reusable.
I think you need to first check that the web element returned isn't None, and then proceed with fetching the text.
You could write a function for it, and catch that exception in it.
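A sketch of the helper-function idea mentioned above; this version uses find_elements so no exception handling is needed, and the function name and default value are just examples:

def get_text_or_default(browser, xpath, default="None"):
    # find_elements returns an empty list instead of raising when nothing matches
    elements = browser.find_elements_by_xpath(xpath)
    return elements[0].text if elements else default

# inside the loop from the question:
address = get_text_or_default(browser, '//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/div/p[2]')
datas_address.append(address)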
from typing import Text, final
from bs4 import BeautifulSoup
import requests

source = requests.get("https://www.nytimes.com/interactive/2021/us/covid-cases.html").text
soup = BeautifulSoup(source, "lxml")

states = soup.find("tbody", class_="children").find_all("tr")
# print(state.prettify())

for state in states:
    # determining the name of the state
    name = state.a.text
    final_name = ""
    for character in name:
        if character in "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ":
            final_name += character
    print(final_name)

    # finding the daily number of cases on average
    try:
        daily_cases_avg = state.find("td", class_="bignum cases show-mobile").text
    except Exception as e:
        daily_cases_avg = None
    print(daily_cases_avg)

    # finding the number of cases per 100,000
    try:
        num_cases_per_hunThous = state.find("td", class_="num cases show-mobile").text
    except Exception as e:
        num_cases_per_hunThous = None
    print(num_cases_per_hunThous)

    # finding the percent change over the past 14 days
    try:
        pct_change_cases_14 = state.find("td", class_="chart cases wider td-end show-mobile").span.text
    except Exception as e:
        pct_change_cases_14 = None
    print(pct_change_cases_14)

    # daily average of the number of people hospitalized
    try:
        daily_hos_avg = state.find_all("td", class_="bignum")[1].text
    except Exception as e:
        daily_hos_avg = None
    print(daily_hos_avg)

    # number of people hospitalized per 100,000
    try:
        num_hos_hunThous = state.find_all("td", class_="num")[1].text
    except Exception as e:
        num_hos_hunThous = None
    print(num_hos_hunThous)

    # percent change in the number of hospitalized people over the past 14 days
    try:
        pct_change_hos_14 = state.find("td", class_="num td-end").text
    except Exception as e:
        pct_change_hos_14 = None
    print(pct_change_hos_14)

    # daily average of deaths
    try:
        daily_death_avg = state.find_all("td", class_="bignum")[2].text
    except Exception as e:
        daily_death_avg = None
    print(daily_death_avg)

    # number of deaths per 100,000
    try:
        deaths_hunThous = state.find_all("td", class_="num td-end")[1].text
    except Exception as e:
        deaths_hunThous = None
    print(deaths_hunThous)

    # percent of people fully vaccinated
    try:
        pct_vac = state.find("td", class_="num vax td-end").text
    except Exception as e:
        pct_vac = None
    print(pct_vac)
All I am trying to do is scrape COVID-19 data off of the New York Times. I am a beginner, so I am just using this as a way to learn how to scrape websites efficiently. However, the scrape only picks up the states that show up prior to a dropdown.
On the website, after the state of Illinois, there is a button "Show all." The states that appear after clicking that button are not getting scraped for data, so I was wondering how I can get past that to get data for all of the states.
If you open developer tools and go to the Network tab you can see all of the requests the page is sending. I found one request it sends: https://static01.nyt.com/newsgraphics/2021/coronavirus-tracking/data/counties.json
The response contains an entry for every county. Each element in the JSON object also contains a link to another NYT page with the average cases for that individual county.
It would be more complicated, but you could write a script that goes through each of these counties and scrapes the data, then adds up the average cases for each state based on its individual counties.
That is what I would do.
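A rough sketch of that approach is below. Note that the key names ("state", "avg_cases") are only guesses about the JSON structure, so print one element of the response first and adjust them to the real field names before relying on the totals:

import requests
from collections import defaultdict

resp = requests.get(
    "https://static01.nyt.com/newsgraphics/2021/coronavirus-tracking/data/counties.json"
)
counties = resp.json()  # assumed to be a list of per-county records

# "state" and "avg_cases" are assumed field names; check counties[0] for the real keys
cases_by_state = defaultdict(float)
for county in counties:
    cases_by_state[county["state"]] += county["avg_cases"]

for state_name, avg_cases in sorted(cases_by_state.items()):
    print(state_name, avg_cases)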
I have a loop inside a loop and I'm using try/except. Once I get an error, the try/except works fine, but the loop continues to the next value. What I need is for the loop to start again from the same value where it failed rather than continuing to the next one. How can I do that with my code? (In other languages, e.g. C++, it would be i--.)
for
r = urllib2.urlopen(url)
encoding = r.info().getparam('charset')
html = r.read()
c = td.find('a')['href']
urls = []
urls.append(c)
# collecting urls from first page then from those urls collecting further info in below loop
for abc in urls:
    try:
        r = urllib2.urlopen(abc)
        encoding = r.info().getparam('charset')
        html = r.read()
    except Exception as e:
        last_error = e
        time.sleep(retry_timeout)  # here is the problem: once I get an error it switches to the next value
I need a more pythonic way to do this.
Waiting for a reply. Thank you.
Unfortunately, there is no simple way to go back with an iterator in Python:
http://docs.python.org/2/library/stdtypes.html
You might be interested in this Stack Overflow thread:
Making a python iterator go backwards?
For your particular case, I would use a simple while loop:
url = []
i = 0
while i < len(url):  # url is the list containing all the urls; it keeps growing as it is updated every day
    data = url[i]
    try:
        # getting data from there
        i += 1
    except:
        # on error, do not increment i, so the loop retries from the same position
        pass
The problem with the way you want to handle this is that you risk going into an infinite loop. For example, if a link is broken, r = urllib2.urlopen(abc) will always raise an exception and you will always stay at the same position. You should consider doing something like this:
r = urllib2.urlopen(url)
encoding = r.info().getparam('charset')
html = r.read()
c = td.find('a')['href']
urls = []
urls.append(c)

# collecting urls from first page then from those urls collecting further info in below loop
NUM_TRY = 3
for abc in urls:
    for _ in range(NUM_TRY):
        try:
            r = urllib2.urlopen(abc)
            encoding = r.info().getparam('charset')
            html = r.read()
            break  # if we arrive at this line, no error occurred, so we don't need to retry again;
                   # this is why we break the inner loop
        except Exception as e:
            last_error = e
            time.sleep(retry_timeout)  # wait before retrying the same url
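If you want to avoid repeating the retry boilerplate for every url, the same idea can be wrapped in a small helper. This is just a sketch around the urllib2 calls from the question; the function name is made up:

import time
import urllib2

def fetch_with_retries(address, num_try=3, retry_timeout=5):
    last_error = None
    for _ in range(num_try):
        try:
            r = urllib2.urlopen(address)
            return r.read()              # success: stop retrying and return the html
        except Exception as e:
            last_error = e
            time.sleep(retry_timeout)    # wait before retrying the same url
    raise last_error                     # still failing after all attempts

for abc in urls:
    html = fetch_with_retries(abc)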