I am using the Selenium WebDriver to open URLs in a for loop. Once a URL opens, the script appends get_file_total to the list file_total_l. How can I check that the value in 'missing_amount' appears on the web page before appending get_file_total to file_total_l?
What's happening:
If I run the script twice, it inserts file_total into my table a second time. The missing amount is 165,757.06 from my table, so why isn't that being inserted?
print(icl_dollar_amount):
['627,418.07', '6,986,500.57', '165,757.06']
print(missing_amount[i])
'165,757.06'
code:
missing_amount = []
for rps_amount2 in rps_amount_l:
    if rps_amount2 not in bpo_file_total_l:
        rps_table_q_2 = f"""select * from rps..sendfile where processingdate = '{cd}' and datasetname like '%ICL%' and paymenttotamt = '{rps_amount2}' """
        rps_table_results = sql_server_cursor.execute(rps_table_q_2).fetchall()
        file_missing = True
        for rps in rps_table_results:
            rps_amount_f = str(rps[18]).rstrip('0')
            rps_amount_f = "{:,}".format(float(rps_amount_f))
            missing_amount.append(rps_amount_f)

file_total_l:

for link in url_list:
    print(link)
    options = Options()
    browser = webdriver.Chrome(options=options,
                               executable_path=r'\\test\user$\test\Documents\driver\chromedriver.exe')
    browser.get(link)
    body = browser.find_element_by_xpath("//*[contains(text(), 'Total:')]").text
    body_l.append(body)
    icl_dollar_amount = re.findall('(?:[\£\$\€]{1}[,\d]+.?\d*)', body)[0].split('$', 1)[1]
    icl_dollar_amount_l.append(icl_dollar_amount)

if not missing_amount:
    logging.info("List is empty")
    print("List is empty")

count = 0
for i in range(len(missing_amount)):
    if missing_amount[i] in icl_dollar_amount_l:
        body = body_l[i]
        get_file_total = re.findall('(?:[\£\$\€]{1}[,\d]+.?\d*)', body)[0].split('$', 1)[1]
        file_total_l.append(get_file_total)
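One way to add that check (a minimal sketch, not a drop-in fix): scrape the total from each page first and only append it when it matches one of the missing amounts. This assumes missing_amount is fully built before the URL loop runs and that both sides use the same '1,234.56' formatting.

for link in url_list:
    browser.get(link)
    body = browser.find_element_by_xpath("//*[contains(text(), 'Total:')]").text
    get_file_total = re.findall('(?:[\£\$\€]{1}[,\d]+.?\d*)', body)[0].split('$', 1)[1]
    # Only store the page total when it is one of the amounts known to be missing.
    if get_file_total in missing_amount:
        file_total_l.append(get_file_total)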
Related
Very new to Python and Selenium, looking to scrape a few data points. I'm struggling in three areas:
I don't understand how to loop through multiple URLs properly
I can't figure out why the script is iterating twice over each URL
I can't figure out why it's only outputting the data for the second URL
Many thanks for taking a look!
Here's my current script:
urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

driver = webdriver.Chrome(executable_path='/Library/Frameworks/Python.framework/Versions/3.9/bin/chromedriver')

for url in urls:
    for page in range(0, 1):
        driver.get(url)
        wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
        df = pd.DataFrame(columns=['Title', 'Core Web Vitals', 'FCP', 'FID', 'CLS', 'TTI', 'TBT', 'Total Score'])
        company = driver.find_elements_by_class_name("audited-url__link")
        data = []
        for i in company:
            data.append(i.get_attribute('href'))
        for x in data:
            # Get URL name
            title = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[2]/h1/a')
            co_name = title.text
            # Get Core Web Vitals text pass/fail
            cwv = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[1]/span[2]')
            core_web = cwv.text
            # Get FCP
            fcp = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[1]/div')
            first_content = fcp.text
            # Get FID
            fid = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[3]/div[1]/div')
            first_input = fid.text
            # Get CLS
            cls = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[4]/div[1]/div')
            layout_shift = cls.text
            # Get TTI
            tti = driver.find_element_by_xpath('//*[@id="interactive"]/div/div[1]')
            time_interactive = tti.text
            # Get TBT
            tbt = driver.find_element_by_xpath('//*[@id="total-blocking-time"]/div/div[1]')
            total_block = tbt.text
            # Get Total Score
            total_score = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[1]/a/div[2]')
            score = total_score.text
            # Adding all columns to dataframe
            df.loc[len(df)] = [co_name, core_web, first_content, first_input, layout_shift, time_interactive, total_block, score]

driver.close()
# df.to_csv('Double Page Speed Test 9-10.csv')
print(df)
Q1: I don't understand how to loop through multiple URLs properly.
Ans: for url in urls: is the way to do it.
Q2: I can't figure out why the script is iterating twice over each URL.
Ans: Because you have for page in range(0, 1):
Update 1:
I did not run your entire code with the DataFrame. Also, sometimes one of the pages does not show the number and href, but when I run the code below,
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(50)
wait = WebDriverWait(driver, 20)

urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

data = []
for url in urls:
    driver.get(url)
    wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
    company = driver.find_elements_by_css_selector("h1.audited-url a")
    for i in company:
        data.append(i.get_attribute('href'))

print(data)
this is the output:
['https://www.crutchfield.com//', 'https://www.lastpass.com/', 'https://www.lastpass.com/']
which is correct, because the element locator we used matches 1 element on page 1 and 2 elements on page 2.
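If you only want one href per URL, here is a minimal sketch (reusing the driver and imports from the snippet above, and assuming the first audited-url link on each page is the one you care about):

data = []
for url in urls:
    driver.get(url)
    WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
    links = driver.find_elements_by_css_selector("h1.audited-url a")
    # Keep only the first matching link per page, even if the locator matches more.
    if links:
        data.append(links[0].get_attribute('href'))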
I want to loop through the product links on all the pages; after collecting them, the driver should open every product link and scrape its data. For me, only one product opens and the script does not move on to the next product link. Help me with this. Thanks in advance.
import xlwt
from selenium import webdriver
import re
import time
from datetime import date

class expert:
    def __init__(self):
        self.url = 'https://expert938.expertonline.it/dm-IT-it/Vendita_Smartphone_W8D.aspx'
        self.country = 'IT'
        self.currency = 'euro'
        self.VAT = 'Included'

    def experts(self):
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1', cell_overwrite_ok=True)
        ws.write(0, 0, "Product_Url")
        ws.write(0, 1, "Product_Manufacturer")
        ws.write(0, 2, "Product_Model")
        ws.write(0, 3, "Product_color")
        ws.write(0, 4, "Product_memory")
        ws.write(0, 5, "Product_Price")
        ws.write(0, 6, "Currency")
        ws.write(0, 7, "VAT")
        ws.write(0, 8, "Shipping Cost")
        ws.write(0, 9, "Country")
        ws.write(0, 10, "Date")
        wb.save(r"C:\Users\Karthick R\Desktop\VS code\expert938.xls")
        driver = webdriver.Chrome()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        driver.maximize_window()
        while True:
            containers = []
            flag = False
            containers = driver.find_elements_by_css_selector('div[class="col-xs-12 skywalker_riga skywalker_riga_articolo"]')
            for container in containers:
                url = container.find_element_by_css_selector('div[class="text-center relative-container"]')
                urls = url.find_element_by_tag_name('a').get_attribute('href')
                product_links = []
                # print(urls)
                product_links.append(urls)
                print(product_links)
                for links in product_links:
                    driver.get(links)
                    time.sleep(10)
            break

expertit = expert()
expertit.experts()
Try changing from
url = container.find_element_by_css_selector('div[class="text-center relative-container"]')
to
url = container.find_element_by_xpath('.//div[@class="text-center relative-container"]')
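The leading .// is what scopes the search: an XPath that starts with // searches the whole document even when called on container, while .// starts from the container element itself.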
I tried the below code with a little tweak:
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(30)
driver.get("https://expert938.expertonline.it/dm-IT-it/Vendita_Smartphone_W8D.aspx")
wait = WebDriverWait(driver, 20)

containers = driver.find_elements_by_xpath("//div[@class='col-xs-12 skywalker_riga skywalker_riga_articolo']")
i = 1
product_links = []
for container in containers:
    url = container.find_element_by_xpath(f"(//div[@class='col-xs-12 skywalker_riga skywalker_riga_articolo']/descendant::div[contains(@class,'relative-container')]/a)[{i}]").get_attribute('href')
    i = i + 1
    product_links.append(url)
print(product_links)
and got this output:
['https://expert938.expertonline.it/dm-IT-it/iPhone-11-64GB-2020-_EXP729191.aspx',
'https://expert938.expertonline.it/dm-IT-it/iPhone-11-64GB-2020-_EXP729190.aspx',
'https://expert938.expertonline.it/dm-IT-it/iPhone-11-64GB-2020-_EXP729189.aspx',
'https://expert938.expertonline.it/dm-IT-it/OPPO-A16CRYSTALBLACK_EXP754142.aspx',
'https://expert938.expertonline.it/dm-IT-it/OPPO-A16PEARLBLUE_EXP754143.aspx',
'https://expert938.expertonline.it/dm-IT-it/C20232DARKBLUE_EXP751967.aspx',
'https://expert938.expertonline.it/dm-IT-it/POWER-U20_EXP749758.aspx',
'https://expert938.expertonline.it/dm-IT-it/POWER-U10_EXP749759.aspx',
'https://expert938.expertonline.it/dm-IT-it/POWER-U10_EXP749761.aspx',
'https://expert938.expertonline.it/dm-IT-it/POWER-U10_EXP749760.aspx',
'https://expert938.expertonline.it/dm-IT-it/POWER-U20_EXP749757.aspx',
'https://expert938.expertonline.it/dm-IT-it/REDMI-NOTE-10-5G-128GB-CHROME-SILVER_EXP748139.aspx',
'https://expert938.expertonline.it/dm-IT-it/X60-PRO_EXP747150.aspx',
'https://expert938.expertonline.it/dm-IT-it/Y72_EXP747155.aspx',
'https://expert938.expertonline.it/dm-IT-it/V21_EXP747153.aspx',
'https://expert938.expertonline.it/dm-IT-it/X60-PRO_EXP747151.aspx',
'https://expert938.expertonline.it/dm-IT-it/V21_EXP747152.aspx',
'https://expert938.expertonline.it/dm-IT-it/Y72_EXP747154.aspx',
'https://expert938.expertonline.it/dm-IT-it/MI-11I-8-256GB_EXP746932.aspx',
'https://expert938.expertonline.it/dm-IT-it/XIAOMI-MI-11-LITE-5G-128GB_EXP746078.aspx',
'https://expert938.expertonline.it/dm-IT-it/iPhone-12-mini-64GB_EXP745681.aspx',
'https://expert938.expertonline.it/dm-IT-it/iPhone-12-64GB_EXP745686.aspx',
'https://expert938.expertonline.it/dm-IT-it/REDMI-NOTE-10-5G-128GB-NIGHTTIME-BLUE_EXP745322.aspx',
'https://expert938.expertonline.it/dm-IT-it/REDMI-NOTE-10S-6-128GB_EXP744826.aspx',
'https://expert938.expertonline.it/dm-IT-it/REDMI-NOTE-10S-6-128GB_EXP744824.aspx',
'https://expert938.expertonline.it/dm-IT-it/M11-LITE-5G-128GB-VERDE_EXP744822.aspx',
'https://expert938.expertonline.it/dm-IT-it/REDMI-NOTE-10S-6-128GB_EXP744825.aspx',
'https://expert938.expertonline.it/dm-IT-it/8-6-128GB-5G-BLACK_EXP744698.aspx',
'https://expert938.expertonline.it/dm-IT-it/C21_EXP744696.aspx',
'https://expert938.expertonline.it/dm-IT-it/REALME-8_EXP744695.aspx',
'https://expert938.expertonline.it/dm-IT-it/8-6-128GB-5G-BLUE_EXP744699.aspx',
'https://expert938.expertonline.it/dm-IT-it/A74-Prism-Black_EXP742935.aspx',
'https://expert938.expertonline.it/dm-IT-it/A74-Midnight-Blue_EXP742936.aspx',
'https://expert938.expertonline.it/dm-IT-it/Y62-W-K610-LIGHT-BLUE_EXP742397.aspx',
'https://expert938.expertonline.it/dm-IT-it/Y62-W-K610-DARK-BLUE_EXP742396.aspx',
'https://expert938.expertonline.it/dm-IT-it/Y62-W-K610-_EXP742398.aspx']
Process finished with exit code 0
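To make the driver actually visit every product, one option (a sketch, assuming the per-product scraping happens inside the loop) is to navigate only after the collection loop above has finished, so none of the container elements go stale:

for link in product_links:
    driver.get(link)   # navigate only after all hrefs have been collected
    time.sleep(5)      # crude wait; an explicit WebDriverWait would be more robust
    # ... scrape the product fields here ...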
I am writing a Python script. First, it visits this website; then it clicks the arrow on the right side and goes to the new web page to collect some data. Finally, it goes back to the previous page and does the same thing with the next item.
Web page : https://register.fca.org.uk/s/search?q=capital&type=Companies
This is the code.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
import time

url = 'https://register.fca.org.uk/s/search?q=capital&type=Companies'
service = Service('link to come driver')
service.start()
driver = webdriver.Remote(service.service_url)
driver.get(url)
time.sleep(12)

divs = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
for d in divs:
    RN = ''
    companyName = ''
    companyName = d.find_element_by_tag_name('h2').text
    RNData = d.find_element_by_xpath('.//div[@class="result-card_figure-offset"]').text
    RN = RNData.split(':')[1].strip()
    d.click()
    time.sleep(12)
    phoneNumber = ''
    phoneNumberData = driver.find_elements_by_xpath('//*[@id="who-is-this-details-content"]/div[1]/div[2]/div[2]/div/div/div[2]')
    phoneNumber = phoneNumberData[0].text.split('\n')[1]
    print(RN)
    print(companyName)
    print(phoneNumber)
    driver.execute_script("history.back();")
It gives me this error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
How can I solve this problem?
Here's a quick and dirty way to avoid that error; change your code like this:
url = 'https://register.fca.org.uk/s/search?q=capital&type=Companies'
driver.get(url)
time.sleep(12)

divs = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
for i in range(len(divs)):
    time.sleep(4)
    d = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
    RN = ''
    companyName = ''
    companyName = d[i].find_element_by_tag_name('h2').text
    RNData = d[i].find_element_by_xpath('.//div[@class="result-card_figure-offset"]').text
    RN = RNData.split(':')[1].strip()
    d[i].click()
    time.sleep(12)
    phoneNumber = ''
    phoneNumberData = driver.find_elements_by_xpath('//*[@id="who-is-this-details-content"]/div[1]/div[2]/div[2]/div/div/div[2]')
    phoneNumber = phoneNumberData[0].text.split('\n')[1]
    print(RN)
    print(companyName)
    print(phoneNumber)
    driver.execute_script("window.history.go(-1)")
What I am trying to do is get the asin attribute from a div element in the HTML, then concatenate amazon.com/dp/ + asin to form a URL to visit. The divs have no id but are identified by the data-index="1" attribute, so I am wondering how to select that div and then read its asin attribute. Thanks for reading.
Using Python 3.7 and Selenium WebDriver.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()

email = ('.')
password = ('.')
query = ('macbook')

urls = []
prices = []
names = []
descs = []

def search_amazon(query):
    driver.get('https://amazon.com/')
    searchBox = driver.find_element_by_id('twotabsearchtextbox')
    time.sleep(2)
    searchBox.send_keys(query)
    searchBox.send_keys(Keys.ENTER)
    time.sleep(3)
    firstResult = driver.find_element_by_name('data-index="1"')
    asin = firstResult.getAttribute('data-asin')
    print(asin)
    url = 'https://amazon.com/dp/' + asin
    driver.get(url)
    print(url)
    return url

search_amazon(query)
You need to replace these two lines of code with the code I have provided below.
firstResult = driver.find_element_by_name('data-index="1"')
asin = firstResult.getAttribute('data-asin')
data-index is not the element's name; it is an attribute, so you can use the following CSS selector instead.
firstResult = driver.find_element_by_css_selector('div[data-index="1"]>div')
asin = firstResult.get_attribute('data-asin')
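Note the second change as well: in the Python bindings the method is get_attribute (snake_case). getAttribute is the Java/JavaScript spelling and would raise an AttributeError here.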
Here is the working code.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()

email = ('.')
password = ('.')
query = ('macbook')

urls = []
prices = []
names = []
descs = []

def search_amazon(query):
    driver.get('https://amazon.com/')
    searchBox = driver.find_element_by_id('twotabsearchtextbox')
    time.sleep(2)
    searchBox.send_keys(query)
    searchBox.send_keys(Keys.ENTER)
    time.sleep(3)
    firstResult = driver.find_element_by_css_selector('div[data-index="1"]>div')
    asin = firstResult.get_attribute('data-asin')
    print(asin)
    url = 'https://amazon.com/dp/' + asin
    driver.get(url)
    print(url)
    return url

search_amazon(query)
Take a look at this site: https://www.arabam.com/ilan/sahibinden-satilik-mercedes-benz-cla-180-d-style/sahibinden-boyasiz-hasarsiz-cam-tavan-temiz-arac/14229201
I press the End key so that the page scrolls to the bottom, and then the script presses the Up key one step at a time until it finds the element targeted by the XPath below.
It was working just fine, but it doesn't seem to be working anymore.
options.add_argument('window-size=1200x600')
prefs = {"profile.default_content_setting_values.geolocation": 2,
         "profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
d = webdriver.Chrome(chrome_options=options,
                     executable_path='./chromedriver')
d.get(features["ad_url"])
# Use send_keys(Keys.HOME) to scroll up to the top of page
d.find_element_by_tag_name('body').send_keys(Keys.END)
while True:
    d.find_element_by_tag_name('body').send_keys(Keys.UP)
    time.sleep(1)
    e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
    if e.text:
        break
Here is a fully functional script to try:
import json
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.crawler import CrawlerProcess
from selenium import webdriver
from datetime import datetime
from selenium.webdriver.common.keys import Keys
import pickle
import time

class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml', 'https://www.arabam.com/sitemap/otomobil_2.xml',
                    'https://www.arabam.com/sitemap/otomobil_3.xml', 'https://www.arabam.com/sitemap/otomobil_4.xml',
                    'https://www.arabam.com/sitemap/otomobil_5.xml', 'https://www.arabam.com/sitemap/otomobil_6.xml',
                    'https://www.arabam.com/sitemap/otomobil_7.xml', 'https://www.arabam.com/sitemap/otomobil_8.xml',
                    'https://www.arabam.com/sitemap/otomobil_9.xml', 'https://www.arabam.com/sitemap/otomobil_10.xml',
                    'https://www.arabam.com/sitemap/otomobil_11.xml', 'https://www.arabam.com/sitemap/otomobil_12.xml',
                    'https://www.arabam.com/sitemap/otomobil_13.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
    ]
    custom_settings = {'FEED_FORMAT': 'csv',
                       'FEED_URI': "arabam_" + str(datetime.today().strftime('%d%m%y')) + '.csv'}

    def parse(self, response):
        for td in response.xpath("/html/body/div[3]/div[6]/div[4]/div/div[2]/table/tbody/tr/td[4]/div/a"):
            link = td.xpath("@href").extract()
            year = td.xpath("text()").extract()
            self.crawled.append(link[0])
            self.new_links += 1
            if int(year[0]) > 2010:
                url = "https://www.arabam.com/" + link[0]
                yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        features = {}
        options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        options.add_argument('window-size=1200x600')
        prefs = {"profile.default_content_setting_values.geolocation": 2,
                 "profile.default_content_setting_values.notifications": 2}
        options.add_experimental_option("prefs", prefs)
        d = webdriver.Chrome(chrome_options=options,
                             executable_path='./chromedriver')
        d.get(features["ad_url"])
        # Use send_keys(Keys.HOME) to scroll up to the top of page
        d.find_element_by_tag_name('body').send_keys(Keys.END)
        while True:
            d.find_element_by_tag_name('body').send_keys(Keys.UP)
            time.sleep(1)
            e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
            if e.text:
                break
        overview1 = e.text.split("\n")
        yield features

process = CrawlerProcess({
})
process.crawl(Myspider)
process.start()  # the script will block here until the crawling is finished
Edit:
I commented things out and ran the code, and it turns out the keys are being sent. The problem is with finding the specific div. I tried putting a try/except around it, but that doesn't seem to work either.
while True:
    d.find_element_by_tag_name('body').send_keys(Keys.UP)
    time.sleep(1)
    try:
        e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
        if e.text:
            break
    except:
        pass
Edit:
This is what I did to scroll up, but unfortunately it doesn't work in most cases:
for i in range(0, 37):
    d.find_element_by_tag_name('body').send_keys(Keys.UP)
    time.sleep(1)

e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div[2]/div")
overview1 = e.text.split("\n")
Edit:
I tried this. It scrolls into view but doesn't get the element:
e = d.find_element_by_xpath("//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")
actions = ActionChains(d)
actions.move_to_element(e).perform()
wait = WebDriverWait(d, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")))
overview1 = e.text.split("\n")
Edit:
[Screenshot of the HTML]
Adding this as an answer, as it is a bit lengthy for a comment.
First, you need to wait for the element to appear, and only then find it and extract the values. In your code, the element find is done before the visibility check.
Another thing you can try is to scroll to the specific element before extracting the values. This particular table seems to load its values only when it is in the viewport.
wait = WebDriverWait(d, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")))
e = d.find_element_by_xpath("//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")
actions = ActionChains(d)
actions.move_to_element(e).perform()
# Scroll to the element
d.execute_script("arguments[0].scrollIntoView(true);", e)
# Check what is the actual text value you get
print(e.text)
print(e.text.split("\n"))
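As a side note, EC.visibility_of_element_located returns the element it waited for, so the wait and the find can be collapsed into one call; a minimal sketch with the same locator:

wait = WebDriverWait(d, 20)
e = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")))
d.execute_script("arguments[0].scrollIntoView(true);", e)
print(e.text.split("\n"))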