My spider scrapes the data from page 1 only and never moves on to the second page. I have tried different approaches but have not been able to solve the problem. If there is a solution, please share it. This is the page URL: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy.http import Request
from selenium import webdriver


class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def __init__(self):
        self.driver = webdriver.Chrome(
            r'C:\Program Files (x86)\chromedriver.exe')

    def parse_book(self, response):
        self.driver.get(
            "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx")
        title = response.xpath(
            "//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath(
            "//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
max_page_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)
max_page = int(max_page_el.text.split("din").pop().split(")")[0])

# test with a smaller number
for i in range(1, 4):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    # scrape here
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
    )
    # just an example
    print(elements)
driver.quit()
This still gives me the output of only one page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up your driver, open the page, etc.

max_page_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)
# you can use a regex here
max_page = int(max_page_el.text.split("din").pop().split(")")[0])

# test with a smaller number first
for i in range(1, max_page):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    # scrape here
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
    )
    # just an example
    print(elements)

driver.quit()
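If you prefer the regex route mentioned in the comment, here is a minimal sketch, assuming the pager label reads something like "Pagina 1 din 253)", which is what the split on "din" and ")" above implies:

import re

# pull the number that follows "din" in the pager label (assumed format)
match = re.search(r"din\s*(\d+)", max_page_el.text)
max_page = int(match.group(1)) if match else 1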
I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
1. go to the webpage: url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
2. insert the ISIN in the search bar
3. select the autocomplete result from the container msci-ac-search-data-dropdown (click)
4. read the value from the div with class "ratingdata-outercircle esgratings-profile-header-green" to get the text "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
So far I have tried the following:

import requests
from bs4 import BeautifulSoup

isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'

soup = BeautifulSoup(requests.get(url).content, 'html.parser')

payload = {}
for i in soup.select('form[action="https://www.msci.com/search"] input[value]'):
    payload[i['name']] = i['value']
payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup

isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "X-Requested-With": "XMLHttpRequest",
}

# first request: search the fund by ISIN
params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}
data = requests.get(url, params=params, headers=headers).json()

# second request: load the fund's rating profile
params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "showEsgFundRatingsProfile",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_fundShareClassId": data[0]["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
    data[0]["encodedTitle"], data[0]["url"]
)

soup = BeautifulSoup(
    requests.get(url, params=params, headers=headers).content, "html.parser"
)

data = soup.select_one(".ratingdata-fund-rating")["class"]
print(data)
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
When you press Enter, the page sends another request, which already returns the search result. Here is an example of how to get what you want:
import requests

isin = 'IE00B4L5Y983'
url = f"https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings?p_p_id=esg_fund_ratings_profile&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchFundRatingsProfiles&p_p_cacheability=cacheLevelPage&_esg_fund_ratings_profile_keywords={isin}"

for title in requests.get(url).json():
    print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
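If you need more than the title, the same JSON objects also expose the url and encodedTitle fields that the requests-based answer above feeds into the showEsgFundRatingsProfile resource to fetch the rating itself. A minimal sketch of inspecting them:

hit = requests.get(url).json()[0]
# 'url' and 'encodedTitle' are the fields reused by the two-request answer above
print(hit['title'], hit['url'], hit['encodedTitle'])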
If I may: from the OP's description I can only infer that this is either an education-related test or a job-interview-related test. As such, following the exact instructions is paramount, and to follow said instructions you can only use Selenium. The following code will work to the letter and get the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
browser.get(url)
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords'))).send_keys('IE00B4L5Y983')
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.ID, 'ui-id-1')))
result = browser.find_element(By.ID, "ui-id-1")
result.click()
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
result = browser.find_element(By.CLASS_NAME, "esgratings-profile-header-green").find_element(By.TAG_NAME, "div").get_attribute('class')
print(result)
browser.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa
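As a small follow-up, if you only want the rating letter, you can split the class string shown above (ratingdata-fund-rating esg-fund-ratings-circle-aaa):

rating = result.split('-')[-1]  # 'aaa'
print(rating.upper())           # AAA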
I'm trying to write to a CSV file and am only getting one column, with the company names. Does anyone know how I can write to a CSV file and get all the data with column headings?
Printed Output
'Agilent Technologies\n6,319\n2,912\n441\n1,619\n321\n189\n189\n1,347\n81\n236\n1,210\n19.2%\n307', 'Alcoa\n12,152\n9,153\n31\n227\n664\n390\n390\n2,039\n195\n19\n429\n3.5%\n190',
Current Output after writing to CSV
Agilent Technologies
Alcoa
Desired Output after writing to CSV
Full Code
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
import requests
import csv

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

ser = Service("./chromedriver.exe")
browser = driver = webdriver.Chrome(service=ser)

driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})

wait = WebDriverWait(driver, 30)
driver.get("https://stockrover.com")
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/section[2]/div/ul/li[2]"))).click()

user = driver.find_element(By.NAME, "username")
password = driver.find_element(By.NAME, "password")
user.clear()
user.send_keys("******")
password.clear()
password.send_keys("*******")
driver.find_element(By.NAME, "Sign In").click()

wait = WebDriverWait(driver, 30)
stocks_list = []
try:
    while True:
        # Collect the row text from the current page
        stocks_list.extend([my_elem.text for my_elem in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table[id^='gridview-1072-record']")))])
        # Click on the next-page button
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="button-1157"]'))).click()
except:
    print("Next button disabled")

print(stocks_list)  # Prints entire list of stocks
df = pd.DataFrame(stocks_list)
df.to_csv('table.csv')
You can just use the csv module; at the end, add:
stocks_lists = [x.split('\n') for x in stocks_list]
with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    for row in stocks_lists:
        csv_write.writerow(row)
Full Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
import requests
import csv

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome('G://chromedriver.exe')

driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})

wait = WebDriverWait(driver, 30)
driver.get("https://stockrover.com")
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/section[2]/div/ul/li[2]"))).click()

user = driver.find_element(By.NAME, "username")
password = driver.find_element(By.NAME, "password")
user.clear()
user.send_keys("********")
password.clear()
password.send_keys("********")
driver.find_element(By.NAME, "Sign In").click()

wait = WebDriverWait(driver, 30)
stocks_list = []
try:
    while True:
        # Collect the row text from the current page
        stocks_list.extend([my_elem.text for my_elem in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table[id^='gridview-1072-record']")))])
        # Click on the next-page button
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="button-1157"]'))).click()
except:
    print("Next button disabled")

# print(stocks_list)  # Prints entire list of stocks
stocks_lists = [x.split('\n') for x in stocks_list]
with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    for row in stocks_lists:
        csv_write.writerow(row)
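If you also want column headings in the file, as the question asks, you can write a header row once before the data rows. The names below are only a sketch taken from the pandas answer further down; adjust them to match your table:

# column names are assumptions borrowed from the pandas-based answer below
columns = ['Company', 'Sales', 'Cost of Sales', 'R&D', 'SG&A', 'Depr', 'Capex',
           'Maint Capex', 'Op income', 'Int Exp.', 'Dividends', 'Net Inc.',
           'Net Margin', 'Diluted Shrs']

stocks_lists = [x.split('\n') for x in stocks_list]
with open('output.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    csv_write.writerow(columns)        # header row
    csv_write.writerows(stocks_lists)  # data rows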
Assuming that this is your list of strings:
string_sock = ["Agilent Technologies\n6,319\n2,912\n441\n1,619\n321\n189\n189\n1,347\n81\n236\n1,210\n19.2%\n307", "Alcoa\n12,152\n9,153\n31\n227\n664\n390\n390\n2,039\n195\n19\n429\n3.5%\n190"]
Your columns:
columns = ['Company', 'Sales', 'Cost of Sales', 'R&D', 'SG&A', 'Depr', 'Capex', 'Maint Capex', 'Op income', 'Int Exp.', 'Dividends', 'Net Inc.', 'Net Margin', 'Diluted Shrs']
Split the input on the separator and write the CSV:
splitted_rows = [v.split("\n") for v in string_sock]
pd.DataFrame(splitted_rows, columns=columns).to_csv('test.csv')
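If the automatic integer index column is not wanted in the output file, pandas can drop it:

pd.DataFrame(splitted_rows, columns=columns).to_csv('test.csv', index=False)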
I want to learn how to get information from dynamically generated fields. When I tried simple sites, everything worked. Then I decided to try something more difficult, and now I can't figure it out. It took me about two weeks to cross out, one by one, the solution options I found on the Internet.
Now I'm not sure that I can get information that appears on sites this way. Most likely I'm doing something wrong, but I can't come up with a new idea for how to do it, so I decided to ask here. Perhaps there are those who understand this and can give me a hint. If so, please give me an example.
The site I use to learn - kbp.aero/en
The information I'm trying to get (arrival schedule) - .tbody .tr .td
For example, I tried:
1.
import time
import requests
from bs4 import BeautifulSoup

URL = 'https://kbp.aero/en/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}

time.sleep(1)
response = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')

items = soup.find('div', class_='table_wrp out yesterday')
items = items.findAll('tr', class_='tr')

comps = []
if len(items) > 0:
    for item in items:
        comps.append({
            'title': item.find('td', class_='td').get_text(strip=True),
        })

for comp in comps:
    print(comp['title'])

# for item in items:
#     comps.append({
#         'text': item.get_text(strip=True)
#     })
#
# for comp in comps:
#     print(comp['text'])
2.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def main():
    driver = webdriver.Chrome()
    driver.get("https://kbp.aero/en/")
    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.text_to_be_present_in_element((By.CLASS_NAME, 'tbody'), ''))
    tds = element.find_elements(By.CLASS_NAME, "td")
    for td in tds:
        print(td.text)

    # try:
    #     element = WebDriverWait(driver, 10).until(
    #         EC.presence_of_element_located((By.CLASS_NAME, "tbody"))
    #     )
    #     tds = element.find_elements(By.CLASS_NAME, "td")
    #     for td in tds:
    #         print(td.text)
    #
    # finally:
    #     driver.quit()
Thanks for any advice.
This will fetch the entire table data:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
PATH = r"chromedriverexe path"
driver = webdriver.Chrome(PATH)
driver.get("https://kbp.aero/en/")
driver.maximize_window()
sleep(3)
print(driver.find_element(By.CSS_SELECTOR, "div.table_wrp.out.today > table").text)
Output:
Рейс Час Призначення Перевізник Термінал Гейт Статус (Flight Time Destination Carrier Terminal Gate Status)
TK 1256 15:05 Istanbul Turkish Airlines D D5 Boarding Completed
PS 9556 15:05 Istanbul Ukraine International Airlines D D5 Boarding Completed
7W 163 15:10 Lviv Wind Rose D D19 Boarding
FR 3167 15:10 Warsaw Ryanair D D9 Boarding
PS 9013 15:15 Ivano-Frankivsk Ukraine International Airlines D D18 Boarding
7W 113 15:15 Ivano-Frankivsk Wind Rose D D18 Boarding
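If you want the rows as structured data rather than one text blob, here is a sketch of a variation, reusing the .tr/.td class names mentioned in the question and an explicit wait instead of sleep:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait for the "today" table used above, then collect each row's cells
wait = WebDriverWait(driver, 10)
table = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "div.table_wrp.out.today > table")))
rows = []
for tr in table.find_elements(By.CSS_SELECTOR, "tr.tr"):
    rows.append([td.text for td in tr.find_elements(By.CSS_SELECTOR, "td.td")])
print(rows)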
I am struggling to parse/scrape each page after clicking the Next button using Selenium. I am able to go to the second page; however, it fails after that. I'm not sure how to solve this, any suggestions?
Here is the code:
import scrapy
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class PropertyFoxSpider(scrapy.Spider):
    name = 'property_fox'
    start_urls = [
        'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
    ]

    def __init__(self):
        # path to driver
        self.driver = webdriver.Chrome('path')

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            try:
                elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
                elem.click()
                url = self.driver.current_url
                yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
            except TimeoutException:
                break

    def parse_page(self, response):
        # self.driver.get(response.url)
        for prop in response.css('div.property-item'):
            link = prop.css('a::attr(href)').get()
            banner = prop.css('div.property-figure-icon div::text').get()
            sold_tag = None
            if banner:
                banner = banner.strip()
                sold_tag = 'sold' if 'sold' in banner.lower() else None
            yield scrapy.Request(
                link,
                callback=self.parse_property,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'offering': 'buy',
                    'banners': banner,
                    'sold_tag': sold_tag,
                }},
            )

    def parse_property(self, response):
        item = response.meta.get('item')
        ...
You can wait until the URL changes and then scrape it:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

url = self.driver.current_url
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
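Here is a sketch of how that wait could slot into the parse loop from the question; the rest of the spider is assumed unchanged:

def parse(self, response):
    self.driver.get(response.url)
    while True:
        try:
            url = self.driver.current_url
            elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
            elem.click()
            # wait until the page URL actually changes before reading it
            WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
            url = self.driver.current_url
            yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
        except TimeoutException:
            break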
I'm struggling to get this code to extract the desired information from a single page.
I've tried all the usual Selenium tactics and added a time delay. Hopefully it's something simple; I'm not getting any error messages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
from time import sleep

options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
options.add_argument(f'user-agent={user_agent}')
capabilities = {'chromeOptions': {'useAutomationExtension': False}, 'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver', desired_capabilities=capabilities, options=options)

url = 'https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
sleep(3)
source_data = browser.page_source
bs_data = bs(source_data, "html.parser")

# product id
try:
    product_id = bs_data.findfindAll('span', {'class': 'pdp-main-details__product-code'})
    product_id = product_id.replace('Product code:', '').strip()
except:
    product_id = "n/a"

# image address
try:
    for image in bs_data.find("div", {"class": "s7staticimage"}):
        image_url = image.find('img')['src']
except:
    image_url = "n/a"

# product description
try:
    product_desc = bs_data.find('class', {'pdp-main-pdp-main-details__title'})
    product_desc = product_desc.get_text().strip()
except:
    product_desc = "n/a"

# product price
try:
    product_price = bs_data.find('class', {'co-product__price pdp-main-details__price'})
    product_price = product_price.get_text().strip()
except:
    product_price = "n/a"

print(url, '|', image_url, '|', product_id, '|', product_desc, '|', product_price)
browser.quit()
Any assistance is greatly appreciated.
Thanks
Since the content is dynamically generated, your soup has nothing in it to find. Selenium alone is good enough. I don't know why you treated the elements as a list, because there is only one of each on this page.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
capabilities = { 'chromeOptions': { 'useAutomationExtension': False},'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path='C:/bin/chromedriver.exe',desired_capabilities = capabilities,options=options)
url='https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
browser.implicitly_wait(15)
product_id = browser.find_element_by_class_name('pdp-main-details__product-code')
print(product_id.text)
image = browser.find_element_by_xpath("//*[@id=\"s7viewer_flyout\"]/div[1]/img[1]")
image_url = image.get_attribute('src')
print(image_url)
Output:
Product code: 410212
https://ui.assets-asda.com/dm/asdagroceries/5050854288142_T1?defaultImage=asdagroceries/noImage&resMode=sharp2&id=PqaST3&fmt=jpg&fit=constrain,1&wid=188&hei=188
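If you also need the description and price, the same Selenium approach should work. A sketch, assuming the class names from the question's BeautifulSoup attempt (pdp-main-details__title and pdp-main-details__price) are the right ones on the rendered page:

# class names below are taken from the question's attempt and are assumptions
product_desc = browser.find_element_by_class_name('pdp-main-details__title').text.strip()
product_price = browser.find_element_by_class_name('pdp-main-details__price').text.strip()
print(product_desc, '|', product_price)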