My spider scrapes the data from page 1 only and never moves on to the second page. I have tried different approaches but have not been able to solve the problem. If there is a solution, please share it. This is the page URL: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy.http import Request
from selenium import webdriver


class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def __init__(self):
        self.driver = webdriver.Chrome(
            r'C:\Program Files (x86)\chromedriver.exe')

    def parse_book(self, response):
        self.driver.get(
            "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx")
        title = response.xpath(
            "//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath(
            "//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
max_page_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)
max_page = int(max_page_el.text.split("din").pop().split(")")[0])

# test with a smaller number
for i in range(1, 4):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    # scrape here
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
    )
    # just an example
    print(elements)
driver.quit()
This still gives me the output of only one page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up your driver, open the page, etc.

max_page_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)
# you can use a regex here
max_page = int(max_page_el.text.split("din").pop().split(")")[0])

# test with a smaller number first
for i in range(1, max_page):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    # scrape here
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
    )
    # just an example
    print(elements)

driver.quit()
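If you prefer the regex route mentioned in the comment, here is a minimal sketch, assuming the pager label reads something like "Pagina 1 din 253)", which is what the split on "din" and ")" above implies:

import re

# pull the number that follows "din" in the pager label (assumed format)
match = re.search(r"din\s*(\d+)", max_page_el.text)
max_page = int(match.group(1)) if match else 1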
I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
1. go to the webpage: url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
2. insert the ISIN in the search bar
3. select the autocomplete result from the container msci-ac-search-data-dropdown (click)
4. read the value from the div with class "ratingdata-outercircle esgratings-profile-header-green" to get the text "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
So far I have tried the following:

import requests
from bs4 import BeautifulSoup

isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'

soup = BeautifulSoup(requests.get(url).content, 'html.parser')

payload = {}
for i in soup.select('form[action="https://www.msci.com/search"] input[value]'):
    payload[i['name']] = i['value']
payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup

isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "X-Requested-With": "XMLHttpRequest",
}

# first request: search the fund by ISIN
params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}
data = requests.get(url, params=params, headers=headers).json()

# second request: load the fund's rating profile
params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "showEsgFundRatingsProfile",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_fundShareClassId": data[0]["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
    data[0]["encodedTitle"], data[0]["url"]
)

soup = BeautifulSoup(
    requests.get(url, params=params, headers=headers).content, "html.parser"
)

data = soup.select_one(".ratingdata-fund-rating")["class"]
print(data)
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
When you press Enter, the page sends another request, which already returns the search result. Here is an example of how to get what you want:
import requests

isin = 'IE00B4L5Y983'
url = f"https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings?p_p_id=esg_fund_ratings_profile&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchFundRatingsProfiles&p_p_cacheability=cacheLevelPage&_esg_fund_ratings_profile_keywords={isin}"

for title in requests.get(url).json():
    print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
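If you need more than the title, the same JSON objects also expose the url and encodedTitle fields that the requests-based answer above feeds into the showEsgFundRatingsProfile resource to fetch the rating itself. A minimal sketch of inspecting them:

hit = requests.get(url).json()[0]
# 'url' and 'encodedTitle' are the fields reused by the two-request answer above
print(hit['title'], hit['url'], hit['encodedTitle'])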
If I may: from the OP's description I can only infer that this is either an education-related test or a job-interview-related test. As such, following the exact instructions is paramount, and to follow said instructions you can only use Selenium. The following code will work to the letter and get the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
browser.get(url)
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords'))).send_keys('IE00B4L5Y983')
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.ID, 'ui-id-1')))
result = browser.find_element(By.ID, "ui-id-1")
result.click()
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
result = browser.find_element(By.CLASS_NAME, "esgratings-profile-header-green").find_element(By.TAG_NAME, "div").get_attribute('class')
print(result)
browser.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa
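As a small follow-up, if you only want the rating letter, you can split the class string shown above (ratingdata-fund-rating esg-fund-ratings-circle-aaa):

rating = result.split('-')[-1]  # 'aaa'
print(rating.upper())           # AAA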
I'm trying to write to a CSV file and am only getting one column, with the company names. Does anyone know how I can write to a CSV file and get all the data with column headings?
Printed Output
'Agilent Technologies\n6,319\n2,912\n441\n1,619\n321\n189\n189\n1,347\n81\n236\n1,210\n19.2%\n307', 'Alcoa\n12,152\n9,153\n31\n227\n664\n390\n390\n2,039\n195\n19\n429\n3.5%\n190',
Current Output after writing to CSV
Agilent Technologies
Alcoa
Desired Output after writing to CSV
Full Code
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
import requests
import csv

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

ser = Service("./chromedriver.exe")
browser = driver = webdriver.Chrome(service=ser)

driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})

wait = WebDriverWait(driver, 30)
driver.get("https://stockrover.com")
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/section[2]/div/ul/li[2]"))).click()

user = driver.find_element(By.NAME, "username")
password = driver.find_element(By.NAME, "password")
user.clear()
user.send_keys("******")
password.clear()
password.send_keys("*******")
driver.find_element(By.NAME, "Sign In").click()

wait = WebDriverWait(driver, 30)
stocks_list = []
try:
    while True:
        # Collect the row text from the current page
        stocks_list.extend([my_elem.text for my_elem in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table[id^='gridview-1072-record']")))])
        # Click on the next-page button
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="button-1157"]'))).click()
except:
    print("Next button disabled")

print(stocks_list)  # Prints entire list of stocks
df = pd.DataFrame(stocks_list)
df.to_csv('table.csv')
You can just use the csv module; at the end, add:
stocks_lists = [x.split('\n') for x in stocks_list]
with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    for row in stocks_lists:
        csv_write.writerow(row)
Full Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
import requests
import csv

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome('G://chromedriver.exe')

driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})

wait = WebDriverWait(driver, 30)
driver.get("https://stockrover.com")
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/section[2]/div/ul/li[2]"))).click()

user = driver.find_element(By.NAME, "username")
password = driver.find_element(By.NAME, "password")
user.clear()
user.send_keys("********")
password.clear()
password.send_keys("********")
driver.find_element(By.NAME, "Sign In").click()

wait = WebDriverWait(driver, 30)
stocks_list = []
try:
    while True:
        # Collect the row text from the current page
        stocks_list.extend([my_elem.text for my_elem in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table[id^='gridview-1072-record']")))])
        # Click on the next-page button
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="button-1157"]'))).click()
except:
    print("Next button disabled")

# print(stocks_list)  # Prints entire list of stocks
stocks_lists = [x.split('\n') for x in stocks_list]
with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    for row in stocks_lists:
        csv_write.writerow(row)
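If you also want column headings in the file, as the question asks, you can write a header row once before the data rows. The names below are only a sketch taken from the pandas answer further down; adjust them to match your table:

# column names are assumptions borrowed from the pandas-based answer below
columns = ['Company', 'Sales', 'Cost of Sales', 'R&D', 'SG&A', 'Depr', 'Capex',
           'Maint Capex', 'Op income', 'Int Exp.', 'Dividends', 'Net Inc.',
           'Net Margin', 'Diluted Shrs']

stocks_lists = [x.split('\n') for x in stocks_list]
with open('output.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    csv_write.writerow(columns)        # header row
    csv_write.writerows(stocks_lists)  # data rows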
Assuming that this is your list of strings:
string_sock = ["Agilent Technologies\n6,319\n2,912\n441\n1,619\n321\n189\n189\n1,347\n81\n236\n1,210\n19.2%\n307", "Alcoa\n12,152\n9,153\n31\n227\n664\n390\n390\n2,039\n195\n19\n429\n3.5%\n190"]
Your columns:
columns = ['Company', 'Sales', 'Cost of Sales', 'R&D', 'SG&A', 'Depr', 'Capex', 'Maint Capex', 'Op income', 'Int Exp.', 'Dividends', 'Net Inc.', 'Net Margin', 'Diluted Shrs']
Split the input on the separator and write the CSV:
splitted_rows = [v.split("\n") for v in string_sock]
pd.DataFrame(splitted_rows, columns=columns).to_csv('test.csv')
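If the automatic integer index column is not wanted in the output file, pandas can drop it:

pd.DataFrame(splitted_rows, columns=columns).to_csv('test.csv', index=False)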
I want to learn how to get information from dynamically generated fields. When I tried simple sites, everything worked. Then I decided to try something more difficult, and now I can't figure it out. It took me about two weeks to cross out, one by one, the solution options I found on the Internet.
Now I'm not sure that I can get information that appears on sites this way. Most likely I'm doing something wrong, but I can't come up with a new idea for how to do it, so I decided to ask here. Perhaps there are those who understand this and can give me a hint. If so, please give me an example.
The site I use to learn - kbp.aero/en
The information I'm trying to get (arrival schedule) - .tbody .tr .td
For example, I tried:
1.
import time
import requests
from bs4 import BeautifulSoup

URL = 'https://kbp.aero/en/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}

time.sleep(1)
response = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')

items = soup.find('div', class_='table_wrp out yesterday')
items = items.findAll('tr', class_='tr')

comps = []
if len(items) > 0:
    for item in items:
        comps.append({
            'title': item.find('td', class_='td').get_text(strip=True),
        })

for comp in comps:
    print(comp['title'])

# for item in items:
#     comps.append({
#         'text': item.get_text(strip=True)
#     })
#
# for comp in comps:
#     print(comp['text'])
2.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def main():
    driver = webdriver.Chrome()
    driver.get("https://kbp.aero/en/")
    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.text_to_be_present_in_element((By.CLASS_NAME, 'tbody'), ''))
    tds = element.find_elements(By.CLASS_NAME, "td")
    for td in tds:
        print(td.text)

    # try:
    #     element = WebDriverWait(driver, 10).until(
    #         EC.presence_of_element_located((By.CLASS_NAME, "tbody"))
    #     )
    #     tds = element.find_elements(By.CLASS_NAME, "td")
    #     for td in tds:
    #         print(td.text)
    #
    # finally:
    #     driver.quit()
Thanks for any advice.
This will fetch the entire table data:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
PATH = r"chromedriverexe path"
driver = webdriver.Chrome(PATH)
driver.get("https://kbp.aero/en/")
driver.maximize_window()
sleep(3)
print(driver.find_element(By.CSS_SELECTOR, "div.table_wrp.out.today > table").text)
Output:
Рейс Час Призначення Перевізник Термінал Гейт Статус (Flight Time Destination Carrier Terminal Gate Status)
TK 1256 15:05 Istanbul Turkish Airlines D D5 Boarding Completed
PS 9556 15:05 Istanbul Ukraine International Airlines D D5 Boarding Completed
7W 163 15:10 Lviv Wind Rose D D19 Boarding
FR 3167 15:10 Warsaw Ryanair D D9 Boarding
PS 9013 15:15 Ivano-Frankivsk Ukraine International Airlines D D18 Boarding
7W 113 15:15 Ivano-Frankivsk Wind Rose D D18 Boarding
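If you want the rows as structured data rather than one text blob, here is a sketch of a variation, reusing the .tr/.td class names mentioned in the question and an explicit wait instead of sleep:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait for the "today" table used above, then collect each row's cells
wait = WebDriverWait(driver, 10)
table = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "div.table_wrp.out.today > table")))
rows = []
for tr in table.find_elements(By.CSS_SELECTOR, "tr.tr"):
    rows.append([td.text for td in tr.find_elements(By.CSS_SELECTOR, "td.td")])
print(rows)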
I am struggling to parse/scrape each page after clicking the Next button using Selenium. I am able to go to the second page; however, it fails after that. I'm not sure how to solve this, any suggestions?
Here is the code:
import scrapy
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class PropertyFoxSpider(scrapy.Spider):
    name = 'property_fox'
    start_urls = [
        'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
    ]

    def __init__(self):
        # path to driver
        self.driver = webdriver.Chrome('path')

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            try:
                elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
                elem.click()
                url = self.driver.current_url
                yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
            except TimeoutException:
                break

    def parse_page(self, response):
        # self.driver.get(response.url)
        for prop in response.css('div.property-item'):
            link = prop.css('a::attr(href)').get()
            banner = prop.css('div.property-figure-icon div::text').get()
            sold_tag = None
            if banner:
                banner = banner.strip()
                sold_tag = 'sold' if 'sold' in banner.lower() else None
            yield scrapy.Request(
                link,
                callback=self.parse_property,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'offering': 'buy',
                    'banners': banner,
                    'sold_tag': sold_tag,
                }},
            )

    def parse_property(self, response):
        item = response.meta.get('item')
        ...
You can wait until the URL changes and then scrape it:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

url = self.driver.current_url
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
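Here is a sketch of how that wait could slot into the parse loop from the question; the rest of the spider is assumed unchanged:

def parse(self, response):
    self.driver.get(response.url)
    while True:
        try:
            url = self.driver.current_url
            elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
            elem.click()
            # wait until the page URL actually changes before reading it
            WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
            url = self.driver.current_url
            yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
        except TimeoutException:
            break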
I'm struggling to get this code to extract the desired information from a single page.
I've tried all the usual Selenium tactics and added a time delay. Hopefully it's something simple; I'm not getting any error messages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
from time import sleep

options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
options.add_argument(f'user-agent={user_agent}')
capabilities = {'chromeOptions': {'useAutomationExtension': False}, 'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver', desired_capabilities=capabilities, options=options)

url = 'https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
sleep(3)
source_data = browser.page_source
bs_data = bs(source_data, "html.parser")

# product id
try:
    product_id = bs_data.findfindAll('span', {'class': 'pdp-main-details__product-code'})
    product_id = product_id.replace('Product code:', '').strip()
except:
    product_id = "n/a"

# image address
try:
    for image in bs_data.find("div", {"class": "s7staticimage"}):
        image_url = image.find('img')['src']
except:
    image_url = "n/a"

# product description
try:
    product_desc = bs_data.find('class', {'pdp-main-pdp-main-details__title'})
    product_desc = product_desc.get_text().strip()
except:
    product_desc = "n/a"

# product price
try:
    product_price = bs_data.find('class', {'co-product__price pdp-main-details__price'})
    product_price = product_price.get_text().strip()
except:
    product_price = "n/a"

print(url, '|', image_url, '|', product_id, '|', product_desc, '|', product_price)
browser.quit()
Any assistance is greatly appreciated.
Thanks
Since the content is dynamically generated, your soup has nothing in it to find. Selenium alone is good enough. I don't know why you treated the elements as a list, because there is only one of each on this page.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
capabilities = { 'chromeOptions': { 'useAutomationExtension': False},'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path='C:/bin/chromedriver.exe',desired_capabilities = capabilities,options=options)
url='https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
browser.implicitly_wait(15)
product_id = browser.find_element_by_class_name('pdp-main-details__product-code')
print(product_id.text)
image = browser.find_element_by_xpath("//*[@id=\"s7viewer_flyout\"]/div[1]/img[1]")
image_url = image.get_attribute('src')
print(image_url)
Output:
Product code: 410212
https://ui.assets-asda.com/dm/asdagroceries/5050854288142_T1?defaultImage=asdagroceries/noImage&resMode=sharp2&id=PqaST3&fmt=jpg&fit=constrain,1&wid=188&hei=188
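If you also need the description and price, the same Selenium approach should work. A sketch, assuming the class names from the question's BeautifulSoup attempt (pdp-main-details__title and pdp-main-details__price) are the right ones on the rendered page:

# class names below are taken from the question's attempt and are assumptions
product_desc = browser.find_element_by_class_name('pdp-main-details__title').text.strip()
product_price = browser.find_element_by_class_name('pdp-main-details__price').text.strip()
print(product_desc, '|', product_price)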