Scrape href/URL - python

My code goes into a webpage which contains multiple entries, gets their URLs, and puts them into a list.
Then it navigates through the list of URLs one by one and does a scrape per presentation.
Right now I scrape the title of each presentation (you can see this if you run the code), but within the title there is another URL/href that I also want.
Is there a way to scrape this?
Thanks
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
val=[]
driver = webdriver.Chrome()
for x in range(1, 3):
    driver.get(f'https://www.abstractsonline.com/pp8/#!/9325/sessions/#sessiontype=Advances%20in%20Diagnostics%20and%20Therapeutics/{x}')
    time.sleep(9)
    page_source = driver.page_source
    eachrow = ["https://www.abstractsonline.com/pp8/#!/9325/session/" + x.get_attribute('data-id') for x in driver.find_elements_by_xpath('//*[@id="results"]/li//h1[@class="name"]')]
    for row in eachrow:
        val.append(row)
        print(row)
for b in val:
    driver.get(b)
    time.sleep(3)
    page_source1 = driver.page_source
    soup = BeautifulSoup(page_source1, 'html.parser')
    productlist = soup.find_all('a', class_='title color-primary')
    for item in productlist:
        presentationTitle = item.text.strip()
        print(presentationTitle)

I think you want some wait conditions in there, and then to extract the href attribute for each presentation within a page:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
base = 'https://www.abstractsonline.com/pp8/#!/9325/session/'
for x in range(1, 3):
    driver.get(f'https://www.abstractsonline.com/pp8/#!/9325/sessions/#sessiontype=Advances%20in%20Diagnostics%20and%20Therapeutics/{x}')
    links = [base + i.get_attribute('data-id') for i in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li .name")))]
    for link in links:
        driver.get(link)
        print(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "spnSessionTitle"))).text)
        for presentation in driver.find_elements_by_css_selector('.title'):
            print(presentation.text.strip())
            print('https://www.abstractsonline.com/pp8' + presentation.get_attribute('href'))

links = driver.find_elements_by_partial_link_text('https://yourlinks.com/?action=')
for link in links:
    print(link.get_attribute("href"))

Related

Scraping with Selenium. for page in range() issue

I am trying to scrape ethplorer.io, and I want to scrape many pages. My code is below, but it scrapes page 11 three times even though I use range(11, 14). I can't understand why.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
base_url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders="
results=[]
for page_number in range(11, 14):
    url = base_url + str(page_number)
    driver.get(url)
    data = driver.find_elements(By.CLASS_NAME, "local-link")
    for x in data:
        results.append(x.text)
driver.quit()
with open("all_data.txt", "w") as file:
    for x in results:
        file.write(x + "\n")
I have applied some modifications to your code so that it works through the several pages you are calling and captures the text within the hyperlinks I assume you are targeting; please check below:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders=11" # Here we are calling the first page we need, which is page #11 in this case.
xpath = "//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]"
driver.get(url)
data =driver.find_elements_by_xpath("//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]")
results=[]
for page_number in range(12,15): # Range should start from the next page (page # 12 in this case). Range end with last page you need + 2, in this case you need to scrape from 11 untill 13 ,so rage end should be 15.
for x in data: # On first round it will get page # 11 data.
results.append(x.text)
nextPage = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"//div[#id='token-holders-tab']//tr[contains(#class,'paginationFooter')]//A[#class='page-link'][text()='"+str(page_number)+"']")))
driver.execute_script("arguments[0].click();", nextPage)
time.sleep(3)
data =driver.find_elements_by_xpath("//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]")
driver.quit
with open("all_data.txt" , "w") as file:
for x in results:
file.write(x + "\n")
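As a side note on why the original loop repeats page 11: the page number only changes in the URL's hash fragment, so driver.get() with a URL that differs only after the # does not trigger a new page load, and the app keeps showing the holders it already rendered. A minimal alternative sketch, assuming the site re-reads the fragment on a full reload, is to force a refresh after each get() (the pagination-clicking approach above avoids the issue entirely):
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
driver = webdriver.Chrome()
base_url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders="
results = []
for page_number in range(11, 14):
    driver.get(base_url + str(page_number))
    driver.refresh()  # force a full reload so the app re-reads the new hash fragment (assumption: the page renders the requested holders page on load)
    time.sleep(3)  # crude wait; a WebDriverWait on the table contents would be more robust
    for link in driver.find_elements(By.CLASS_NAME, "local-link"):
        results.append(link.text)
driver.quit()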

Web page is loaded in Selenium and reaches the end but does not contain all the elements inside the div

This is the site: https://www.talabat.com/uae/top-selling
There are around 100 products but only 30 get loaded. I was trying to fetch all the links; the page scrolls to the end but still only displays 30 products, and only when I click somewhere in the webdriver does it load the rest. How can I print the links of all the products?
Thanks in advance!!
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
HOME_PAGE_URL = "https://www.talabat.com/uae/top-selling"
PATIENCE_TIME = 60
LOAD_MORE_XPATH = '//*[@id="comment-ajx"]/div'
driver = webdriver.Chrome(executable_path='C:\\Users\\Mansi Dhingra\\Downloads\\chromedriver.exe')
driver.get(HOME_PAGE_URL)
soup=BeautifulSoup(driver.page_source)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# sleep for 30s
res=[]
results = driver.find_elements_by_xpath("/html/body/div[3]/ui-view/section/div/div[2]/div/div[2]/div/div[2]")
html_code = driver.find_element_by_tag_name("section").text
print(html_code)
for res in results:
    link = res.find_elements_by_tag_name('a')
    for x in link:
        product_link = x.get_attribute("href")
        print(product_link)
print(results)
The main point is that Selenium reads the page before the page has loaded all the items; you need a wait.
Just read the docs:
https://selenium-python.readthedocs.io/waits.html
Choose the best condition for your case and go for it.
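For this particular page, a minimal sketch of that idea, assuming the remaining products are appended as the page is clicked and scrolled (the anchor selector here is an assumption and should be narrowed to the real product cards):
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
driver = webdriver.Chrome()
driver.get("https://www.talabat.com/uae/top-selling")
time.sleep(5)
driver.find_element(By.TAG_NAME, "body").click()  # the question notes a click is needed before the rest of the products load
last_count = 0
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)  # crude wait; an explicit wait on the element count would be more robust
    anchors = driver.find_elements(By.CSS_SELECTOR, "a[href]")  # assumption: product links are plain anchors; narrow this selector to the product list
    if len(anchors) == last_count:  # nothing new loaded, so stop scrolling
        break
    last_count = len(anchors)
for a in anchors:
    print(a.get_attribute("href"))
driver.quit()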

How to scrape and extract links to n level and scrape the data again and map it to output in python?

I am learning web crawling and scraping in Python. I want to scrape data from a site where there are links, and inside those links there are more links. I want to scrape data up to a predefined level n.
This is my basic code
import requests
from selenium import webdriver
from requests_ntlm import HttpNtlmAuth
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from webdrivermanager import GeckoDriverManager
import pickle
from selenium.webdriver.common.keys import Keys
from urllib.parse import urljoin
from seleniumrequests import Chrome
options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver",options=options)
web_url = 'https://spaceflightnow.com/'
driver.get("https://spaceflightnow.com/")
time.sleep(5)
soup = BeautifulSoup(driver.page_source,"lxml")
#section = soup.section
links = []
for url in soup.find_all('a', href=True):
    links.append(urljoin(web_url, url.get('href')))
    #print(urljoin(web_url, url.get('href')))
links = list(filter(lambda x: x != web_url, links))
print(links)
This prints multiple links from the first page. Now I want to follow all the links at each subsequent level and scrape them again, getting more links inside. There is the possibility of the same links appearing again internally from the news feed. So what I want to know is what my approach should be. I understand I need a tree, but I cannot figure out exactly how.
Should I create a list inside a list? But how do I do that dynamically up to level n, and how do I map it to the data saved in a file? Can anyone help me with this, maybe with a sample solution?
Thank you :)
I made an example which works without recursion - I would say it is similar to the Breadth-First Search algorithm.
It keeps URLs on a list [(url, level), ...] to control the level and in a set() to filter visited pages. It also filters out links to external pages.
Tested with Firefox.
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# ---
def get_links(driver, url):
    driver.get(url)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "lxml")
    links = []
    for new_url in soup.find_all('a', href=True):
        new_url = new_url.get('href')
        new_url = urljoin(url, new_url)
        links.append(new_url)
    return links
# ---
options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver",options=options)
#driver = webdriver.Firefox()
# ---
domain = 'https://spaceflightnow.com/' # to filter external links
start_url = 'https://spaceflightnow.com/'
max_level = 2
links_visited = set([start_url]) # to test visited links
links_with_levels = [(start_url, 0)] # to control levels
# ---
for link, level in links_with_levels:
    if level >= max_level:
        print('skip:', level, link)
        continue
    print('visit:', level, link)
    links = get_links(driver, link)
    print('found:', len(links))
    links = list(set(links) - links_visited)
    print('after filtering:', len(links))
    level += 1
    for new_link in links:
        if new_link.startswith(domain):  # filter external links
            links_visited.add(new_link)
            links_with_levels.append((new_link, level))
# ---
for link, level in links_with_levels:
    print('skip:', level, link)
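To address the part of the question about mapping the result to data saved in a file, a minimal sketch (an assumption about the desired output: one row per collected URL with its level, written to a hypothetical crawled_links.csv) could be appended after the loop:
import csv
# links_with_levels is the [(url, level), ...] list built by the crawl above
with open("crawled_links.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["level", "url"])
    for link, level in links_with_levels:
        writer.writerow([level, link])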

Locating an element in bs4

Trying to scrape all the information of every dozer item on this page.
I have just started and have only a fair idea about scraping, but I am not sure how to do that.
driver=webdriver.Firefox()
driver.get('https://www.rbauction.com/dozers?keywords=&category=21261693092')
soup=BeautifulSoup(driver.page_source,'html.parser')
# trying all different ways but getting only NoneType or no element
get = soup.findAll('div', attrs={'class': 'sc-gisBJw eHFfwj'})
get2 = soup.findAll('div', attrs={'id': 'searchResultsList'})
get3 = soup.find('div.searchResultsList').find_all('a')
I need to get into each class/id, loop over a['href'], and get the information for each dozer.
Please help.
You need to wait for the data you are looking for to load before reading it into
the BeautifulSoup object. Use WebDriverWait in selenium to wait for the page to load as it takes a while to render fully:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
driver = webdriver.Firefox()
driver.get('https://www.rbauction.com/dozers?keywords=&category=21261693092')
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'searchResultsList')))
soup = BeautifulSoup(driver.page_source,'html.parser')
This line should then return the hrefs from the page:
hrefs = [el.attrs.get('href') for el in soup.find('div', attrs={'id': 'searchResultsList'}).find_all('a')]
You can just use requests
import requests
headers = {'Referrer':'https://www.rbauction.com/dozers?keywords=&category=21261693092'}
data = requests.get('https://www.rbauction.com/rba-msapi/search?keywords=&searchParams=%7B%22category%22%3A%2221261693092%22%7D&page=0&maxCount=48&trackingType=2&withResults=true&withFacets=true&withBreadcrumbs=true&catalog=ci&locale=en_US', headers = headers).json()
for item in data['response']['results']:
    print(item['name'], item['url'])

BeautifulSoup does not extract comment tags in a dynamic page

What I need: count the number of reviews under an extension in the Chrome Store, in all languages.
What I did: tried BeautifulSoup to extract a certain tag. I researched the HTML code of the page and found a review tag:
Tried this code:
from bs4 import BeautifulSoup
import requests
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html5lib')
comments = soup.find_all('div', class_ = 'ba-bc-Xb ba-ua-zl-Xb')
But print(comments) shows that the array is empty.
I am stuck at the moment, and I see that I further need to handle two problems:
How to cope with the select-language button? How do I count reviews in all languages if by default only one language is selected?
The reviews are stored in different tabs. I read about extracting content dynamically but didn't get the point.
You could use Selenium to perform the tasks, wait for page changes, and extract the review count from the PaginationMessage. Tested with a few links. You may need to add error handling for items with no reviews. There also seems to be some POST XHR activity yielding review JSON strings that you may wish to explore.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
url = 'https://chrome.google.com/webstore/detail/evernote-web-clipper/pioclpoplcdbaefihamjohnefbikjilc?hl=en/'
#url = 'https://chrome.google.com/webstore/detail/https-everywhere/gcbommkclmclpchllfjekcdonpmejbdp?hl=en/'
d = webdriver.Chrome()
d.get(url)
WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.ID, ':21'))).click()
ActionChains(d).click_and_hold(WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.h-z-Ba-ca.ga-dd-Va.g-aa-ca')))).perform()
languageSelection = WebDriverWait(d, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.g-aa-ca-ma-x-L')))
languageSelection[1].click()
s= WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.Aa.dc-tf + span'))).text
print(s.split()[-1])
d.quit()
Try this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
driver.get('https://chrome.google.com/webstore/detail/evernote-web-clipper/pioclpoplcdbaefihamjohnefbikjilc?hl=en')
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.ID, ':21'))).click()
wait.until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, '.h-z-Ba-ca.ga-dd-Va.g-aa-ca'))
).click()
english = driver.find_element_by_xpath('//div[@class="ah-mg-j"]/span').text
print('English: ' + english.split()[-1])
wait.until(
    EC.visibility_of_element_located((By.XPATH, '//div[@class="g-aa-ca-ma-x-L" and text() = "All languages"]'))
).click()
wait.until_not(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="ah-mg-j"]/span'), english))
time.sleep(2)
AllCount = driver.find_element_by_xpath('//div[@class="ah-mg-j"]/span').text
print('All languages: ' + AllCount.split()[-1])
driver.close()
