Log in with selenium then run code using URLLIB? [duplicate] - python

The website I want to scrape is:
http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061
I want to get the last page number of the above link in order to proceed; it was 499 at the time of the screenshot.
My code :
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
driver.get(url)
wait = WebDriverWait(driver, 10)
soup=BeautifulSoup(driver.page_source,"lxml")
containers = soup.findAll("ul",{"class":"pages table"})
containers[0] = soup.findAll("li")
li_len = len(containers[0])
for item in soup.find("ul",{"class":"pages table"}) :
    li_text = item.select("li")[li_len].text
    print("li_text : {}\n".format(li_text))
driver.quit()
I need help figuring out the error in my code for getting the last page number. I would also be grateful if someone could give an alternate solution and suggest ways to achieve this.

If you want to get the last page number of the above link in order to proceed (which is 499), you can use either Selenium or BeautifulSoup as follows:
Selenium :
from selenium import webdriver
driver = webdriver.Firefox(executable_path=r'C:\Utility\BrowserDrivers\geckodriver.exe')
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
driver.get(url)
element = driver.find_element_by_xpath("//div[@class='row pagination']//p/span[contains(.,'Reviews on Reliance Jio')]")
driver.execute_script("return arguments[0].scrollIntoView(true);", element)
print(driver.find_element_by_xpath("//ul[@class='pagination table']/li/ul[@class='pages table']//li[last()]/a").get_attribute("innerHTML"))
driver.quit()
Console Output :
499
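Note that find_element_by_xpath has since been removed in Selenium 4; a minimal sketch of the same lookup with the By API (assuming the same page structure) would be:
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Firefox()  # Selenium 4.6+ resolves the geckodriver binary automatically
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
# same XPath as above, expressed with the By API
last_page = driver.find_element(By.XPATH, "//ul[@class='pagination table']/li/ul[@class='pages table']//li[last()]/a")
print(last_page.get_attribute("innerHTML"))
driver.quit()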
Beautifulsoup :
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
container = page_soup.find("ul",{"class":"pages table"})
all_li = container.findAll("li")
last_div = None
for last_div in all_li: pass
if last_div:
    content = last_div.getText()
    print(content)
Console Output :
499
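A slightly shorter variant of the same idea indexes the last <li> directly instead of looping over all of them (a sketch, assuming the same pagination markup):
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
page_soup = BeautifulSoup(urlopen(url).read(), "html.parser")
# the last <li> of the pagination list holds the highest page number
pages = page_soup.find("ul", {"class": "pages table"})
print(pages.find_all("li")[-1].get_text(strip=True))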

Related

Parsing web-page with search bar

I need to parse store names (<div class="LocationName">) from https://www.comicshoplocator.com/StoreLocator.
The thing is, when you enter a zip code (for instance 73533) in the search, it does not appear in the URL.
Because of that, Python can't see the elements on the page.
Here is my code snippet; it produces no output because of that.
How do I make Python see the input with the zip code?
Thanks
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
browser = webdriver.Firefox(executable_path=r'C:\Geckodriver\Geckodriver.exe')
browser.get('https://www.comicshoplocator.com/StoreLocator')
browser.find_element(By.NAME, 'query').send_keys('73533' + Keys.RETURN)
html = browser.page_source
soup = BeautifulSoup(html, features="html.parser")
for tag in soup.find_all('div', class_="LocationName"):
    print(tag.text)
The problem is here: browser.find_element(By.NAME, 'query').send_keys('73533' + Keys.RETURN)
The correct one would be:
search = browser.find_element(By.NAME, 'query')
search.send_keys('73533')
search.send_keys(Keys.RETURN)
Full working code:
I use the Chrome driver; you can change that part to your own driver in no time.
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.comicshoplocator.com/StoreLocator')
driver.maximize_window()
time.sleep(2)
d=driver.find_element(By.NAME, 'query')
d.send_keys('73533')
d.send_keys(Keys.ENTER)
soup = BeautifulSoup(driver.page_source, 'lxml')
for tag in soup.find_all('div', class_="LocationName"):
    print(tag.text)
Output:
MARK DOWN COMICS
WWW.DCBSERVICE.COM
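If the results render slowly, the fixed time.sleep(2) can be replaced with an explicit wait on the result elements; a sketch, reusing the LocationName class from the question:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# wait up to 10 seconds for at least one store name to appear before parsing the page source
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "LocationName")))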
Well, actually, this can be done with requests; there's no need to use Selenium. You can send a POST request to:
https://www.comicshoplocator.com/StoreLocator
import re
import requests
from bs4 import BeautifulSoup
data = {
    "showAll": "false",
    "showCsls": "true",
    "query": "73533",
}
response = requests.post(
    "https://www.comicshoplocator.com/StoreLocator",
    data=data,
)
soup = BeautifulSoup(response.text, "html.parser")
string = soup.select_one("script:-soup-contains('address')").string
unformatted_data = re.search(r"\(({.*?})\)", string, re.DOTALL).group(1)
# remove all the whitespace
formatted_data = re.sub(r"\s+", "", unformatted_data)
print(formatted_data)
Prints:
{storeno:"8816",lat:"41.0671081542969",lng:"-85.1372680664063",name:"WWW.DCBSERVICE.COM",address:"6005ESHELBYDR",address2:"WWW.DCBSERVICE.COM",city:"MEMPHIS",state:"TN",zip:"38141",phone:"",hasProfile:"True",storeLogo:'/Image/CslsLogo/'+"8816"}
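That string is a JavaScript object literal rather than valid JSON (the keys are unquoted), so json.loads will reject it; a small follow-up sketch that continues from formatted_data above and pulls out the plainly quoted fields:
# grab every unquoted key paired with a double-quoted value
fields = dict(re.findall(r'(\w+):"([^"]*)"', formatted_data))
print(fields.get("name"), fields.get("city"), fields.get("zip"))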
This code worked for me:
listings = browser.find_elements(By.CLASS_NAME, 'CslsLocationItem')
for listing in listings:
    print(listing.find_element(By.CLASS_NAME,'LocationName').get_attribute('innerText'))

How to wait for the page to fully load using requests_html

While accessing this link https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2 with requests_html, I need to wait some time before the page actually loads. Is that possible with this library?
My code:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from lxml import etree
s = HTMLSession()
response = s.get(
    'https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2')
response.html.render()
soup = BeautifulSoup(response.content, "html.parser")
dom = etree.HTML(str(soup))
item = dom.xpath('//a[@class="rs_product_description d-block"]/text()')[0]
print(item)
It looks like the data you are looking for can be fetched using HTTP GET to
https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A2%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D
The call returns JSON, so you can use it directly with zero scraping code.
Copy/paste the URL into the browser to see the data.
You can specify the page number in the URL:
searchVO={"selectedCategory":"12301_1809051","selectedStore":"0","selectedSort":1,"selectedFilters":{},"storeId":15108,"pageNumber":2,"pageSize":48,"totalCount":112,"searchTypes":["PINNING"],"isFamilyPage":true,"appliedSeoFilters":false,"snbAudience":"","zipcode":""}
Working code below:
import requests
import pprint
page_num = 2
url = f'https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A{page_num}%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D'
r = requests.get(url)
if r.status_code == 200:
    pprint.pprint(r.json())
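The hand-edited percent-encoded query string above is easy to get wrong; a less error-prone sketch builds searchVO as a dict and lets requests handle the encoding (assuming the endpoint accepts the same parameters):
import json
import pprint
import requests
page_num = 2
search_vo = {
    "selectedCategory": "12301_1809051",
    "selectedStore": "0",
    "selectedSort": 1,
    "selectedFilters": {},
    "storeId": 15108,
    "pageNumber": page_num,
    "pageSize": 48,
    "totalCount": 112,
    "searchTypes": ["PINNING"],
    "isFamilyPage": True,
    "appliedSeoFilters": False,
    "snbAudience": "",
    "zipcode": "",
}
# serialize without extra whitespace so the encoded query matches the original URL
r = requests.get(
    "https://prod-catalog-product-api.dickssportinggoods.com/v2/search",
    params={"searchVO": json.dumps(search_vo, separators=(",", ":"))},
)
if r.status_code == 200:
    pprint.pprint(r.json())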
You can use Selenium as well, in headless mode.
Selenium has the capability to wait until elements are found, using explicit waits.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--window-size=1920,1080')
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path = driver_path, options = options)
driver.get("URL here")
wait = WebDriverWait(driver, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//a[@class='rs_product_description d-block']")))
PS: You'd have to download chromedriver from here
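To address the original question more directly: requests_html's render() itself accepts wait, sleep and timeout arguments, so a rough sketch (the values here are guesses, not tuned) would be:
from requests_html import HTMLSession
s = HTMLSession()
response = s.get('https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2')
# wait = seconds to pause before rendering, sleep = seconds to pause after the first render,
# both of which give the JavaScript-driven content time to load
response.html.render(wait=2, sleep=3, timeout=20)
items = response.html.xpath('//a[@class="rs_product_description d-block"]/text()')
print(items[0] if items else "nothing found")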

Scraping: cannot extract content from webpage

I am trying to scrape the news content from the following page, but with no success.
https://www.business-humanrights.org/en/latest-news/?&search=nike
I have tried with BeautifulSoup:
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.business-humanrights.org/en/latest-news/?&search=nike")
soup = BeautifulSoup(r.content, 'lxml')
soup
but the content that I am looking for (the bits of news tagged as div class='card__content') does not appear in the soup output.
I also checked, but I could not find any frames to switch to.
Finally, I tried PhantomJS with the following code, but with no success:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = "https://www.business-humanrights.org/en/latest-news/?&search=nike"
driver = webdriver.PhantomJS(executable_path= '~\Chromedriver\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get(url)
time.sleep(7)
page = driver.page_source
driver.quit()
soup = BeautifulSoup(page, 'html.parser')
container = soup.find_all('div', attrs={'class': 'card__content'})
print(container)
I am running out of options, anyone can help?
Use the API:
import requests
r = requests.get("https://www.business-humanrights.org/en/api/internal/explore/?format=json&search=nike")
print(r.json())
I don't understand why you're facing this. I tried the same as above, but with requests_html instead of requests and bs4. XPath expressions can be used directly with this library, without any other libraries.
import requests_html
session = requests_html.HTMLSession()
URL = 'https://www.business-humanrights.org/en/latest-news/?&search=nike'
res = session.get(URL)
divs_with_required_class = res.html.xpath(r'//div[@class="card__content"]')
for item in divs_with_required_class:
    print(f'Div {divs_with_required_class.index(item) + 1}:\n', item.text, end='\n\n')
driver.page_source returns the initial HTML document here no matter how long you wait (time.sleep(7) has no effect).
Try below:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver.get(url)
cards = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='card__content' and normalize-space(.)]")))
texts = [card.text for card in cards]
print(texts)
driver.quit()

Indeed: how to extract the job title href link?

I have code to extract the job information from Indeed, but now I want to extract the link from the job title so I can open a new page and pull out the job description information.
I can see the link to the job posting on the HTML page, within the href attribute, but I am not sure how to extract it.
import requests
import time
from random import randint
from bs4 import BeautifulSoup
import urllib, requests, re, pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"
options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options,executable_path='chromedriver')
driver.get("https://www.indeed.co.uk/automotive-engineer-jobs-in-uk")
soup=BeautifulSoup(driver.page_source, "lxml")
title = [tag.text.strip() for tag in soup.select('.jobtitle')]
company = [tag.text.strip() for tag in soup.select('.company')]
location = [tag.text.strip() for tag in soup.select('.location')]
for y in range (len(title)):
    tmpstring = (title[y] + ',' + company[y] + ',' + location[y] + ",0")
    tmpstring = tmpstring.encode("utf-8")
    f = open('FileDump', 'ab')
    f.write(tmpstring)
    f.close()
You can get the child element by using this code.
title_href = [tag.find("a")["href"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
I tried your code and modified a few places, because I found that the full title can be taken from the <a> tag.
import requests
import time
from random import randint
from bs4 import BeautifulSoup
import urllib, requests, re, pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"
options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options,executable_path='chromedriver')
driver.get("https://www.indeed.co.uk/automotive-engineer-jobs-in-uk")
domain = "https://www.indeed.co.uk"
soup=BeautifulSoup(driver.page_source, "lxml")
title = [tag.find("a")["title"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
title_href = [domain + tag.find("a")["href"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
company = [tag.text.strip() for tag in soup.findAll("span",{"class":"company"})]
location = [tag.text.strip() for tag in soup.findAll("span",{"class":"location"})]
print(title_href)
driver.close()
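Since pandas is already imported, one way to collect the scraped fields is a DataFrame (a sketch, assuming the four lists line up row by row; the CSV filename is just an example):
# build a table from the parallel lists and save it
df = pd.DataFrame({
    "title": title,
    "link": title_href,
    "company": company,
    "location": location,
})
df.to_csv("indeed_jobs.csv", index=False)
print(df.head())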
You can use the below code to extract the links:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
html_page = urlopen("http://arstechnica.com")
soup = BeautifulSoup(html_page, "html.parser")
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    print(link.get('href'))
Reference
https://pythonspot.com/extract-links-from-webpage-beautifulsoup/
