Scraping using BeautifulSoup not working fully? - Python

I was trying to scrape some data with BeautifulSoup in Python from 'https://www.geappliances.com/ge-appliances/kitchen/ranges/', which lists some products.
from selenium import webdriver
from bs4 import BeautifulSoup

links = []
browser = webdriver.Firefox(executable_path="C:\\Users\\drivers\\geckodriver\\win64\\v0.29.1\\geckodriver.exe")
browser.get("https://www.geappliances.com/ge-appliances/kitchen/ranges/")
content = browser.page_source
soup = BeautifulSoup(content, "html.parser")
for main in soup.findAll('li', attrs={'class': 'product'}):
    name = main.find('a', href=True)
    if name is not None:
        links.append(name.get('href').strip())
print("Got links : ", len(links))
In the output I get:
Got links: 0
I printed the soup and saw that this part of the page was not in it. I have been trying to get around this problem to no avail.
Am I doing something wrong? Any suggestions are appreciated. Thanks.

Study the source of the webpage. Check your findAll() call.

You should wait until the page loads; use time.sleep() to pause execution for a while.
You can try it like this:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
url = 'https://www.geappliances.com/ge-appliances/kitchen/ranges/'
driver = webdriver.Chrome("chromedriver.exe")
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')
u = soup.find('ul', class_='productGrid')
for item in u.find_all('li', class_='product'):
    print(item.find('a')['href'])
driver.close()
It will print:
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Induction-Fingerprint-Resistant-Range-with-In-Oven-Camera-PHS93XYPFS
/appliance/GE-Profile-30-Electric-Pizza-Oven-PS96PZRSS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Double-Oven-Convection-Fingerprint-Resistant-Range-PGS960YPFS
/appliance/GE-Profile-30-Smart-Dual-Fuel-Slide-In-Front-Control-Fingerprint-Resistant-Range-P2S930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Double-Oven-Convection-Fingerprint-Resistant-Range-PS960YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Induction-and-Convection-Range-with-No-Preheat-Air-Fry-PHS930BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Fingerprint-Resistant-Front-Control-Induction-and-Convection-Range-with-No-Preheat-Air-Fry-PHS930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Range-with-No-Preheat-Air-Fry-PGS930BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Fingerprint-Resistant-Range-with-No-Preheat-Air-Fry-PGS930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Convection-Range-with-No-Preheat-Air-Fry-PSS93BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Convection-Fingerprint-Resistant-Range-with-No-Preheat-Air-Fry-PSS93YPFS
/appliance/GE-30-Slide-In-Front-Control-Gas-Double-Oven-Range-JGSS86SPSS
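A fixed time.sleep() can be flaky if the page loads slowly. An explicit wait is a common alternative; here is a minimal sketch, assuming the product grid keeps the ul.productGrid class used above:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Chrome("chromedriver.exe")
driver.get("https://www.geappliances.com/ge-appliances/kitchen/ranges/")
# Block for up to 15 seconds until the product grid is present in the DOM,
# then hand the rendered source to BeautifulSoup as before.
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "ul.productGrid"))
)
soup = BeautifulSoup(driver.page_source, "lxml")
for item in soup.select("ul.productGrid li.product"):
    print(item.find("a")["href"])
driver.close()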

Related

Python bs4: how to scrape this text in HTML?

The site URL: https://n.news.naver.com/mnews/article/421/0006111920
I want to scrape the "5" in the HTML below.
I used this code: soup.select_one('span.u_likeit_text._count').get_text()
The result is '추천' ('recommend' in Korean).
HTML code:
<span class="u_likeit_text _count num">5</span>
The main issue here is that the count is dynamically generated by JavaScript, so it is not present in the response, and therefore not in your soup.
You could use Selenium to render the page as a browser would and pass driver.page_source to your BeautifulSoup object:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://n.news.naver.com/mnews/article/421/0006111920")
time.sleep(3)
soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.select_one('span.u_likeit_text._count').get_text())
Output:
8
Alternatively, when using find_all() with class_, you have to separate the classes with spaces instead of joining them with dots:
from bs4 import BeautifulSoup
soup = BeautifulSoup("<span class='u_likeit_text _count num'>5</span>", 'html.parser')
print(soup)
spans = soup.find_all("span", class_="u_likeit_text _count num")
print(spans[0].text)
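Both notations target multiple classes, just in different query languages: CSS selectors chain classes with dots, while class_ matches against the class attribute. A small sketch on static HTML:
from bs4 import BeautifulSoup

html = "<span class='u_likeit_text _count num'>5</span>"
soup = BeautifulSoup(html, 'html.parser')

# CSS selector: chain each class with a dot (order does not matter).
print(soup.select_one('span.u_likeit_text._count.num').get_text())  # 5

# find(): class_ with a single class name matches any element carrying it.
print(soup.find('span', class_='num').get_text())  # 5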

Fixed URL scraping (Selenium)

Hello! I have a question about scraping a paginated table.
I want to scrape company names and ticker symbols from "https://www.nasdaq.com/market-activity/stocks/screener".
I think Selenium can help with my problem, but my code only works on the first page.
I'm sorry for my poor English.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
import time
nasduq_all = []      # ticker + company
nasduq_ticker = []   # ticker list (odd-numbered entries)
nasduq_company = []  # company list (even-numbered entries)
dict_nasduq = {}     # ticker + company
page_url = 'https://www.nasdaq.com/market-activity/stocks/screener'
driver = webdriver.Chrome('/Users/kim/Desktop/dgaja/chromedriver')
driver.implicitly_wait(2)
driver.get(page_url)
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# First of all, I'm only trying to go to page 2. :[
driver.find_element_by_xpath("/html/body/div[2]/div/main/div[2]/article/div[3]/div[1]/div/div/div[3]/div[5]/button[2]").send_keys(Keys.ENTER)
time.sleep(10)
ticker = soup.find("tbody", {"class": "nasdaq-screener__table-body"}).find_all('a')
for i in ticker:  # text
    name = i.text
    nasduq_all.append(name)
print(nasduq_all)
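One likely reason this only works on the first page: soup is built from page_source before the click, so it is a frozen snapshot of page 1. A hedged sketch of the loop structure, reusing the XPath from the question (it may need adjusting if the markup changes):
# Sketch: parse the current page, then click "next" and re-read
# driver.page_source so the soup reflects the newly loaded rows.
next_button_xpath = "/html/body/div[2]/div/main/div[2]/article/div[3]/div[1]/div/div/div[3]/div[5]/button[2]"
for page in range(2):  # first two pages, as a demonstration
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    tbody = soup.find("tbody", {"class": "nasdaq-screener__table-body"})
    for a in tbody.find_all('a'):
        nasduq_all.append(a.text)
    driver.find_element_by_xpath(next_button_xpath).send_keys(Keys.ENTER)
    time.sleep(3)  # crude wait for the next page's rows to render
print(nasduq_all)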

Python BeautifulSoup soup.find

I want to scrape some specific data from a website using urllib and BeautifulSoup.
I'm trying to fetch the text "190.0 kg". As you can see in my code, I have tried using attrs={'class': 'col-md-7'},
but this returns the wrong result. Is there any way to specify that I want it to return the text between the <h3> tags?
from urllib.request import urlopen
from bs4 import BeautifulSoup
# specify the url
quote_page = 'https://styrkeloft.no/live.styrkeloft.no/v2/?test-stevne'
# query the website and return the html to the variable 'page'
page = urlopen(quote_page)
# parse the html using beautiful soup
soup = BeautifulSoup(page, 'html.parser')
# take out the <div> of name and get its value
weight_box = soup.find('div', attrs={'class': 'col-md-7'})
name = weight_box.text.strip()
print(name)
Since this content is dynamically generated, there is no way to access that data with a plain HTTP request.
You can use selenium webdriver to accomplish this:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_driver = "path_to_chromedriver"
driver = webdriver.Chrome(chrome_options=chrome_options,executable_path=chrome_driver)
driver.get('https://styrkeloft.no/live.styrkeloft.no/v2/?test-stevne')
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
current_lifter = soup.find("div", {"id":"current_lifter"})
value = current_lifter.find_all("div", {'class':'row'})[2].find_all("h3")[0].text
driver.quit()
print(value)
Just be sure to have the chromedriver executable on your machine.
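If the nested find_all() chain feels brittle, the same value can be reached with one CSS selector. A sketch, assuming the markup keeps the id and row/h3 structure used above:
# Equivalent lookup via CSS selectors: take the third .row inside
# #current_lifter, then its first <h3>. select() returns a list.
rows = soup.select("#current_lifter div.row")
value = rows[2].h3.text
print(value)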

Web scraping - Python - Can't find links in HTML

I'm trying to scrape all links from https://www.udemy.com/courses/search/?q=sql&src=ukw&lang=en, but even without selecting an element, my code retrieves no links. Please see my code below.
import bs4
import requests as rq

Link = 'https://www.udemy.com/courses/search/?q=sql&src=ukw&lang=en'
RQOBJ = rq.get(Link)
BS4OBJ = bs4.BeautifulSoup(RQOBJ.text, 'html.parser')
print(BS4OBJ)
I hope you want the links of the courses on the page; this code will help:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
baseurl='https://www.udemy.com'
url="https://www.udemy.com/courses/search/?q=sql&src=ukw&lang=en"
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
time.sleep(5)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
courseLink = soup.findAll("a", {"class": "card__title",'href': True})
for link in courseLink:
    print(baseurl + link['href'])
driver.quit()
It will print:
https://www.udemy.com/the-complete-sql-bootcamp/
https://www.udemy.com/the-complete-oracle-sql-certification-course/
https://www.udemy.com/introduction-to-sql23/
https://www.udemy.com/oracle-sql-12c-become-an-sql-developer-with-subtitle/
https://www.udemy.com/sql-advanced/
https://www.udemy.com/sql-for-newbs/
https://www.udemy.com/sql-for-marketers-data-analytics-data-science-big-data/
https://www.udemy.com/sql-for-punk-analytics/
https://www.udemy.com/sql-basics-for-beginners/
https://www.udemy.com/oracle-sql-step-by-step-approach/
https://www.udemy.com/microsoft-sql-for-beginners/
https://www.udemy.com/sql-tutorial-learn-sql-with-mysql-database-beginner2expert/
The website uses JavaScript to fetch data, so you should use Selenium.
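One small refinement to the answer above: plain string concatenation breaks if an href is ever absolute. urllib.parse.urljoin from the standard library handles both cases:
from urllib.parse import urljoin

base = 'https://www.udemy.com'
# urljoin resolves relative hrefs against the base and leaves
# absolute URLs untouched.
print(urljoin(base, '/the-complete-sql-bootcamp/'))  # https://www.udemy.com/the-complete-sql-bootcamp/
print(urljoin(base, 'https://other.site/course/'))   # https://other.site/course/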

BeautifulSoup parsing returns None

There are plenty of websites that simply return None when I request a value from them.
Example:
import requests
from bs4 import BeautifulSoup
def spider():
    url = 'https://poloniex.com/exchange'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    High = soup.findAll("div", {"class": "high info"})[0].string
    print(High)
    # returns None

spider()
How do I solve this problem? All I need is the value.
Download chromedriver from http://chromedriver.storage.googleapis.com/index.html?path=2.24/, unzip it, and put chromedriver.exe in C:\Python27\Scripts.
Try this code:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
url= "https://poloniex.com/exchange"
driver.maximize_window()
driver.get(url)
time.sleep(5)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
High = soup.findAll("div", {"class": "high info"})[0].string
print(High)
driver.quit()
It will print:
0.02410000
Hope this helps
The web page runs JavaScript, and because of this the request doesn't return a complete result (the JS code in this case is needed to finish building the page).
I use Selenium to solve this kind of problem.
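A quick way to confirm that a value is injected by JavaScript: fetch the raw HTML with requests and check whether the target class appears in it at all. A minimal sketch:
import requests

raw = requests.get('https://poloniex.com/exchange').text
# If the class never appears in the raw response, the element is
# created client-side, and only a rendering browser will see it.
print('high info' in raw)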
