Fixed URL scraping (Selenium) - Python

Hello! I have a question.
I want to scrape company names and ticker symbols from "https://www.nasdaq.com/market-activity/stocks/screener".
I think Selenium can solve this, but my code only works on the first page.
I'm sorry for my poor English.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
import time

nasduq_all=[] #ticker+company
nasduq_ticker=[] #ticker list (odd rows)
nasduq_company=[] #company list (even rows)
dict_nasduq={} #ticker+company

page_url = 'https://www.nasdaq.com/market-activity/stocks/screener'

driver = webdriver.Chrome('/Users/kim/Desktop/dgaja/chromedriver')
driver.implicitly_wait(2)
driver.get(page_url)
time.sleep(1)

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

#First of all, I'm only trying to go to page 2. :[
driver.find_element_by_xpath("/html/body/div[2]/div/main/div[2]/article/div[3]/div[1]/div/div/div[3]/div[5]/button[2]").send_keys(Keys.ENTER)
time.sleep(10)

ticker = soup.find("tbody", {"class": "nasdaq-screener__table-body"}).find_all('a')
for i in ticker: #text
    name = i.text
    nasduq_all.append(name)
print(nasduq_all)
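Note that soup is built from driver.page_source *before* the button is clicked, so the parsed HTML is always page 1; you need to re-read page_source after each click. A minimal sketch of a page loop (the "pagination__next" class name for the next-page button is an assumption; check the real selector in DevTools):

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome('/Users/kim/Desktop/dgaja/chromedriver')
driver.get('https://www.nasdaq.com/market-activity/stocks/screener')
time.sleep(5)

nasduq_all = []
for page in range(3):  # first three pages
    # parse the page that is currently rendered
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    tbody = soup.find('tbody', {'class': 'nasdaq-screener__table-body'})
    if tbody:
        nasduq_all.extend(a.text for a in tbody.find_all('a'))
    # only then move to the next page and let the table re-render
    next_button = driver.find_element(By.CLASS_NAME, 'pagination__next')  # assumed class name
    driver.execute_script('arguments[0].click();', next_button)
    time.sleep(3)

print(nasduq_all)
driver.quit()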

Related

can't get selenium to give me gpu prices

I'm new to web scraping and can't get the prices; I have found them in the terminal output, but the list appears empty despite this.
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
url = "https://www.kuantokusta.pt/p/6894201/msi-geforce-rtx-3080-ventus-3x-plus-oc-lhr-10gb-gddr6"
driver.get(url)
html = driver.page_source
doc = BeautifulSoup(html , "html.parser")
print(doc.prettify())
prices = doc.find_all(text="EUR")
print(prices)
There is a CSS class .prices, and more specifically .old-price and .new-price.
So you could use them as follows:
prices = doc.select('.prices')
or
prices = doc.select('.old-price')
or
prices = doc.select('.new-price')
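Putting that together, a minimal sketch that prints the price text (the class names come from the answer above; the page markup may have changed since):

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get("https://www.kuantokusta.pt/p/6894201/msi-geforce-rtx-3080-ventus-3x-plus-oc-lhr-10gb-gddr6")
doc = BeautifulSoup(driver.page_source, "html.parser")

# select() accepts any CSS selector, including a comma-separated group
for tag in doc.select(".old-price, .new-price"):
    print(tag.get_text(strip=True))

driver.quit()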

BeautifulSoup not extracting all pages

I'm trying to practice some web scraping for a school project, but I can't figure out why my script isn't pulling all the listings for a particular region. Would appreciate any help! I've been trying to figure it out for hours!
(For simplicity, I'm just sharing one small sub-section of a page I'm trying to scrape. I'm hoping once I figure out what's wrong here, I can apply it to other regions.)
(You might need to create an account and log in to see prices before scraping.)
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://condos.ca')
def get_page(page):
    url = f'https://condos.ca/toronto/condos-for-sale?size_range=300%2C999999999&property_type=Condo%20Apt%2CComm%20Element%20Condo%2CLeasehold%20Condo&mode=Sold&end_date_unix=exact%2C2011&sublocality_id=22&page={page}'
    driver.get(url)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    return soup

prices=[]
location=[]
for page in range(5):
    soup = get_page(page)
    for tag in soup.find_all('div', class_ = 'styles___AskingPrice-sc-54qk44-4 styles___ClosePrice-sc-54qk44-5 dHPUdq hwkkXU'):
        prices.append(tag.get_text())
    for tag in soup.find_all('address', class_ = 'styles___Address-sc-54qk44-13 gTwVlm'):
        location.append(tag.get_text())
For some reason, I'm only getting an output with 48 records, when it should be around 146.
Thanks!
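No answer was posted here, but two things stand out (both assumptions worth checking): range(5) requests pages 0-4 rather than 1-5, and the listings are rendered by JavaScript, so page_source may be captured before the cards finish loading. The hashed styled-components class names (styles___AskingPrice-sc-...) are also brittle, since they change between site builds. A sketch that waits for the listings and matches the bare address tag instead:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()

def get_page(page):
    url = f'https://condos.ca/toronto/condos-for-sale?size_range=300%2C999999999&property_type=Condo%20Apt%2CComm%20Element%20Condo%2CLeasehold%20Condo&mode=Sold&end_date_unix=exact%2C2011&sublocality_id=22&page={page}'
    driver.get(url)
    # parse only after at least one listing card has rendered
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'address')))
    return BeautifulSoup(driver.page_source, 'lxml')

location = []
for page in range(1, 6):  # pages 1-5; range(5) would start at page=0
    soup = get_page(page)
    for tag in soup.find_all('address'):
        location.append(tag.get_text())
print(len(location))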

Scraping using beautiful soup not working fully?

I was trying to scrape some data with BeautifulSoup in Python from the site 'https://www.geappliances.com/ge-appliances/kitchen/ranges/', which lists some products.
import unittest, time, random
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import pandas as pd
links = []
browser = webdriver.Firefox(executable_path="C:\\Users\\drivers\\geckodriver\\win64\\v0.29.1\\geckodriver.exe")
browser.get("https://www.geappliances.com/ge-appliances/kitchen/ranges/")
content = browser.page_source
soup = BeautifulSoup(content, "html.parser")
for main in soup.findAll('li', attrs = {'class' : 'product'}):
    name = main.find('a', href=True)
    if (name != ''):
        links.append((name.get('href')).strip())
print("Got links : ", len(links))
exit()
In the output I get:
Got links: 0
I printed the soup and saw that this part was not present in it. I have been trying to get around this problem to no avail.
Am I doing something wrong? Any suggestions are appreciated. Thanks.
Study the source of the webpage and check your findAll call.
You should wait till the page loads; use time.sleep() to pause execution for a while.
You can try it like this:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
url = 'https://www.geappliances.com/ge-appliances/kitchen/ranges/'
driver = webdriver.Chrome("chromedriver.exe")
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')
u = soup.find('ul', class_='productGrid')
for item in u.find_all('li', class_='product'):
    print(item.find('a')['href'])
driver.close()
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Induction-Fingerprint-Resistant-Range-with-In-Oven-Camera-PHS93XYPFS
/appliance/GE-Profile-30-Electric-Pizza-Oven-PS96PZRSS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Double-Oven-Convection-Fingerprint-Resistant-Range-PGS960YPFS
/appliance/GE-Profile-30-Smart-Dual-Fuel-Slide-In-Front-Control-Fingerprint-Resistant-Range-P2S930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Double-Oven-Convection-Fingerprint-Resistant-Range-PS960YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Induction-and-Convection-Range-with-No-Preheat-Air-Fry-PHS930BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Fingerprint-Resistant-Front-Control-Induction-and-Convection-Range-with-No-Preheat-Air-Fry-PHS930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Range-with-No-Preheat-Air-Fry-PGS930BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Fingerprint-Resistant-Range-with-No-Preheat-Air-Fry-PGS930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Convection-Range-with-No-Preheat-Air-Fry-PSS93BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Convection-Fingerprint-Resistant-Range-with-No-Preheat-Air-Fry-PSS93YPFS
/appliance/GE-30-Slide-In-Front-Control-Gas-Double-Oven-Range-JGSS86SPSS
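A fixed time.sleep(5) either wastes time or is too short on a slow connection. A sketch of the same scrape with an explicit wait instead (the 'productGrid' class comes from the answer above):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.geappliances.com/ge-appliances/kitchen/ranges/')
# block until the product grid exists (up to 10 s) instead of sleeping blindly
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'productGrid')))
soup = BeautifulSoup(driver.page_source, 'lxml')
for item in soup.find('ul', class_='productGrid').find_all('li', class_='product'):
    print(item.find('a')['href'])
driver.close()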

How to parse a dynamically loading wildberries page?

I need to extract data from a comment block that is loaded dynamically. I have tried many methods from the Internet; all of them return an empty array.
The program cannot reach the comments because they are loaded only when the user scrolls down to them. How can I get the full content of the page?
Page - https://www.wildberries.ru/catalog/22063490/detail.aspx?targetUrl=XS
Here is the code that I have now.
import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
from selenium import webdriver
import time
import lxml
driver = webdriver.Chrome(executable_path=r"D:\Downloads\chromedriver.exe")
safe_delay = 15
def read_comments(url):
    response = req.get(url)
    response.encoding = 'utf-8'
    driver.get(url)
    time.sleep(safe_delay)
    html = driver.page_source
    soup = bs(html, "html.parser")
    #soup = bs(response.text, 'lxml')
    coms = soup.find_all('div', class_='comment j-b-comment')
    return coms
print(read_comments('https://www.wildberries.ru/catalog/22063490/detail.aspx?targetUrl=XS'))
You will need to scroll down the page for the comments to load. You can do this by sending the space key repeatedly, with a small sleep value to give the page time to load.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup as bs
import time
driver = webdriver.Chrome()
url = 'https://www.wildberries.ru/catalog/22063490/detail.aspx?targetUrl=XS'
def read_comments(url):
    driver.get(url)
    for x in range(10):
        actions = ActionChains(driver)
        actions.send_keys(Keys.SPACE)
        actions.perform()
        time.sleep(.5)
    html = driver.page_source
    soup = bs(html, "html.parser")
    coms = soup.find_all('div', class_='comment j-b-comment')
    return coms
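The answer never actually calls the function; for completeness, a usage example that prints each comment's text, mirroring the print in the question:

for com in read_comments(url):
    print(com.get_text(strip=True))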

Webscraping - Python - Can't find links in html

I'm trying to scrape all the links from https://www.udemy.com/courses/search/?q=sql&src=ukw&lang=en; however, even without selecting a specific element, my code retrieves no links. Please see my code below.
import bs4, requests as rq
Link = 'https://www.udemy.com/courses/search/?q=sql&src=ukw&lang=en'
RQOBJ = rq.get(Link)
BS4OBJ = bs4.BeautifulSoup(RQOBJ.text, 'html.parser')
print(BS4OBJ)
Hoping you want the links of the courses on the page; this code will help:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
baseurl='https://www.udemy.com'
url="https://www.udemy.com/courses/search/?q=sql&src=ukw&lang=en"
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
time.sleep(5)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
courseLink = soup.findAll("a", {"class": "card__title",'href': True})
for link in courseLink:
    print(baseurl + link['href'])
driver.quit()
It will print:
https://www.udemy.com/the-complete-sql-bootcamp/
https://www.udemy.com/the-complete-oracle-sql-certification-course/
https://www.udemy.com/introduction-to-sql23/
https://www.udemy.com/oracle-sql-12c-become-an-sql-developer-with-subtitle/
https://www.udemy.com/sql-advanced/
https://www.udemy.com/sql-for-newbs/
https://www.udemy.com/sql-for-marketers-data-analytics-data-science-big-data/
https://www.udemy.com/sql-for-punk-analytics/
https://www.udemy.com/sql-basics-for-beginners/
https://www.udemy.com/oracle-sql-step-by-step-approach/
https://www.udemy.com/microsoft-sql-for-beginners/
https://www.udemy.com/sql-tutorial-learn-sql-with-mysql-database-beginner2expert/
The website uses JavaScript to fetch the data, so you should use Selenium.
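To confirm that diagnosis, you can compare what plain requests receives with what Selenium renders; the static HTML served to requests contains no course cards (the "card__title" class comes from the answer above):

import requests
from bs4 import BeautifulSoup

url = "https://www.udemy.com/courses/search/?q=sql&src=ukw&lang=en"
static_soup = BeautifulSoup(requests.get(url).text, "html.parser")
# the course cards are injected by JavaScript, so none exist in the raw HTML
print(len(static_soup.find_all("a", {"class": "card__title"})))  # expected: 0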

Categories

Resources