Using Beautiful Soup on a stock tracker - Python

I'm trying to scrape data from Robintrack, but I cannot get the data from the increase/decrease section. I can only scrape the home-page data. Here is my code:
import bs4
import requests
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
robin_track_url= 'https://robintrack.net/popularity_changes?changeType=increases'
#r = requests.get('https://robintrack.net/popularity_changes?changeType=increases')
#soup = bs4.BeautifulSoup(r.text, "xml")
#Grabs and downloads html page
uClient = uReq(robin_track_url)
page_html = uClient.read()
uClient.close()
#Does HTML parsing
page_soup = soup(page_html, "html.parser")
print("On Page")
print(page_soup.body)
stocks = page_soup.body.findAll("div", {"class":"ReactVirtualized__Table__row"})
print(len(stocks))
print(stocks)
Am I doing something wrong?

Your version will not work because the data you want to load is rendered via JavaScript; requests (and urlopen) only fetch the static page. If you want that data, query the site's API directly:
requests.get('https://robintrack.net/api/largest_popularity_increases?hours_ago=24&limit=50&percentage=false&min_popularity=50&start_index=0').json()
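A minimal sketch of using that endpoint (the response shape is an assumption here; inspect the JSON to see the actual keys):
import requests

# Query Robintrack's JSON API directly instead of scraping the rendered page.
# The query parameters mirror what the page's own JavaScript sends.
api_url = ('https://robintrack.net/api/largest_popularity_increases'
           '?hours_ago=24&limit=50&percentage=false&min_popularity=50&start_index=0')
data = requests.get(api_url).json()

# Assuming the endpoint returns a JSON list, print the first few entries.
for row in data[:5]:
    print(row)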

Related

Python BeautifulSoup missing information inside tag: how to include the information inside the tag

I am trying to scrape some data from a website, but when I want to print it I just get the tags back, without the information in them.
This is the code:
#Imports
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#URL
my_url = 'https://website.com'
#Opening connection grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
#closing the page
uClient.close()
#Parse html
page_soup = soup(page_html,"html.parser")
price = page_soup.findAll("div",{"id":"lastTrade"})
print(price)
This is what I get back:
[<div id="lastTrade"> </div>]
So can anyone tell me what I have to change or add so that I receive the actual information from inside this tag?
Maybe loop through your list like this:
for res in price:
    print(res.text)
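Note that if the div is filled in by JavaScript after the page loads, the static HTML only contains the empty tag shown above and the loop will only print whitespace. As a quick check, a sketch:
for res in price:
    text = res.get_text(strip=True)
    # An empty string here means the value is not in the static HTML; it is
    # likely injected client-side, so you'd need a browser tool or the site's API.
    print(repr(text))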

BeautifulSoup webscraper problem: can't find tables on webpage

I want to get tables from this website with this code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.flashscore.pl/pilka-nozna/'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.find_all('table', {'class': 'soccer'})
print(len(containers))
But when I try to check how many tables I got with print(len(containers)), I get 0.
Any solutions?
It's possible the page is dynamic. You can use requests-html, which allows you to let the page render before pulling the HTML, or you can use Selenium, as I did here. This resulted in 42 elements of table class="soccer":
import bs4
from selenium import webdriver

url = 'https://www.flashscore.pl/pilka-nozna/'
# Use a raw string so the backslashes in the Windows path stay literal
browser = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
browser.get(url)
html = browser.page_source
soup = bs4.BeautifulSoup(html, 'html.parser')
containers = soup.find_all('table', {'class': 'soccer'})
browser.close()
print(len(containers))
Output:
42
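For completeness, a sketch of the requests-html alternative mentioned above (assuming the same CSS selector works once the page has rendered):
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://www.flashscore.pl/pilka-nozna/')
# Run the page's JavaScript before parsing (downloads Chromium on first use)
r.html.render()
tables = r.html.find('table.soccer')
print(len(tables))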

Can't parse a 2nd page using beautiful soup

I am trying to navigate a website using BeautifulSoup. I open the first page and find the links I want to follow, but when I ask Beautiful Soup to open the next page, none of the HTML is parsed and it just returns this:
<function scraper at 0x000001E3684D0E18>
I have tried opening the second page in its own script and it works just fine so the problem has to do with parsing a page from another page.
I have ~2000 links I need to go through, so I created a function to work through them. Here's my script so far:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import lxml
# The first webpage I'm parsing
my_url = 'https://mars.nasa.gov/msl/multimedia/raw/'
# calls the urlopen function from the request module of the urllib module
# AKA opens up the connection and grabs the page
uClient = uReq(my_url)
# imports the entire webpage from html format into python.
# If webpage has lots of data this can take a long time and take up a lot of
# space or crash
page_html = uClient.read()
# closes the client
uClient.close()
# parses the HTML using bs4
page_soup = soup(page_html, "lxml")
# finds the categories for the types of images on the site, category 1 is
# RHAZ
containers = page_soup.findAll("div", {"class": "image_list"})
RHAZ = containers[1]
# prints the links in RHAZ
links = []
for link in RHAZ.find_all('a'):
    # removes unwanted characters from the link making it usable.
    formatted_link = my_url+str(link).replace('\n','').split('>')[0].replace('%5F\"','_').replace('amp;','').replace('<a href=\"./','')
    links.append(formatted_link)
print (links[1])
# I know i should be defining a function here.. so ill give it a go.
def scraper():
    pic_page = uReq('links[1]') #calls the first link in the list
    page_open = uClient.read() #reads the page in a python accessible format
    uClient.close() #closes the page after it's been stored to memory
    soup_open = soup(page_open, "lxml")
    print (soup_open)
print (scraper)
Do I need to clear the previously loaded HTML in BeautifulSoup so I can open the next page? If so, how would I do this? Thanks for any help.
You need to make requests for the URLs scraped from the first page. (Also, the <function scraper at ...> output is because print(scraper) prints the function object itself instead of calling it; you would invoke it with scraper().) Check this code:
from bs4 import BeautifulSoup
import requests

url = 'https://mars.nasa.gov/msl/multimedia/raw'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'lxml')
img_list = soup.find_all('div', attrs={'class': 'image_list'})
for i in img_list:
    image = i.find_all('a')
    for x in image:
        href = x['href'].replace('.', '')
        link = (str(url)+str(href))
        req2 = requests.get(link)
        soup2 = BeautifulSoup(req2.content, 'lxml')
        img_list2 = soup2.find_all('div', attrs={'class': 'RawImageUTC'})
        for l in img_list2:
            image2 = l.find_all('a')
            for y in image2:
                href2 = y['href']
                print(href2)
Output:
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02172/opgs/edr/fcam/FLB_590315340EDR_F0722464FHAZ00337M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02172/opgs/edr/fcam/FRB_590315340EDR_F0722464FHAZ00337M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02172/opgs/edr/fcam/FLB_590315340EDR_T0722464FHAZ00337M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02172/opgs/edr/fcam/FRB_590315340EDR_T0722464FHAZ00337M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02171/opgs/edr/fcam/FLB_590214757EDR_F0722464FHAZ00341M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02171/opgs/edr/fcam/FRB_590214757EDR_F0722464FHAZ00341M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02171/opgs/edr/fcam/FLB_590214757EDR_T0722464FHAZ00341M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02171/opgs/edr/fcam/FRB_590214757EDR_T0722464FHAZ00341M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02170/opgs/edr/fcam/FLB_590149941EDR_F0722464FHAZ00337M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02170/opgs/edr/fcam/FRB_590149941EDR_F0722464FHAZ00337M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02170/opgs/edr/fcam/FLB_590134317EDR_S0722464FHAZ00214M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02170/opgs/edr/fcam/FLB_590134106EDR_S0722464FHAZ00214M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02170/opgs/edr/fcam/FLB_590134065EDR_S0722464FHAZ00214M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02170/opgs/edr/fcam/FLB_590134052EDR_S0722464FHAZ00222M_.JPG
http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/02170/opgs/edr/fcam/FLB_590133948EDR_S0722464FHAZ00222M_.JPG

Beautiful Soup returning empty HTML

So this is my second question regarding Beautiful Soup (sorry, I'm a beginner).
I was trying to fetch data from this website:
https://www.ccna8.com/ccna4-v6-0-final-exam-full-100-2017/
My Code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
url = 'https://www.ccna8.com/ccna4-v6-0-final-exam-full-100-2017/'
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "lxml")
print(page_soup)
But for some reason it returns an empty string.
I've been searching similar threads, and apparently it has something to do with websites using external APIs, but this website doesn't.
It seems that the content encoding of the response is gzip, so you need to handle that before you can process the HTML response.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import gzip
url = 'https://www.ccna8.com/ccna4-v6-0-final-exam-full-100-2017/'
uClient = uReq(url)
page_html = gzip.decompress(uClient.read())
uClient.close()
page_soup = soup(page_html, "lxml")
print(page_soup)
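As a slightly more defensive sketch, you could check the response header first, so the code also works if the server stops gzipping (this assumes urllib's standard HTTPResponse headers):
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import gzip

url = 'https://www.ccna8.com/ccna4-v6-0-final-exam-full-100-2017/'
uClient = uReq(url)
page_html = uClient.read()
# Only decompress when the server actually gzipped the body
if uClient.headers.get('Content-Encoding') == 'gzip':
    page_html = gzip.decompress(page_html)
uClient.close()
page_soup = soup(page_html, "lxml")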
Try using the requests module, which decompresses gzip responses automatically.
Ex:
import requests
from bs4 import BeautifulSoup as soup
url = 'https://www.ccna8.com/ccna4-v6-0-final-exam-full-100-2017/'
uClient = requests.get(url)
page_soup = soup(uClient.text, "lxml")
print(page_soup)

How to find a particular class in BeautifulSoup using findAll

findAll doesn't find the class I need. However, I was able to find the class above that one, but the data structure there is not that well organized.
Do you know what we can do to get the data, or to organize the output from the class above, which has all the data together?
Please see the HTML below and the images.
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.vivino.com/explore?e=eJzLLbI11jNVy83MszU0UMtNrLA1MVBLrrQtLVYrsDVUK7ZNTlQrS7YtKSpNVSsviY4FioEpIwhlDKFMIJQ5VM4EAJCfGxQ='
#Opening a connection
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#html parse
page_soup = soup(page_html, "html.parser")
container = page_soup.findAll("div", {"class":"wine-explorer__results__item"})
len(container)
Thanks everyone; as you all suggested, a module that can run JavaScript was needed to select that class. I've used Selenium in this case; however, PyQt5 might be a better option.
import bs4
from bs4 import BeautifulSoup as soup
from selenium import webdriver

my_url = 'https://www.vivino.com/explore?e=eJzLLbI11jNVy83MszU0UMtNrLA1MVBLrrQtLVYrsDVUK7ZNTlQrS7YtKSpNVSsviY4FioEpIwhlDKFMIJQ5VM4EAJCfGxQ='
#Open the page in a real browser so its JavaScript can run
driver = webdriver.Firefox()
driver.get(my_url)
#Grab the rendered HTML and parse it
html = driver.execute_script("return document.documentElement.outerHTML")
#print(html)
html_page_soup = soup(html, "html.parser")
container = html_page_soup.findAll("div", {"class": "wine-explorer__results__item"})
len(container)
You can use the dryscrape module with bs4 because the wine-explorer selector is created by JavaScript. The dryscrape module gives you JavaScript support.
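A minimal sketch of that approach, assuming dryscrape's default WebKit session:
import dryscrape
from bs4 import BeautifulSoup

my_url = 'https://www.vivino.com/explore?e=eJzLLbI11jNVy83MszU0UMtNrLA1MVBLrrQtLVYrsDVUK7ZNTlQrS7YtKSpNVSsviY4FioEpIwhlDKFMIJQ5VM4EAJCfGxQ='
session = dryscrape.Session()
session.visit(my_url)
# body() returns the HTML after the page's JavaScript has run
rendered = session.body()
page_soup = BeautifulSoup(rendered, "html.parser")
container = page_soup.findAll("div", {"class": "wine-explorer__results__item"})
print(len(container))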
Try using the following instead:
container = page_soup.findAll("div", {"class": "wine-explorer__results"})
