Can't get anything back on some web pages using Beautiful Soup - python

I'm trying to scrape this page of job posts, the posts are buried in a bunch of divs but ultimately contained in an unordered list, when I try to retrieve the list using find_all I get None returned either by using the tag id or class. Is there anything I'm doing wrong?
url = "https://resumes.indeed.com/search?l=&q=python&searchFields=jt"
source = requests.get(url).text
soup = BeautifulSoup(source, "lxml")
match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")
print(match)

Try this instead
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
executable_path = {'executable_path':ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless = False)
url = "https://resumes.indeed.com/search?l=&q=python&searchFields=jt"
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")
# This is the next you get but it's not structured, not sure if that's what you are looking for.
match[0].text

First, I noticed that the given URL doesn't have any element using class
rezemp-ResumeSearchPage-resultsList
So I change this to "https://resumes.indeed.com/search?l=&lmd=all&q=python&searchFields=jt", i.e. choose 'show all resumes' on that website.
then use selenium to load js:
import lxml
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument('--headless')
url = "https://resumes.indeed.com/search?l=&lmd=all&q=python&searchFields=jt"
browser=webdriver.Chrome(options=chrome_options,executable_path="***\\chromedriver.exe")
browser.get(url)
soup = BeautifulSoup(browser.page_source, "lxml")
match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")
print(match)
Got it

You should try this
list = []
for m in match:
list.append(m)
print(list)

Related

BeautifulSoup4 find multiple href's links with specific text in links

I'm trying filter all href links with the string "3080" in it, I saw some examples, but I just can't apply them to my code. Can someone tell me how to print only the links.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import driver_functions
gpu = '3080'
url = f'https://www.alternate.de/listing.xhtml?q={gpu}'
options = webdriver.ChromeOptions()
options.add_argument('--headless')
if __name__ == '__main__':
browser = webdriver.Chrome(options=options, service=Service('chromedriver.exe'))
try:
browser.get(url)
time.sleep(2)
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
gpu_list = soup.select("a", class_="grid-container listing")
for link in gpu_list:
print(link['href'])
browser.quit()
except:
driver_functions.browserstatus(browser)
Output
You could use a css attribute = value css selector with * contains operator to target hrefs, within the listings, that contain that gpu variable. You can obviously develop this css selector list if you find edge cases to account for. I only looked at the url given.
gpu_links= [i['href'] for i in soup.select(f".listing [href*='{gpu}']")]
Try this as your selector gpu_list = soup.select('#lazyListingContainer > div > div > div.grid-container.listing > a')

Scraping using beautiful soup not working fully?

I was trying to scrape some data using BeautifulSoup on python from the site 'https://www.geappliances.com/ge-appliances/kitchen/ranges/' which has some products.
import unittest, time, random
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import pandas as pd
links = []
browser = webdriver.Firefox(executable_path="C:\\Users\\drivers\\geckodriver\\win64\\v0.29.1\\geckodriver.exe")
browser.get("https://www.geappliances.com/ge-appliances/kitchen/ranges/")
content = browser.page_source
soup = BeautifulSoup(content, "html.parser")
for main in soup.findAll('li', attrs = {'class' : 'product'}):
name=main.find('a', href=True)
if (name != ''):
links.append((name.get('href')).strip())
print("Got links : ", len(links))
exit()
Here in output I get:-
Got links: 0
I printed the soup and saw that this part was not there in the soup. I have been trying to get around this problem to no avail.
Am I doing something wrong? Any suggestion are appreciated. Thanks.
Study the source of the webpage.Check your findAll function.
You should wait till the page loads. Use time.sleep() to pause execution for a while.
You can try like this.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
url = 'https://www.geappliances.com/ge-appliances/kitchen/ranges/'
driver = webdriver.Chrome("chromedriver.exe")
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')
u = soup.find('ul', class_='productGrid')
for item in u.find_all('li', class_='product'):
print(item.find('a')['href'])
driver.close()
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Induction-Fingerprint-Resistant-Range-with-In-Oven-Camera-PHS93XYPFS
/appliance/GE-Profile-30-Electric-Pizza-Oven-PS96PZRSS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Double-Oven-Convection-Fingerprint-Resistant-Range-PGS960YPFS
/appliance/GE-Profile-30-Smart-Dual-Fuel-Slide-In-Front-Control-Fingerprint-Resistant-Range-P2S930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Double-Oven-Convection-Fingerprint-Resistant-Range-PS960YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Induction-and-Convection-Range-with-No-Preheat-Air-Fry-PHS930BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Fingerprint-Resistant-Front-Control-Induction-and-Convection-Range-with-No-Preheat-Air-Fry-PHS930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Range-with-No-Preheat-Air-Fry-PGS930BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Front-Control-Gas-Fingerprint-Resistant-Range-with-No-Preheat-Air-Fry-PGS930YPFS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Convection-Range-with-No-Preheat-Air-Fry-PSS93BPTS
/appliance/GE-Profile-30-Smart-Slide-In-Electric-Convection-Fingerprint-Resistant-Range-with-No-Preheat-Air-Fry-PSS93YPFS
/appliance/GE-30-Slide-In-Front-Control-Gas-Double-Oven-Range-JGSS86SPSS

Why can't I use ".text" while scraping table headers with BeautifulSoup to remove unwanted HTML

When I run this code, I can see that the headers list was populated with the results I want, however they are surrounded in some html I don't want to keep.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# barchart.com uses javascript, so for now I need selenium to get full html
url = 'https://www.barchart.com/stocks/quotes/qqq/constituents'
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
browser = webdriver.Chrome(options=chrome_options)
browser.get(url)
page = browser.page_source
# BeautifulSoup find table
soup = BeautifulSoup(page, 'lxml')
table = soup.find("table")
browser.quit()
# create list headers, then populate with th tagged cells
headers = []
for i in table.find_all('th'):
title = i()
headers.append(title)
So I tried:
for i in table.find_all('th'):
title = i.text()
headers.append(title)
Which returned "TypeError: 'str' object is not callable"
This seemed to work in some example documentation, but the wikipedia tables used there seemed simpler than the ones on Barchart. Any ideas?
As #MendelG pointed out, the error lies in i.text() because text is a property and not a function.
Alternatively you can also use get_text() which is a function.
I would also suggest adding a strip() to get rid of extra whitespace around the text. Or if you want to use get_text() it has this built in:
title = i.get_text(strip=True)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# barchart.com uses javascript, so for now I need selenium to get full html
url = 'https://www.barchart.com/stocks/quotes/qqq/constituents'
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
browser = webdriver.Chrome(options=chrome_options)
browser.get(url)
page = browser.page_source
# BeautifulSoup find table
soup = BeautifulSoup(page, 'lxml')
table = soup.find("table")
browser.quit()
# create list headers, then populate with th tagged cells
headers = []
for i in table.find_all('th'):
title = i.text.strip()
# Or alternatively:
#title = i.get_text(strip=True)
headers.append(title)
print(headers)
This prints:
['Symbol', 'Name', '% Holding', 'Shares', 'Links']

Python BeautifulSoup soup.find

I want to scrape some specific data from a website using urllib and BeautifulSoup.
Im trying to fetch the text "190.0 kg". I have tried as you can see in my code to use attrs={'class': 'col-md-7'}
but this returns the wrong result. Is there any way to specify that I want it to return the text between <h3>?
from urllib.request import urlopen
from bs4 import BeautifulSoup
# specify the url
quote_page = 'https://styrkeloft.no/live.styrkeloft.no/v2/?test-stevne'
# query the website and return the html to the variable 'page'
page = urlopen(quote_page)
# parse the html using beautiful soup
soup = BeautifulSoup(page, 'html.parser')
# take out the <div> of name and get its value
Weight_box = soup.find('div', attrs={'class': 'col-md-7'})
name = name_box.text.strip()
print (name)
Since this content is dynamically generated there is no way to access that data using the requests module.
You can use selenium webdriver to accomplish this:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_driver = "path_to_chromedriver"
driver = webdriver.Chrome(chrome_options=chrome_options,executable_path=chrome_driver)
driver.get('https://styrkeloft.no/live.styrkeloft.no/v2/?test-stevne')
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
current_lifter = soup.find("div", {"id":"current_lifter"})
value = current_lifter.find_all("div", {'class':'row'})[2].find_all("h3")[0].text
driver.quit()
print(value)
Just be sure to have the chromedriver executable in your machine.

BeautifulSoup4 not finding div

I've been experimenting with BeautifulSoup4 lately and have found pretty good success across a range of different sites. Though I have stumbled into an issue when trying to scrape amazon.com.
Using the code below, when printing 'soup' I can see the div, but when I search for the div itself, BS4 brings back null. I think the issue is in how the html is being processed. I've tried with LXML and html5lib. Any ideas?
import bs4 as bs
import urllib3
urllib3.disable_warnings()
http = urllib3.PoolManager()
url = 'https://www.amazon.com/gp/goldbox/ref=gbps_fcr_s-4_a870_dls_UPCM?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL,includedAccessTypes:GIVEAWAY_DEAL,sortOrder:BY_SCORE,enforcedCategories:2619533011,dealTypes:LIGHTNING_DEAL&pf_rd_p=56200e05-4eb2-42ca-9723-af0811ada870&pf_rd_s=slot-4&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=PQNZWZRKVD93HXXVG5A7&ie=UTF8'
original = http.request('Get',url)
soup = bs.BeautifulSoup(original.data, 'lxml')
div = soup.find_all('div', {'class':'a-row padCenterContainer'})
You could use selenium to allow the javascript to load before grabbing the html.
from bs4 import BeautifulSoup
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=options)
url = 'https://www.amazon.com/gp/goldbox/ref=gbps_fcr_s-4_a870_dls_UPCM?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL,includedAccessTypes:GIVEAWAY_DEAL,sortOrder:BY_SCORE,enforcedCategories:2619533011,dealTypes:LIGHTNING_DEAL&pf_rd_p=56200e05-4eb2-42ca-9723-af0811ada870&pf_rd_s=slot-4&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=PQNZWZRKVD93HXXVG5A7&ie=UTF8'
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
div = soup.find('div', {'class': 'a-row padCenterContainer'})
print(div.prettify())
The output of this script was too long to put in this question but here is a link to it

Categories

Resources