UPDATE:
After Pygirl's suggestion I am attempting to use Selenium, but I'm still only getting the Sector data:
import requests
import csv
import pandas as pd
from requests import get
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://eresearch.fidelity.com/eresearch/markets_sectors/sectors/si_performance.jhtml?tab=siperformance')
action = ActionChains(driver)
sleep(4)
industry_link = driver.find_element_by_css_selector('#tab_industry')
action.move_to_element(industry_link)
action.click(industry_link)
action.perform()
url = driver.current_url
r = requests.get(url)
sleep(10)
df_industry_list = pd.read_html(r.text)
df_industry = df_industry_list[0]
df_industry.head()
df_industry.to_excel("SectorPerf.xlsx", sheet_name = "Industry")
I'm trying to get the data from the Industry tab of this URL: https://eresearch.fidelity.com/eresearch/markets_sectors/sectors/si_performance.jhtml?tab=siperformance
I have written some code that will get the Sector tab information; however, my approach doesn't seem to work for the Industry tab, as the URL appears to be the same for both the Sector and the Industry tabs...
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from requests import get
url = 'https://eresearch.fidelity.com/eresearch/markets_sectors/sectors/si_performance.jhtml?tab=siperformance'
r = requests.get(url)
#soup = BeautifulSoup(response.content, 'html.parser')
#sectors = soup.find("table", id="perfTableSort")
df_list = pd.read_html(r.text)
df = df_list[0]
df.head()
#print(df)
Given that the URL seems to be the same (at least it shows the same in my address bar in Chrome), how can I also get the Industry data?
Thanks
Try this:
import requests
import pandas as pd
url = 'https://eresearch.fidelity.com/eresearch/markets_sectors/si_performance.jhtml'
industry = {'tab': 'industry'}
sector = {'tab': 'sector'}
r = requests.post(url, data=industry)
#soup = BeautifulSoup(response.content, 'html.parser')
#sectors = soup.find("table", id="perfTableSort")
df_list = pd.read_html(r.text)
df = df_list[0]
df.head()
Now you can pass data=industry or data=sector to get the desired result.
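For example, a minimal end-to-end sketch (assuming, as above, that the endpoint accepts both form values) that writes each tab to its own sheet of one workbook:
import requests
import pandas as pd
url = 'https://eresearch.fidelity.com/eresearch/markets_sectors/si_performance.jhtml'
# POST once per tab and take the first table on each page
df_sector = pd.read_html(requests.post(url, data={'tab': 'sector'}).text)[0]
df_industry = pd.read_html(requests.post(url, data={'tab': 'industry'}).text)[0]
# One sheet per tab in a single Excel file
with pd.ExcelWriter('SectorPerf.xlsx') as writer:
    df_sector.to_excel(writer, sheet_name='Sector', index=False)
    df_industry.to_excel(writer, sheet_name='Industry', index=False)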
Alternatively, use driver.page_source: extract the table part and store it as CSV or Excel.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://eresearch.fidelity.com/eresearch/markets_sectors/sectors/si_performance.jhtml?tab=siperformance')
# action = webdriver.ActionChains(driver)
print(driver.page_source) # <--- this will give you source code for Sector
sleep(2)
industry_link = driver.find_element_by_xpath('//*[@id="tab_industry"]')
# action.move_to_element(industry_link)
industry_link.click()
# action.perform()
sleep(2)  # give the Industry tab time to render before reading the source
print(driver.page_source) # <--- this will give you source code for Industry
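To actually store the table, one option is to hand the rendered source to pandas after the click; a short sketch (assuming the rendered page contains a parseable <table>):
import pandas as pd
# Parse the first table in the rendered Industry page and save it
df_industry = pd.read_html(driver.page_source)[0]
df_industry.to_csv('IndustryPerf.csv', index=False)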
Related
I'm trying to scrape the following table from this URL: https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show
This is my code:
import requests
from bs4 import BeautifulSoup
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
table = soup.find("table")
for row in table.findAll("tr"):
    print([i.text for i in row.findAll("td")])
However, my variable table returns None, even though there is clearly a table tag in the HTML code of the website. How do I get it?
The webpage is loaded dynamically and relies on JavaScript, so requests alone won't get the rendered table. You could use a browser automation tool such as Selenium instead.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
driver.get(url)
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
table = driver.find_element(By.TAG_NAME, 'table')
table_html = table.get_attribute('innerHTML')
# print('table html:', table_html)
for tr_web_element in table.find_elements(By.TAG_NAME, 'tr'):
    for td_web_element in tr_web_element.find_elements(By.TAG_NAME, 'td'):
        print(td_web_element.text)
driver.close()
Or see this answer to incorporate Selenium with BeautifulSoup.
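For instance, a minimal sketch of that combination (run before driver.close(), while the page is still open):
from bs4 import BeautifulSoup
# Hand the rendered page to BeautifulSoup and walk the table rows
soup = BeautifulSoup(driver.page_source, "lxml")
table = soup.find("table")
for row in table.findAll("tr"):
    print([td.text for td in row.findAll("td")])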
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import lxml
import openpyxl as op
# from lxml
html_text = 'https://twitter.com/videogamedeals/status/1352325118261948418/retweets'
#
driver = webdriver.Chrome(
    executable_path='C:/Users/atif/Downloads/chromedriver.exe')
# driver.implicitly_wait(30)
driver.get(html_text)
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
# body = soup.body
# titles = headers.find_all('a', class_='title fw500 ellipsis')
# for h in headers:
# # title = h.find('a', class_='title fw500 ellipsis').text
# print(h.a['href'])
# a_links = body.find_all("a")
names = soup.find_all(
"a.css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l")
print(len(names))
It is showing this error:
[17548:22900:0415/160654.715:ERROR:device_event_log_impl.cc(214)] [16:06:54.715] Bluetooth: bluetooth_adapter_winrt.cc:1162 RequestRadioAccessAsync failed: RadioAccessStatus::DeniedByUser Will not be able to change radio power.
and then prints 0, i.e. zero results.
Actually, you can get the names using only Selenium, without BeautifulSoup; here's the code for that:
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome(ChromeDriverManager().install())
# from lxml
html_text = 'https://twitter.com/videogamedeals/status/1352325118261948418/retweets'
# driver.implicitly_wait(30)
driver.get(html_text)
time.sleep(20)
names = driver.find_elements_by_xpath('//span[@class="css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0"]//span[@class="css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0"]')
for name in names:
    print(name.text)
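As a side note, the fixed time.sleep(20) can be replaced with an explicit wait; a short sketch using the WebDriverWait, EC, and By imports above:
# Wait up to 30 seconds for the first name span to render instead of sleeping blindly
wait = WebDriverWait(driver, 30)
wait.until(EC.presence_of_element_located((By.XPATH, '//span[@class="css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0"]')))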
My code goes to a webpage, and I want to scrape the href/HTML of each listing within that webpage.
(This code goes to a website which has 2 listings.)
I tried XPath and BeautifulSoup, but they return an empty list for me.
Here is the code:
import time
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
bracket=[]
driver.get('https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
eachRow=driver.find_element_by_partial_link_text('symposium')
print(eachRow.text)
I just ran the code you provided; BeautifulSoup set the soup variable with the full page source successfully:
soup = BeautifulSoup(page_source, 'html.parser')
and in the next line:
eachRow=driver.find_element_by_partial_link_text('symposium')
an exception was raised with the message:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"partial link text","selector":"symposium"}
It seems like you're using an incorrect selector; try something like:
element = driver.find_element_by_xpath("//a[@class='title ng-binding']")
The code I'm using:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
try:
    bracket = []
    driver.get(
        'https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    print(soup)
    element = driver.find_element_by_xpath("//a[@class='title ng-binding']")
    print(element.get_attribute('href'))
    elements = driver.find_elements_by_xpath("//a[@class='title ng-binding']")
    for el in elements:
        print(el.get_attribute('href'))
finally:
    driver.quit()
Updated code:
import time
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
bracket=[]
driver.get('https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
eachRow = driver.find_elements_by_xpath("//a[contains(@ui-sref,'symposium')]")
for row in eachRow:
    print(row.text)
You need to use find_elements (not find_element) if there is more than one element, and then iterate over them to see their values. Also, partial link text won't work because the 'symposium' text is embedded in another element, not regular link text, so XPath is needed.
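Since the original goal was the href of each listing, the same locator can also collect the links; a short sketch (these are Angular ui-sref anchors, so the href attribute is filled in client-side and the page must have rendered first):
eachRow = driver.find_elements_by_xpath("//a[contains(@ui-sref,'symposium')]")
# get_attribute('href') returns the resolved link for each anchor
bracket = [row.get_attribute('href') for row in eachRow]
print(bracket)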
I have the code to extract the job information from Indeed, but now I want to extract the link from the job title so I can open a new page and pull out the job description information.
I can see the link to the job posting on the HTML page, within the href attribute, but I'm not sure how to extract it?
import requests
import time
from random import randint
from bs4 import BeautifulSoup
import urllib, requests, re, pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"
options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options,executable_path='chromedriver')
driver.get("https://www.indeed.co.uk/automotive-engineer-jobs-in-uk")
soup=BeautifulSoup(driver.page_source, "lxml")
title = [tag.text.strip() for tag in soup.select('.jobtitle')]
company = [tag.text.strip() for tag in soup.select('.company')]
location = [tag.text.strip() for tag in soup.select('.location')]
for y in range(len(title)):
    tmpstring = (title[y] + ',' + company[y] + ',' + location[y] + ",0")
    tmpstring = tmpstring.encode("utf-8")
    f = open('FileDump', 'a')
    f.write(tmpstring)
    f.close()
You can get the child element by using this code:
title_href = [tag.find("a")["href"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
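For example, pairing each title with its link (a sketch reusing the soup object from the question; the href values are relative, so the site domain is prepended):
domain = "https://www.indeed.co.uk"
for tag in soup.findAll("h2", {"class": "jobtitle"}):
    a = tag.find("a")
    print(a.text.strip() + ',' + domain + a["href"])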
I tried your code and modified it in a few places, because I found the full job title can be taken from the <a> tag:
import requests
import time
from random import randint
from bs4 import BeautifulSoup
import urllib, requests, re, pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"
options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options,executable_path='chromedriver')
driver.get("https://www.indeed.co.uk/automotive-engineer-jobs-in-uk")
domain = "https://www.indeed.co.uk"
soup=BeautifulSoup(driver.page_source, "lxml")
title = [tag.find("a")["title"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
title_href = [domain + tag.find("a")["href"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
company = [tag.text.strip() for tag in soup.findAll("span",{"class":"company"})]
location = [tag.text.strip() for tag in soup.findAll("span",{"class":"location"})]
print(title_href)
driver.close()
You can use the below code to extract the links:
from bs4 import BeautifulSoup
import urllib.request
import re
html_page = urllib.request.urlopen("http://arstechnica.com")
soup = BeautifulSoup(html_page, "html.parser")
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    print(link.get('href'))
Reference
https://pythonspot.com/extract-links-from-webpage-beautifulsoup/
I am struggling to scrape as per the code below. Would appreciate it if someone could have a look at what I am missing.
Regards
PyProg70
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from bs4 import BeautifulSoup
import pandas as pd
import re, time
binary = FirefoxBinary('/usr/bin/firefox')
opts = FirefoxOptions()
opts.add_argument("--headless")
browser = webdriver.Firefox(options=opts, firefox_binary=binary)
browser.implicitly_wait(10)
url = 'http://tenderbulletin.eskom.co.za/'
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())
Not Java but JavaScript. It is a dynamic page; you need to wait and check that the Ajax request has finished and the content has rendered, using WebDriverWait.
....
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
.....
browser.get(url)
# wait max 30 second until table loaded
WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR , 'table.CSSTableGenerator .ng-binding')))
html = browser.find_element_by_css_selector('table.CSSTableGenerator')
soup = BeautifulSoup(html.get_attribute("outerHTML"), 'lxml')
print(soup.prettify().encode('utf-8'))
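If a DataFrame is more useful than prettified HTML, the same extracted table can be handed to pandas (a sketch, assuming pd.read_html can parse the rendered markup):
# Parse the rendered table into a DataFrame instead of printing HTML
df = pd.read_html(html.get_attribute("outerHTML"))[0]
print(df.head())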