Not able to scrape image URLs using Beautiful Soup and Python

So basically I am using the code below to scrape the image URLs of the credit cards from the respective links in the explore_more_url variable.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json, requests, re
from selenium import webdriver
driver = webdriver.Chrome(executable_path="C:\\Users\\Hari\\Downloads\\chromedriver.exe")
img_url = []
explore_more_url = ['https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-aura-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/privilege-easy-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/reserve-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-plus-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/indianoil-axis-bank-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-magnus-card/feature-benefits', 'https://www.axisbank.com/retail/cards/credit-card/flipkart-axisbank-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/my-zone-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/neo-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-signature-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-infinite-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/privilege-credit-card-with-unlimited-travel-benefits-account', 'https://www.axisbank.com/retail/cards/credit-card/miles-more-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-select-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/pride-platinum-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/pride-signature-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/my-zone-easy-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/insta-easy-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/signature-credit-card-with-lifestyle-benefits', 'https://www.axisbank.com/retail/cards/credit-card/platinum-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/titanium-smart-traveler-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/my-wings-credit-card/features-benefits']
for x in explore_more_url:
    driver.get(x)
    soup_1 = BeautifulSoup(driver.page_source, 'lxml')
    img_url.append("https://www.axisbank.com" + soup_1.find('img', alt="Fast Forward Banner").get('src'))
print(img_url)
Output:
Traceback (most recent call last):
File "C:\Users\Hari\PycharmProjects\Card_Prj\axis.py", line 82, in <module>
img_url.append("https://www.axisbank.com" + soup_1.find('img', alt="Fast Forward Banner").get('src'))
AttributeError: 'NoneType' object has no attribute 'get'
The images in each link look something like this (screenshot of the banner image not shown here):
What is the appropriate code that I could use so that I can get exactly what I am expecting ?

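For context, the AttributeError happens because soup_1.find('img', alt="Fast Forward Banner") returns None on pages that have no <img> with that exact alt text, and calling .get() on None then fails. A minimal None-guarded version of the original Selenium loop (reusing the driver and explore_more_url list from the question) might look like this:
img_url = []
for x in explore_more_url:
    driver.get(x)
    soup_1 = BeautifulSoup(driver.page_source, 'lxml')
    banner = soup_1.find('img', alt="Fast Forward Banner")
    if banner is not None and banner.get('src'):
        # only append when the banner image and its src attribute actually exist
        img_url.append("https://www.axisbank.com" + banner['src'])
    else:
        print(f"No 'Fast Forward Banner' image found on {x}")
print(img_url)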
One way of getting the image might be this:
import requests
from bs4 import BeautifulSoup
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
page = requests.get("https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card", headers=headers).text
img_src_ = BeautifulSoup(page, "html.parser").select_one('.bannerWrapper img')["src"]
with open(img_src_.rsplit("/")[-1], "wb") as image:
    image.write(requests.get(f"https://www.axisbank.com{img_src_}").content)
Output: a .jpg file in the script's working directory
ace-product-landing-web-version-1920x360.jpg
EDIT: To get just the source urls, try this:
import requests
from bs4 import BeautifulSoup
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
explore_more_url = [
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-aura-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/privilege-easy-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/reserve-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-plus-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/indianoil-axis-bank-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-magnus-card/feature-benefits',
'https://www.axisbank.com/retail/cards/credit-card/flipkart-axisbank-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/my-zone-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/neo-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-signature-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-infinite-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/privilege-credit-card-with-unlimited-travel-benefits-account',
'https://www.axisbank.com/retail/cards/credit-card/miles-more-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-select-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/pride-platinum-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/pride-signature-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/my-zone-easy-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/insta-easy-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/signature-credit-card-with-lifestyle-benefits',
'https://www.axisbank.com/retail/cards/credit-card/platinum-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/titanium-smart-traveler-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/my-wings-credit-card/features-benefits',
]
img_urls = []
for url in explore_more_url:
    page = requests.get(url, headers=headers).text
    try:
        img_src_ = BeautifulSoup(page, "html.parser").select_one('.bannerWrapper img')["src"]
        print(f"Finding image source url for {url}")
        img_urls.append(f"https://www.axisbank.com{img_src_}")
    except (KeyError, TypeError):
        continue
print(img_urls)
Output:
['https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/ace-product-landing-web-version-1920x360.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/health-and-wellness-product-page-1920x360_v1.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/freecharge-product-landing-page-desktop-banner-revised.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/indian-oil-banner-desktop.jpg', 'https://www.axisbank.com/img/magnuscard/apply-now.png', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/flipkart-abcc-desk.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/product-landing-page-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/myzone-easy-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/neo-credit-card-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/privilege-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/miles---more-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/desktop-select-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/pride-platinum-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/pride-platinum-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/myzone-easy-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/insta-easy-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/signature-credit-card-with.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/platinum-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/signature-credit-card-with.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/my-wings-credit-card.jpg']
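If you want the image files themselves and not just their URLs, a short follow-up sketch (reusing the img_urls list and headers from above, and deriving each filename from the last path segment as in the first snippet) could be:
import requests

for img_url in img_urls:
    filename = img_url.rsplit("/", 1)[-1]  # e.g. ace-product-landing-web-version-1920x360.jpg
    response = requests.get(img_url, headers=headers)
    if response.ok:  # skip failed downloads instead of writing error pages to disk
        with open(filename, "wb") as image_file:
            image_file.write(response.content)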

Related

I am trying to navigate through the pages of a website and scrape its links but the same page data is scraped even after changing page number

from bs4 import BeautifulSoup
import requests
import pymongo
def traverse_source():
    article_links = []
    for pgindx in range(9):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            "path": f"issue/S0196-0644(21)X0012-1?pageStart={pgindx}",
            "Sec-fetch-site": "same-origin",
        }
        source_url = ""
        source_data = requests.get(source_url, headers=headers)
        print(source_data.headers)
        source_url = None
        source_soup = BeautifulSoup(source_data.content, "html.parser")
        destination = source_soup.find_all("h3", attrs={'class': 'toc__item__title'})
        for dest in destination:
            try:
                article_links.append("https://www.annemergmed.com" + dest.a['href'])
            except:
                pass
        source_soup = None
        print(article_links)

if __name__ == "__main__":
    traverse_source()
Here, even after incrementing the page number, the content of the first webpage is always scraped. I tried navigating through the pages with a GET request (changing the URL), but even after changing the source URL it still scrapes the data of page number 1.
This is one way of scraping that data:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
s = requests.Session()
s.headers.update(headers)
big_list = []
for x in tqdm(range(9)):
    r = s.get(f'https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    titles = soup.select('div.articleCitation')
    for t in titles:
        url = t.select_one('h3 a').get('href')
        header = t.select_one('h3 a').text
        try:
            authors = t.select_one('ul.toc__item__authors').get_text(strip=True)
        except Exception as e:
            authors = 'Unknown'
        big_list.append((header, f'https://www.annemergmed.com{url}', authors))
df = pd.DataFrame(list(set(big_list)), columns=['Title', 'Url', 'Authors'])
print(df.shape)
print(df.head(50))
This will return:
(409, 3)
Title Url Authors
0 194 Challenging the Dogma of Radiographs a Joint Above and Below a Suspected Fracture: Quantification of Waste in Wrist Fracture Evaluation https://www.annemergmed.com/article/S0196-0644(21)01046-5/fulltext M. Rozum,D. Mark Courtney,D. Diercks,S. McDonald
1 112 A Geographical Analysis of Access to Trauma Care From US National Parks in 2018 https://www.annemergmed.com/article/S0196-0644(21)00963-X/fulltext S. Robichaud,K. Boggs,B. Bedell,...A. Sullivan,N. Harris,C. Camargo
2 87 Emergency Radiology Overreads Change Management of Transferred Patients With Traumatic Injuries https://www.annemergmed.com/article/S0196-0644(21)00937-9/fulltext M. Vrablik,R. Kessler,M. Vrablik,...J. Robinson,D. Hippe,M. Hall
[...]
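Since the original snippet imports pymongo, a rough sketch of persisting the scraped rows to MongoDB could follow (the connection URI, database name, and collection name below are assumptions, not something from the question):
import pymongo

# assumed local MongoDB instance; adjust the URI, database, and collection names as needed
client = pymongo.MongoClient("mongodb://localhost:27017")
collection = client["annemergmed"]["articles"]

# df comes from the answer above; insert one document per scraped row
collection.insert_many(df.to_dict("records"))
print(collection.count_documents({}))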

Beautiful Soup web scraping returning None - Python

I have a list of movies for which I want to scrape the genres from Google.
I've built this code:
import requests
from bs4 import BeautifulSoup
list = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
gen2 = {}
for i in list:
    user_query = i + 'movie genre'
    URL = 'https://www.google.co.in/search?q=' + user_query
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    c = soup.find(class_='EDblX DAVP1')
    print(c)
    if c != None:
        genres = c.findAll('a')
        gen2[i] = genres
But it returns an empty dict, so I checked one by one and it worked, for example:
import requests
from bs4 import BeautifulSoup
user_query = 'Se7en movie genre'
URL = "https://www.google.co.in/search?q=" + user_query
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
v = soup.find(class_='KKHQ8c')
h = {}
genres = v.findAll('a')
for genre in genres:
    h['Se7en'] = genre
So I found out that inside the for loop the variable c is None.
I can't figure out why! It only returns None inside the loop.
Currently, your URLs are of the form https://www.google.co.in/search?q=Cinema Paradisomovie genre (the title is pasted in with raw spaces and no separator before 'movie genre'), so the returned Google results aren't accurate for all the movies.
You can change it to:
for i in list:
    i = "+".join(i.split(" "))
    user_query = i + "+movie+genre"
    URL = 'https://www.google.com/search?q=+' + user_query
Also, movies that belong to a single genre, like Cinema Paradiso, have the genre in a div with class name "Z0LcW".
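Putting both suggestions together, here is a rough sketch of the loop with properly encoded queries (using urllib.parse.quote_plus; the class names 'EDblX DAVP1' and 'Z0LcW' are taken from the question and answer above and may change as Google updates its markup):
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

movies = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}

genres_by_movie = {}
for title in movies:
    url = 'https://www.google.com/search?q=' + quote_plus(f'{title} movie genre')
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    multi = soup.find(class_='EDblX DAVP1')   # layout used when several genres are listed
    single = soup.find(class_='Z0LcW')        # layout used for single-genre movies
    if multi is not None:
        genres_by_movie[title] = [a.get_text(strip=True) for a in multi.find_all('a')]
    elif single is not None:
        genres_by_movie[title] = [single.get_text(strip=True)]
print(genres_by_movie)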

How to get all URLs within a page fom oddsportal?

I have a code that scrapes all URLs from oddsportal.com main page.
I want the subsequent links to all pages within the parent URL
e.g.
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/
has further pages i.e. https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/, https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2019/results/, etc.
How can I get that?
My existing code:
import requests
import bs4 as bs
import pandas as pd
url = 'https://www.oddsportal.com/results/#soccer'
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
resp = requests.get(url, headers=headers)
soup = bs.BeautifulSoup(resp.text, 'html.parser')
base_url = 'https://www.oddsportal.com'
a = soup.findAll('a', attrs={'foo': 'f'})
# This set will have all the URLs of the main page
s = set()
for i in a:
    s.add(base_url + i['href'])
s = list(s)
# This will filter for all soccer URLs
s = [x for x in s if '/soccer/' in x]
s = pd.DataFrame(s)
print(s)
I am very new to webscraping and hence this question.
You can find the main div tag based on its class attribute, then use the find_all method to get the a tags; by looping over them you can extract the href of each one.
from bs4 import BeautifulSoup
import requests
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
source = requests.get("https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/",headers=headers)
soup = BeautifulSoup(source.text, 'html.parser')
main_div=soup.find("div",class_="main-menu2 main-menu-gray")
a_tag=main_div.find_all("a")
for i in a_tag:
    print(i['href'])
Output:
/soccer/africa/africa-cup-of-nations/results/
/soccer/africa/africa-cup-of-nations-2019/results/
/soccer/africa/africa-cup-of-nations-2017/results/
/soccer/africa/africa-cup-of-nations-2015/results/
/soccer/africa/africa-cup-of-nations-2013/results/
/soccer/africa/africa-cup-of-nations-2012/results/
/soccer/africa/africa-cup-of-nations-2010/results/
/soccer/africa/africa-cup-of-nations-2008/results/
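The hrefs printed above are relative paths; to turn them into the full URLs the question asks for, one option is to join each one with the site root (a small sketch using urllib.parse.urljoin, reusing main_div from the answer above):
from urllib.parse import urljoin

base_url = "https://www.oddsportal.com"
# build absolute result URLs from the relative hrefs found in the menu div
result_urls = [urljoin(base_url, a["href"]) for a in main_div.find_all("a") if a.get("href")]
for url in result_urls:
    print(url)  # e.g. https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2019/results/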

How to get a link in a dataframe

My question can be explained quite well with the attached screenshot.
I am scraping the following page: https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1
Table 1 lists the team. In the second column is the player. I need the link as you can see in the screenshot on the bottom left.
When I look into the data frame normally, I only get the following in this cell: "Oliver BaumannO. BaumannTorwart". But I am looking for "https://www.transfermarkt.de/oliver-baumann/profil/spieler/55089".
You guys got any ideas?
Code:
import pandas as pd
import requests
# Global variables
HEADS = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
dateiname = 'test.xlsx'
# Global variables
def get_response(url):
    # perform the URL request
    try:
        response = requests.get(url, headers=HEADS)
    except AttributeError:
        print('AttributeError')
    return response

def scraping_kader(response):
    try:
        dfs = pd.read_html(response.text)
        #dfs = dfs.to_html(escape=False)
        print(dfs[1])
        print(dfs[1].iloc[0, :])
    except ImportError:
        print(' ImportError')
    except ValueError:
        print(' ValueError')
    except AttributeError:
        print(' AttributeError')

response = get_response('https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1')
scraping_kader(response)
As far as I know, read_html gets only the text from the table and ignores links, hidden elements, attributes, etc.
You need a module like BeautifulSoup or lxml to work with the full HTML and manually extract the information you need.
soup = BeautifulSoup(response.text, 'html.parser')
all_tooltips = soup.find_all('td', class_='hauptlink')
for item in all_tooltips:
    item = item.find('a', class_='spielprofil_tooltip')
    if item:
        print(item['href'])  #, item.text)
This example gets only the links, but you can get other elements in the same way.
import requests
from bs4 import BeautifulSoup
#import pandas as pd
HEADS = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}

def get_response(url):
    try:
        response = requests.get(url, headers=HEADS)
    except AttributeError:
        print('AttributeError')
    return response

def scraping_kader(response):
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        all_tooltips = soup.find_all('td', class_='hauptlink')
        for item in all_tooltips:
            item = item.find('a', class_='spielprofil_tooltip')
            if item:
                print(item['href'])  #, item.text)
        #print(dfs[1])
        #print(dfs[1].iloc[0, :])
    except ImportError:
        print(' ImportError')
    except ValueError:
        print(' ValueError')
    except AttributeError:
        print(' AttributeError')

# --- main ---
response = get_response('https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1')
scraping_kader(response)
Result
/oliver-baumann/profil/spieler/55089
/philipp-pentke/profil/spieler/8246
/luca-philipp/profil/spieler/432671
/stefan-posch/profil/spieler/223974
/kevin-vogt/profil/spieler/84435
/benjamin-hubner/profil/spieler/52348
/kevin-akpoguma/profil/spieler/160241
/kasim-adams/profil/spieler/263801
/ermin-bicakcic/profil/spieler/51676
/havard-nordtveit/profil/spieler/42234
/melayro-bogarde/profil/spieler/476915
/konstantinos-stafylidis/profil/spieler/148967
/pavel-kaderabek/profil/spieler/143798
/joshua-brenet/profil/spieler/207006
/florian-grillitsch/profil/spieler/195736
/diadie-samassekou/profil/spieler/315604
/dennis-geiger/profil/spieler/251309
/ilay-elmkies/profil/spieler/443752
/christoph-baumgartner/profil/spieler/324278
/mijat-gacinovic/profil/spieler/215864
/jacob-bruun-larsen/profil/spieler/293281
/sargis-adamyan/profil/spieler/125614
/felipe-pires/profil/spieler/327911
/robert-skov/profil/spieler/270393
/ihlas-bebou/profil/spieler/237164
/andrej-kramaric/profil/spieler/46580
/ishak-belfodil/profil/spieler/111039
/munas-dabbur/profil/spieler/145866
/klauss/profil/spieler/498862
/maximilian-beier/profil/spieler/578392
That helps me.
I have now read the table with pandas and replaced the name column with the links from your BS4 code. Works!
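For reference, a rough sketch of that combination (assuming the number of extracted profile links matches the number of player rows in table 1; the column name 'Profil-Link' is made up for illustration):
import pandas as pd
import requests
from bs4 import BeautifulSoup

HEADS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
url = 'https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1'
response = requests.get(url, headers=HEADS)

# collect the player profile links in document order, as in the answer above
soup = BeautifulSoup(response.text, 'html.parser')
links = []
for td in soup.find_all('td', class_='hauptlink'):
    a = td.find('a', class_='spielprofil_tooltip')
    if a:
        links.append('https://www.transfermarkt.de' + a['href'])

# read the squad table with pandas and attach the links as an extra column
squad = pd.read_html(response.text)[1]
if len(links) == len(squad):  # assumption: one profile link per squad row, in the same order
    squad['Profil-Link'] = links
print(squad.head())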

How to grab spot price from yahoo finance using BeautifulSoup

I'm trying to grab the spot price of the SPY ETF: https://finance.yahoo.com/quote/SPY/options
I've mostly tried using soup.find_all, using the nested 'div' tags:
from bs4 import BeautifulSoup
import urllib.request
url = 'https://finance.yahoo.com/quote/SPY/options/'
source = urllib.request.urlopen(url).read()
soup = BeautifulSoup(source,'lxml')
for div in soup.find_all('div', class_ = "My(6px) smartphone_Mt(15px)"):
    print(div.text)
for div in soup.find_all('div', class_ = "D(ib) Maw(65%) Ov(h)"):
    print(div.text)
for div in soup.find_all('div', class_ = "D(ib) Mend(20px)"):
    print(div.text)
Nothing is printed. I also tried the following:
print(soup.find('span', attrs = {'data-reactid':"35"}).text)
which results in 'Last Price' being printed. Now obviously I want the last price, rather than the words 'last price', but this is closer.
Nested in that span tag is some HTML which includes the number I want. I'm guessing the correct answer has to do with the 'react text: 36' stuff within the span tag (I can't type it here without Stack Overflow thinking I'm trying to actually embed the HTML in this question).
If you just want the price:
import urllib.request
from bs4 import BeautifulSoup, Comment
page = urllib.request.urlopen("https://finance.yahoo.com/quote/SPY?p=SPY")
content = page.read().decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
[comment.extract() for comment in comments]
price = soup.find("span", {"data-reactid": "14", "class" : "Trsdu(0.3s) "}).text
print(price)
Outputs:
271.40
I recommend using the scrapy and requests modules.
import random
import requests
from bs4 import BeautifulSoup
from scrapy.selector import Selector

ajanlar = [
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)']

url = "https://finance.yahoo.com/quote/SPY/options"
headers = {"User-Agent": random.choice(ajanlar)}
response = requests.get(url, headers=headers, proxies=None)
soup = BeautifulSoup(response.text, 'lxml')

xpath1 = "normalize-space(//div[@class='Mt(6px) smartphone_Mt(15px)'])"
xpath2 = "normalize-space(//div[@class='D(ib) Maw(65%) Maw(70%)--tab768 Ov(h)'])"
xpath3 = "normalize-space(//div[@class='D(ib) Mend(20px)'])"
var1 = Selector(text=response.text).xpath(xpath1).extract()[0]
var2 = Selector(text=response.text).xpath(xpath2).extract()[0]
var3 = Selector(text=response.text).xpath(xpath3).extract()[0]
print(var1)
print(var2)
print(var3)
Outputs:
269.97-1.43 (-0.53%)At close: 4:00PM EST269.61 -0.44 (-0.16%)After hours: 6:08PM ESTPeople also watchDIAIWMQQQXLFGLD
269.97-1.43 (-0.53%)At close: 4:00PM EST269.61 -0.44 (-0.16%)After hours: 6:08PM EST
269.97-1.43 (-0.53%)At close: 4:00PM EST
After that, you could apply a regex.
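For example, a small sketch of pulling the spot price out of one of those strings with a regex (assuming the output format shown above, where the price comes first):
import re

# var3 looks like "269.97-1.43 (-0.53%)At close: 4:00PM EST" (see the output above)
match = re.match(r"(\d+(?:,\d{3})*\.\d+)", var3)
if match:
    spot_price = float(match.group(1).replace(",", ""))
    print(spot_price)  # 269.97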
