I'm trying to grab the spot price of the SPY ETF: https://finance.yahoo.com/quote/SPY/options
I've mostly tried using soup.find_all, using the nested 'div' tags:
from bs4 import BeautifulSoup
import urllib.request
url = 'https://finance.yahoo.com/quote/SPY/options/'
source = urllib.request.urlopen(url).read()
soup = BeautifulSoup(source,'lxml')
for div in soup.find_all('div', class_ = "My(6px) smartphone_Mt(15px)"):
print(div.text)
for div in soup.find_all('div', class_ = "D(ib) Maw(65%) Ov(h)"):
print(div.text)
for div in soup.find_all('div', class_ = "D(ib) Mend(20px)"):
print(div.text)
Nothing is printed. I also tried the following:
print(soup.find('span', attrs = {'data-reactid':"35"}).text)
which results in 'Last Price' being printed. Now obviously I want the last price, rather than the words 'last price', but this is closer.
Nested in that span tag is some html which includes the number I want. I'm guessing the correct answer has to do with the 'react text: 36' stuff within the span tag (can't type it without stackoverflow thinking I'm trying to actually implement the html into this question).
If you just want the price:
import urllib.request
from bs4 import BeautifulSoup, Comment
page = urllib.request.urlopen("https://finance.yahoo.com/quote/SPY?p=SPY")
content = page.read().decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
[comment.extract() for comment in comments]
price = soup.find("span", {"data-reactid": "14", "class" : "Trsdu(0.3s) "}).text
print(price)
Outputs:
271.40
I recommend to you use scrapy, requests modules
import requests
from bs4 import BeautifulSoup
from scrapy.selector import Selector
ajanlar = [
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)']
url = "https://finance.yahoo.com/quote/SPY/options"
headers = {"User-Agent":random.choice(ajanlar)}
response = requests.get(url,headers=headers,proxies=None)
soup = BeautifulSoup(response.text, 'lxml')
xpath1 = "normalize-space(//div[#class='Mt(6px) smartphone_Mt(15px)'])"
xpath2 = "normalize-space(//div[#class='D(ib) Maw(65%) Maw(70%)--tab768 Ov(h)'])"
xpath3 = "normalize-space(//div[#class='D(ib) Mend(20px)'])"
var1 = Selector(text=response.text).xpath(xpath1).extract()[0]
var2 = Selector(text=response.text).xpath(xpath2).extract()[0]
var3 = Selector(text=response.text).xpath(xpath3).extract()[0]
print(var1)
print(var2)
print(var3)
Outputs:
269.97-1.43 (-0.53%)At close: 4:00PM EST269.61 -0.44 (-0.16%)After hours: 6:08PM ESTPeople also watchDIAIWMQQQXLFGLD
269.97-1.43 (-0.53%)At close: 4:00PM EST269.61 -0.44 (-0.16%)After hours: 6:08PM EST
269.97-1.43 (-0.53%)At close: 4:00PM EST
After than, you could apply regex
Related
I have a list of movies that I want to scrap the genres from Google.
I've built this code:
import requests
from bs4 import BeautifulSoup
list=['Se7en','Cinema Paradiso','The Shining','Toy Story 3','Capernaum']
gen2 = {}
for i in list:
user_query = i +'movie genre'
URL = 'https://www.google.co.in/search?q=' + user_query
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
c = soup.find(class_='EDblX DAVP1')
print(c)
if c != None:
genres = c.findAll('a')
gen2[i]= genres
But it returns an empty dict, so I checked one by one and it worked, for example:
import requests
from bs4 import BeautifulSoup
user_query = 'Se7en movie genre'
URL = "https://www.google.co.in/search?q=" + user_query
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
v = soup.find(class_='KKHQ8c')
h = {}
genres = v.findAll('a')
for genre in genres:
h['Se7en']=genre
So I find out that in the for loop the variable c is returning None.
I can't figure out why! It only return None inside the loop.
Currently, your URLs are of the form
URLs
so the returned results(google) aren't accurate for all the movies.
You can change it to
`for i in list:
i="+".join(i.split(" "));
user_query = i + "+movie+genre"
URL = 'https://www.google.com/search?q=+'+user_query`
also, movies that belong to a single genre like Cinema Paradiso are in a div with class name "Z0LcW".
Im having a problem with scraping the table of this website, I should be getting the heading but instead am getting
AttributeError: 'NoneType' object has no attribute 'tbody'
Im a bit new to web-scraping so if you could help me out that would be great
import requests
from bs4 import BeautifulSoup
URL = "https://www.collincad.org/propertysearch?situs_street=Willowgate&situs_street_suffix" \
"=&isd%5B%5D=any&city%5B%5D=any&prop_type%5B%5D=R&prop_type%5B%5D=P&prop_type%5B%5D=MH&active%5B%5D=1&year=2021&sort=G&page_number=1"
s = requests.Session()
page = s.get(URL)
soup = BeautifulSoup(page.content, "lxml")
table = soup.find("table", id="propertysearchresults")
table_data = table.tbody.find_all("tr")
headings = []
for td in table_data[0].find_all("td"):
headings.append(td.b.text.replace('\n', ' ').strip())
print(headings)
What happens?
Note: Always look at your soup first - therein lies the truth. The content can always be slightly to extremely different from the view in the dev tools.
Access Revoked
Your IP address has been blocked. We
detected irregular, bot-like usage of our Property Search originating
from your IP address. This block was instated to reduce stress on our
webserver, to ensure that we're providing optimal site performance to
the taxpayers of Collin County. We have
not blocked your ability to download our
data exports, which you can still use to acquire bulk property
data.
How to fix?
Add a user-agent to your requets so that it looks like your requesting with a "browser".
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
page = s.get(URL,headers=headers)
Or as alternativ just download the results.
Example (scraping table)
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
URL = "https://www.collincad.org/propertysearch?situs_street=Willowgate&situs_street_suffix" \
"=&isd%5B%5D=any&city%5B%5D=any&prop_type%5B%5D=R&prop_type%5B%5D=P&prop_type%5B%5D=MH&active%5B%5D=1&year=2021&sort=G&page_number=1"
s = requests.Session()
page = s.get(URL,headers=headers)
soup = BeautifulSoup(page.content, "lxml")
data = []
for row in soup.select('#propertysearchresults tr'):
data.append([c.get_text(' ',strip=True) for c in row.select('td')])
pd.DataFrame(data[1:], columns=data[0])
Output
Property ID ↓ Geographic ID ↓
Owner Name
Property Address
Legal Description
2021 Market Value
1
2709013 R-10644-00H-0010-1
PARTHASARATHY SURESH & ANITHA HARIKRISHNAN
12209 Willowgate Dr Frisco, TX\xa0 75035
Ridgeview At Panther Creek Phase 2, Blk H, Lot 1
$513,019
2
2709018 R-10644-00H-0020-1
JOSHI PRASHANT & SHWETA PANT
12235 Willowgate Dr Frisco, TX\xa0 75035
Ridgeview At Panther Creek Phase 2, Blk H, Lot 2
$546,254
3
2709019 R-10644-00H-0030-1
THALLAPUREDDY RAVENDRA & UMA MAHESWARI VEMULA
12261 Willowgate Dr Frisco, TX\xa0 75035
Ridgeview At Panther Creek Phase 2, Blk H, Lot 3
$550,768
4
2709020 R-10644-00H-0040-1
KULKARNI BHEEMSEN T & GOURI R
12287 Willowgate Dr Frisco, TX\xa0 75035
Ridgeview At Panther Creek Phase 2, Blk H, Lot 4
$509,593
5
2709021 R-10644-00H-0050-1
BALAM GANESH & SHANTHIREKHA LOKULA
12313 Willowgate Dr Frisco, TX\xa0 75035
Ridgeview At Panther Creek Phase 2, Blk H, Lot 5
$553,949
...
import requests
from bs4 import BeautifulSoup
URL = "https://www.collincad.org/propertysearch?situs_street=Willowgate&situs_street_suffix" \
"=&isd%5B%5D=any&city%5B%5D=any&prop_type%5B%5D=R&prop_type%5B%5D=P&prop_type%5B%5D=MH&active%5B%5D=1&year=2021&sort=G&page_number=1"
s = requests.Session()
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"}
page = s.get(URL,headers=headers)
soup = BeautifulSoup(page.content, "lxml")
Finding Table Data:
column_data=soup.find("table").find_all("tr")[0]
column=[i.get_text() for i in column_data.find_all("td") if i.get_text()!=""]
row=soup.find("table").find_all("tr")[1:]
main_lst=[]
for row_details in row:
lst=[]
for i in row_details.find_all("td")[1:]:
if i.get_text()!="":
lst.append(i.get_text())
main_lst.append(lst)
Converting to pandas DataFrame:
import pandas as pd
df=pd.DataFrame(main_lst,columns=column)
Output:
Property ID↓ Geographic ID ↓ Owner Name Property Address Legal Description 2021 Market Value
0 2709013R-10644-00H-0010-1 PARTHASARATHY SURESH & ANITHA HARIKRISHNAN 12209 Willowgate DrFrisco, TX 75035 Ridgeview At Panther Creek Phase 2, Blk H, Lot 1 $513,019
.....
If you look at page.content, you will see that "Your IP address has been blocked".
You should add some headers to your request because the website is blocking your request. In your specific case, it will be enough to add a User-Agent:
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
URL = "https://www.collincad.org/propertysearch?situs_street=Willowgate&situs_street_suffix" \
"=&isd%5B%5D=any&city%5B%5D=any&prop_type%5B%5D=R&prop_type%5B%5D=P&prop_type%5B%5D=MH&active%5B%5D=1&year=2021&sort=G&page_number=1"
s = requests.Session()
page = s.get(URL, headers=headers)
soup = BeautifulSoup(page.content, "lxml")
table = soup.find("table", id="propertysearchresults")
table_data = table.tbody.find_all("tr")
headings = []
for td in table_data[0].find_all("td"):
headings.append(td.b.text.replace('\n', ' ').strip())
print(headings)
If you add headers, you will still have error, but in the row:
headings.append(td.b.text.replace('\n', ' ').strip())
You should change it to
headings.append(td.text.replace('\n', ' ').strip())
because td doesn't always have b.
This is my code, I want to take the location's name and link, the variable "lugares" finds multiple item-containers, but I only want the first one [0]; then goes the for loop, but I can't find the span classes.
from bs4 import BeautifulSoup
import requests
b=[]
i="https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",'Accept-Language': 'en-US, en;q=0.5'}
page =requests.get(url,headers=encabezado)
soup = BeautifulSoup(page.content,"html.parser")
lugares = soup.find_all("div",{"class":"items-container"})
lugares=lugares[0]
print(len(lugares))
for lugar in lugares:
locationlink = i + str(lugar.find("span",{"class":"item"}).find("a")["href"])
location= lugar.find("span",{"class":"item"}).text
a=[location,locationlink]
b.append(a)
There are multiple options to get the goal, best one depence on what you expect and wanna do with this information in follow up process.
First Option
If you are just looking for the infos of first location you can do the following:
lugar = soup.select_one('div.items-container a')
b = [lugar.text, f'{i}{lugar["href"]}']
or
lugar = soup.select('div.items-container a')[0]
b = [lugar.text, f'{i}{lugar["href"]}']
Both select the first <a> in the <div> with class items-container.
Output
['Huixquilucan','https://www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1']
Alternativ
If you are interested to get all at once, you should use a list of dicts, so later on you just have to iterate it and get all information in place:
[{'name':x.text, 'link':f'{i}{x["href"]}'} for x in soup.select('div.items-container a')]
Output
[{'name': 'Huixquilucan',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1'},
{'name': 'Naucalpan',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/naucalpan/v1c1098l10710p1'},
{'name': 'Atizapán',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/atizapan/v1c1098l10662p1'},
{'name': 'Metepec',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/metepec-edomex/v1c1098l10707p1'},...]
Example (showing results of both)
from bs4 import BeautifulSoup
import requests
i="https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",'Accept-Language': 'en-US, en;q=0.5'}
page =requests.get(url,headers=encabezado)
soup = BeautifulSoup(page.content,"html.parser")
lugar = soup.select_one('div.items-container a')
b = [lugar.text, f'{i}{lugar["href"]}']
print(f'First lugar:\n {b} \n')
## or alternative option
allLugaros = [{'name':x.text, 'link':f'{i}{x["href"]}'} for x in soup.select('div.items-container a')]
print(f'First lugar from lugaros (list of dict):\n {allLugaros[0]} \n')
print(f'All lugaros as list of dict:\n {allLugaros} \n')
First, you need to get all spans in the first Lugares lugares[0].
Then you need to iterate for each span to get the link and text for each location.
The Code:
from bs4 import BeautifulSoup
import requests
b=[]
i="https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",'Accept-Language': 'en-US, en;q=0.5'}
page =requests.get(url,headers=encabezado)
soup = BeautifulSoup(page.content,"html.parser")
lugares = soup.find_all("div",{"class":"items-container"})
#lugares=lugares[0]
print(len(lugares))
# get all spans
spans = lugares[0].find_all("span",{"class":"item"})
# itreate throw each span
for span in spans:
# get location text
location = span.find("a").text
# locationlink builder
site = "www.vivanuncios.com.mx"
link = span.find("a")["href"]
locationlink = f"{site}{link}"
a = [location,locationlink]
b.append(a)
print (b[0])
Output:
['Huixquilucan', 'www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1']
I have a code that scrapes all URLs from oddsportal.com main page.
I want the subsequent links to all pages within the parent URL
e.g.
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/
has further pages i.e. https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/, https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2019/results/, etc.
How can I get that?
My existing code:
import requests
import bs4 as bs
import pandas as pd
url = 'https://www.oddsportal.com/results/#soccer'
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
resp = requests.get(url, headers=headers)
soup = bs.BeautifulSoup(resp.text, 'html.parser')
base_url = 'https://www.oddsportal.com'
a = soup.findAll('a', attrs={'foo': 'f'})
# This set will have all the URLs of the main page
s = set()
for i in a:
s.add(base_url + i['href'])
s = list(s)
# This will filter for all soccer URLs
s = [x for x in s if '/soccer/' in x]
s = pd.DataFrame(s)
print(s)
I am very new to webscraping and hence this question.
You can find main_div tag based on class attribute and use find_all method to get a tag by looping over it you can extract href of it
from bs4 import BeautifulSoup
import requests
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
source = requests.get("https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/",headers=headers)
soup = BeautifulSoup(source.text, 'html.parser')
main_div=soup.find("div",class_="main-menu2 main-menu-gray")
a_tag=main_div.find_all("a")
for i in a_tag:
print(i['href'])
Output:
/soccer/africa/africa-cup-of-nations/results/
/soccer/africa/africa-cup-of-nations-2019/results/
/soccer/africa/africa-cup-of-nations-2017/results/
/soccer/africa/africa-cup-of-nations-2015/results/
/soccer/africa/africa-cup-of-nations-2013/results/
/soccer/africa/africa-cup-of-nations-2012/results/
/soccer/africa/africa-cup-of-nations-2010/results/
/soccer/africa/africa-cup-of-nations-2008/results/
I am trying to gather some information about some books available on Amazon and I am having a weird glitch error that I can't understand. At first I thought it was Amazon blocking my connection but then I noticed the request has a "200 OK" and it had the real HTML content of the corresponding page.
Let's take for example this book: https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110/ref=sr_1_1?crid=2PPCQEJD706VY&dchild=1&keywords=books+bestsellers+2020+paperback&qid=1598132071&sprefix=book%2Caps%2C234&sr=8-1'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, features="lxml")
price = {}
if soup.select("#buyBoxInner > ul > li > span > .a-text-strike") != []:
price["regular_price"] = float(
soup.select("#buyBoxInner > ul > li > span > .a-text-strike")[0].string[1:].replace(",", "."))
price["promo_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
else:
price["regular_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
price["currency"] = soup.select(".offer-price")[0].string[0]
This part works fine and I can have the regular price and a promo price (if exists), and even the currency. But when I do this:
isbn = soup.select("td.bucket > .content > ul > li")[4].contents[1].string.strip().replace("-", "")
I get "IndexError: list index out of range". But if I debug the code, the content is actually there!
Is this a bug of BeautifulSoup? Is the request response too long?
It seems that Amazon returns two version of the page. One where's <td class="bucket"> and one where are several <span> tags. This script tries to extract ISBN from both of them:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, features="lxml")
isbn_10 = soup.select_one('span.a-text-bold:contains("ISBN-10"), b:contains("ISBN-10")').find_parent().text
isbn_13 = soup.select_one('span.a-text-bold:contains("ISBN-13"), b:contains("ISBN-13")').find_parent().text
print(isbn_10.split(':')[-1].strip())
print(isbn_13.split(':')[-1].strip())
Prints:
0241985110
978-0241985113
I wish I had an explanation of the problem but you a solution would be to wrap your code in a function like so:
def scrape():
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110/ref=sr_1_1?crid=2PPCQEJD706VY&dchild=1&keywords=books+bestsellers+2020+paperback&qid=1598132071&sprefix=book%2Caps%2C234&sr=8-1'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, features="lxml")
price = {}
if soup.select("#buyBoxInner > ul > li > span > .a-text-strike") != []:
price["regular_price"] = float(
soup.select("#buyBoxInner > ul > li > span > .a-text-strike")[0].string[1:].replace(",", "."))
price["promo_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
else:
price["regular_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
price["currency"] = soup.select(".offer-price")[0].string[0]
#ADD THIS FEATURE TO YOUR CODE
isbn = soup.select("td.bucket > .content > ul > li")
if not isbn:
scrape()
isbn = isbn[4].contents[1].string.strip().replace("-", "")
Then if it fails it will just call itself again. You might want to refactor it so it only makes the request once.