How to scrape a part of a google search result - python

I want to scrape the information from AMD stock that google provide.
I have been able to scrape the whole webpage, but as soon I try to get a specific div or class I am not able to find anything and the console returns [].
When scraping the whole page I cannot find those classes either, after searching I found that this is possibly hidden by Javascript and can somehow be accesed with Selenium? I tried to use Selenium Webdriver but this got me nowhere.
This is what i have so far:
import requests
from bs4 import BeautifulSoup
import urllib3
from selenium import webdriver
requests.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}
url = "https://www.google.com/search?q=amd+stock&oq=amd+stock&aqs=chrome..69i57j35i39j0l5j69i60.1017j0j7&sourceid=chrome&ie=UTF-8"
source_code = requests.get(url, requests.headers)
soup = BeautifulSoup(source_code.text, "html.parser")
amd = soup.find_all('div', attrs = {'class': 'aviV4d'})
print(amd)
When printing 'soup' I get the whole page, but when printing 'amd' I get [].

Your code is ok, but use headers= parameter in request() call:
import requests
from bs4 import BeautifulSoup
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}
url = "https://www.google.com/search?q=amd+stock&oq=amd+stock&aqs=chrome..69i57j35i39j0l5j69i60.1017j0j7&sourceid=chrome&ie=UTF-8"
source_code = requests.get(url, headers=headers)
soup = BeautifulSoup(source_code.text, "html.parser")
amd = soup.find('div', attrs = {'class': 'aviV4d'})
print(amd.get_text(strip=True, separator='|').split('|')[:3])
Prints:
['Advanced Micro Devices', 'NASDAQ: AMD', '48,16']

That's a dynamic page, it's not gonna give the stock price by just requesting the page source via requests. You will have to use scraping to do that. Try this instead:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--incognito")
chromedriver_path = './chromedriver'
driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
driver.get("https://www.google.com/search?q=amd+stock&oq=amd+stock&aqs=chrome..69i57j35i39j0l5j69i60.1017j0j7&sourceid=chrome&ie=UTF-8")
time.sleep(2)
x = driver.find_element_by_xpath('//*[#id="knowledge-finance-wholepage__entity-summary"]/div/g-card-section/div/g-card-section/span[1]/span/span[1]')
print(x.text)
driver.quit()
Output:
48.16

I believe you need to add amd.response or amd.text
print(amd.response)
print(amd.text)

Related

Python How Can I Scrape Image URL and Image Title From This Link With BeautifulSoup

I could not reach the image url and image title with Beautiful soup. I will be glad if you help me. Thanks
The image url I want to scrape is:
https://cdn.homebnc.com/homeimg/2017/01/29-entry-table-ideas-homebnc.jpg
Title I want to scrape
37 Best Entry Table Ideas (Decorations and Designs) for 2017
from bs4 import BeautifulSoup
import requests
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
}
response = requests.get("https://www.bing.com/images/search?view=detailV2&ccid=sg67yP87&id=39EC3D95F0FC25C52E714B1776D819AB564D474B&thid=OIP.sg67yP87Kr9hQF8PiKnKZQHaLG&mediaurl=https%3a%2f%2fcdn.homebnc.com%2fhomeimg%2f2017%2f01%2f29-entry-table-ideas-homebnc.jpg&cdnurl=https%3a%2f%2fth.bing.com%2fth%2fid%2fR.b20ebbc8ff3b2abf61405f0f88a9ca65%3frik%3dS0dNVqsZ2HYXSw%26pid%3dImgRaw%26r%3d0&exph=2247&expw=1500&q=table+ideas&simid=608015185750411203&FORM=IRPRST&ck=7EA9EDE471AB12F7BDA2E7DA12DC56C9&selectedIndex=0&qft=+filterui%3aimagesize-large", headers=headers)
print(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")
As mentioned in the comment above by #Carst3n, BeautifulSoup is only giving you the html format before any scripts are executed. For this reason you should try to scrape the website with a combination of Selenium and BeautifulSoup.
You can download chromedriver from here
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
opts = Options()
opts.add_argument(" --headless")
opts.add_argument("--log-level=3")
driver = webdriver.Chrome(PATH_TO_CHROME_DRIVER, options=opts)
driver.get("https://www.bing.com/images/search?view=detailV2&ccid=sg67yP87&id=39EC3D95F0FC25C52E714B1776D819AB564D474B&thid=OIP.sg67yP87Kr9hQF8PiKnKZQHaLG&mediaurl=https%3a%2f%2fcdn.homebnc.com%2fhomeimg%2f2017%2f01%2f29-entry-table-ideas-homebnc.jpg&cdnurl=https%3a%2f%2fth.bing.com%2fth%2fid%2fR.b20ebbc8ff3b2abf61405f0f88a9ca65%3frik%3dS0dNVqsZ2HYXSw%26pid%3dImgRaw%26r%3d0&exph=2247&expw=1500&q=table+ideas&simid=608015185750411203&FORM=IRPRST&ck=7EA9EDE471AB12F7BDA2E7DA12DC56C9&selectedIndex=0&qft=+filterui%3aimagesize-large")
time.sleep(2)
soup_file = driver.page_source
driver.quit()
soup = BeautifulSoup(soup_file)
print(soup.select('#mainImageWindow .nofocus')[0].get('src'))
print(soup.select('.novid')[0].string)

How do I get the price from this website using BeautifulSoup?

So I'm new to parsing HTML with Python, and I want to get the price of this lumber from the following link:
https://www.lowes.com/pd/2-in-x-4-in-x-8-ft-Whitewood-Stud-Common-1-5-in-x-3-5-in-x-96-in-Actual/1000074211
This is what I have so far, but I'm getting an error that says "AttributeError: 'NoneType' object has no attribute 'text'" :
import requests
from bs4 import BeautifulSoup
HEADERS = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/91.0.4472.164 Safari/537.36"}
URL = "https://www.lowes.com/pd/2-in-x-4-in-x-8-ft-Whitewood-Stud-Common-1-5-in-x-3-5-in-x-96-
in-Actual/1000074211"
r = requests.get(URL,headers=HEADERS)
c=r.content
soup = BeautifulSoup(c, "html.parser")
price_box = soup.find("div", class_="sq-bqyKva.ehfErk")
price=price_box.text.strip()
print(price)
Why am I getting this error and how can I fix it?
I can't see the site, but most probably you're getting the error because BS cannot find any element with a class called "sq-bqyKva.ehfErk".
Can you print out the soup and search for the class manually to see that it actually exists?
Also, based on the class name it looks like the div you are trying to find is dynamically generated using JavaScript which means it is not loaded into DOM when the request is made, which means BS won't be able to find it. If this is the case you might want to look into using other tools such as Selenium.
The site is dynamic, relying on a script to populate the page with data from a JSON string, which can be found in a script tag and parsed via the json module to access the price:
from bs4 import BeautifulSoup as soup
import json
headers = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"}
d = soup(requests.get('https://www.lowes.com/pd/2-in-x-4-in-x-8-ft-Whitewood-Stud-Common-1-5-in-x-3-5-in-x-96-in-Actual/1000074211', headers=headers).text, 'html.parser')
data = json.loads(d.select_one('script[data-react-helmet="true"][type="application/ld+json"]').contents[0])
price = data[2]['offers']['price']
Output:
5.98
This is a really difficult site. You will not be able to access the price using that class name. Web scraping becomes difficult if your site has javascript DOM elements. "View Source" will get you the response that Beautifulsoup will get. On closer inspection, your page price is inside a json value under a "script" tag:
import requests
from bs4 import BeautifulSoup
import json
HEADERS = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/91.0.4472.164 Safari/537.36"}
URL = "https://www.lowes.com/pd/2-in-x-4-in-x-8-ft-Whitewood-Stud-Common-1-5-in-x-3-5-in-x-96-in-Actual/1000074211"
r = requests.get(URL,headers=HEADERS)
soup = BeautifulSoup(r.content, "html.parser")
contents = soup.find("script")
json_object = json.loads(contents.text)
print(json_object[2]["offers"]["price"])

BeautifulSoup solution

I need your help to get "Description" content of this URL using BeautifulSoup in Python (as shown below).
I have tried below code but it return None only!
import requests as rq
from bs4 import BeautifulSoup
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
page = rq.get(url, headers=hdr)
soup = BeautifulSoup(page.content, "html.parser")
description = soup.find('div', {'class': 'force-wrapping ng-star-inserted'})
I had tried and i saw that soup doesn't has class force-wrapping ng-star-inserted because you had taken the source of site. It is different from what you saw in dev tool, to see source of site, you can press Ctr+U. Then you can see that the description is in meta tag with name is description. So, what you need to do is find this tag and take the content. For Sample:
res = soup.find('meta', {"name":"description"})
print(res['content'])

Why is my CSS selector not working with beautifulsoup but works fine as a chrome console query?

I have a css selector that works fine when executing it in the chrome JS Console, but does not work when running it through beautifulsoup on one example, yet works on another (I'm unable to discern the difference between the two).
url_1 = 'https://www.amazon.com/s?k=bacopa&page=1'
url_2 = 'https://www.amazon.com/s?k=acorus+calamus&page=1'
The following query works fine on both when executing it in the chrome console.
document.querySelectorAll('div.s-result-item')
Then running the two urls through beautifulsoup, this is the output I get.
url_1 (works)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get(url_1, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
listings = soup .select('div.s-result-item')
print(len(listings))
output: 53 (correct)
url_2 (does not work)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get(url_2, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
listings = soup.select('div.s-result-item')
print(len(listings))
output: 0 (incorrect - expected: 49)
Does anyone know what might be going on here and how I can get the css selector to work with beautifulsoup?
I think it is the html. Change the parser to 'lxml'. You can also shorten your css selector to just class and re-use connection with Session object for efficiency.
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://www.amazon.com/s?k=bacopa&page=1','https://www.amazon.com/s?k=acorus+calamus&page=1']
with requests.Session() as s:
for url in urls:
r = s.get(url, headers = {'User-Agent' : 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
listings = soup.select('.s-result-item')
print(len(listings))
Try selenium library to download the webpage
from selenium import webdriver
from bs4 import BeautifulSoup
url_1 = 'https://www.amazon.com/s?k=bacopa&page=1'
url_2 = 'https://www.amazon.com/s?k=acorus+calamus&page=1'
#set chrome webdriver path
driver = webdriver.Chrome('/usr/bin/chromedriver')
#download webpage
driver.get(url_2)
soup = BeautifulSoup(driver.page_source, 'html.parser')
listings = soup.find_all('div',{'class':'s-result-item'})
print(len(listings))
O/P:
url_1: 50
url_2 : 48

google search using bs4,python

I wanted to pick up the address of "Spotlight 29 casino address" through Google search in the python script. Why isn't my code working?
from bs4 import BeautifulSoup
# from googlesearch import search
import urllib.request
import datetime
article='spotlight 29 casino address'
url1 ='https://www.google.co.in/#q='+article
content1 = urllib.request.urlopen(url1)
soup1 = BeautifulSoup(content1,'lxml')
#print(soup1.prettify())
div1 = soup1.find('div', {'class':'Z0LcW'}) #get the div where it's located
# print (datetime.datetime.now(), 'street address: ' , div1.text)
print (div1)
Pastebin Link
Google uses javascript rendering for that purpose, that's why you don't receive that div with urllib.request.urlopen.
As solution you can use selenium - python library for emulating browser. Install it using 'pip install selenium' console command, then code like this shall work:
from bs4 import BeautifulSoup
from selenium import webdriver
article = 'spotlight 29 casino address'
url = 'https://www.google.co.in/#q=' + article
driver = webdriver.Firefox()
driver.get(url)
html = BeautifulSoup(driver.page_source, "lxml")
div = html.find('div', {'class': 'Z0LcW'})
print(div.text)
If you want to get google search result. Selenium with Python is more simple way.
below is simple code.
from selenium import webdriver
import urllib.parse
from bs4 import BeautifulSoup
chromedriver = '/xxx/chromedriver' #xxx is chromedriver in your installed path
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
article='spotlight 29 casino address'
driver.get("https://www.google.co.in/#q="+urllib.parse.quote(article))
# driver.page_source <-- html source, you can parser it later.
soup = BeautifulSoup(driver.page_source, 'lxml')
div = soup.find('div',{'class':'Z0LcW'})
print(div.text)
driver.quit()
You were getting an empty div because if you were using requests library the default user-agent is python-requests thus your request is being blocked by Google (in this case) and you received a different HTML with different elements. User-agent fakes "real" user visit.
You can achieve it without selenium if the address was in HTML code (which in this case it is) by passing user-agent into request headers:
headers = {
"User-agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
requests.get("YOUR_URL", headers=headers)
Here's code and full example:
from bs4 import BeautifulSoup
import requests, lxml
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
response = requests.get("https://www.google.com/search?q=spotlight 29 casino address", headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
print(soup.select_one(".sXLaOe").text)
# 46-200 Harrison Pl, Coachella, CA 92236
P.S. There's a dedicated web scraping blog of mine. If you need to parse search engines, have a try using SerpApi.
Disclaimer, I work for SerpApi.

Categories

Resources