Web scrape Google search results using BeautifulSoup - python

My goal is to web scrape Google search results using BeautifulSoup. I am using Anaconda Python and use IPython as the IDE console. Why don't I get any output when I run the following command?
def google_scrape(query):
    """Request a Google results page for *query* and print each hit.

    For every ``li`` element with class ``g`` the href of its first link and
    its ``span`` with class ``st`` are printed.  ``linkdictionary`` is
    returned, but nothing is ever stored in it.
    """
    user_agent = 'Mosilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
    encoded_query = urllib.quote_plus(query)
    address = "http://www.google.com/search?q=%s&num=100&hl=en&start=0" % (encoded_query)
    request = urllib2.Request(address, None, {'User-Agent': user_agent})
    page = urllib2.urlopen(request).read()
    soup = BeautifulSoup(page)
    linkdictionary = {}
    for result_item in soup.findAll('li', attrs={'class': 'g'}):
        anchor = result_item.find('a')
        print(anchor['href'])
        snippet = result_item.find('span', attrs={'class': 'st'})
        print(snippet)
    return linkdictionary


if __name__ == '__main__':
    links = google_scrape('english')

You are never adding anything to linkdictionary
def google_scrape(query):
    """Request a Google results page for *query* and record link and snippet.

    Returns:
        dict: ``linkdictionary`` with the keys ``'href'`` (href of the first
        link of a result) and ``'sSpan'`` (its ``span`` with class ``st``).
    """
    address = "http://www.google.com/search?q=%s&num=100&hl=en&start=0" % (urllib.quote_plus(query))
    request = urllib2.Request(address, None, {'User-Agent':'Mosilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'})
    urlfile = urllib2.urlopen(request)
    page = urlfile.read()
    soup = BeautifulSoup(page)
    linkdictionary = {}
    for li in soup.findAll('li', attrs={'class':'g'}):
        sLink = li.find('a')
        sSpan = li.find('span', attrs={'class':'st'})
        # Store into the dict that is actually defined and returned; the
        # misspelled names used before raised NameError.
        # NOTE(review): the same two keys are reused on every iteration, so
        # only the last result remains in the dict.
        linkdictionary['href'] = sLink['href']
        linkdictionary['sSpan'] = sSpan
    return linkdictionary


if __name__ == '__main__':
    links = google_scrape('english')

The problem, as Cody Bouche mentioned, is that nothing is ever added to the dict().
In my opinion, you'll have a hard time updating your dict unless you change {} (dict) to [] (list).
Appending to a list is much simpler (note: I could be wrong here, it's just a personal opinion from previous experience).
To make it work in a simple manner, change the dict to a list ({} --> []) and then use .append({}) to append each result to the list.
Code and example in the online IDE:
def google_scrape(query):
    """Print the title and link of each Google result for *query*.

    Every result container (``div`` with class ``tF2Cxc``) is printed as it
    is found; afterwards the whole list of ``{'title', 'link'}`` dicts is
    printed as JSON.  Uses a ``headers`` dict that is defined outside this
    snippet.
    """
    # Pass the query through ``params`` so requests URL-encodes it; spliced
    # into the URL by hand, characters such as '&' or '#' would break it.
    html = requests.get('https://www.google.com/search', params={'q': query}, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    data = []
    for container in soup.findAll('div', class_='tF2Cxc'):
        title = container.select_one('.DKV0Md').text
        link = container.find('a')['href']
        data.append({
            'title': title,
            'link': link,
        })
        print(f'{title}\n{link}')
    print(json.dumps(data, indent=2))


google_scrape('english')
# part of the outputs:
'''
English language - Wikipedia
https://en.wikipedia.org/wiki/English_language
[
{
"title": "English language - Wikipedia",
"link": "https://en.wikipedia.org/wiki/English_language"
},
]
'''
If you still want to append to dict() then this is one of the ways of approaching this (only part of the for loop shown):
# Build one fresh dict per result container and print it as JSON.
for result_block in soup.findAll('div', class_='tF2Cxc'):
    entry = {}
    heading_text = result_block.select_one('.DKV0Md').text
    first_href = result_block.find('a')['href']
    # add the "title" key
    entry['title'] = heading_text
    # add the "link" key
    entry['link'] = first_href
    print(json.dumps(entry, indent = 2))
# part of the output:
'''
{
"title": "Minecraft Official Site | Minecraft",
"link": "https://www.minecraft.net/en-us/"
}
'''
Alternatively, you can do the same thing using Google Search Engine Results API from SerpApi. It's a paid API with a free plan.
Essentially, it does the same thing as the code above, but you don't have to figure out how to scrape certain elements: that is already done for the end user and returned as JSON, so the only thing left to do is to iterate over the JSON and pick out the desired output.
Code to integrate:
from serpapi import GoogleSearch
import json
# Parameters handed to the SerpApi client.
params = {
    "api_key": "YOUR_API_KEY",
    "engine": "google",
    "q": "minecraft",
}

search = GoogleSearch(params)
results = search.get_dict()

# Pretty-print every organic result, keeping non-ASCII characters readable.
for result in results['organic_results']:
    pretty = json.dumps(result, indent = 2, ensure_ascii = False)
    print(pretty)
# part of the json output:
'''
{
"position": 1,
"title": "Minecraft - Aplikasi di Google Play",
"link": "https://play.google.com/store/apps/details?id=com.mojang.minecraftpe&hl=in&gl=US",
"displayed_link": "https://play.google.com › store › apps › details › id=co...",
"rich_snippet": {
"top": {
"detected_extensions": {
"skor": 46,
"suara": 4144655,
"us": 749
},
"extensions": [
"Skor: 4,6",
"‎4.144.655 suara",
"‎US$7,49",
"‎Android",
"‎Game"
]
}
}
'''
Disclaimer, I work for SerpApi.

Related

How to scrape the URL, Title, and Description of Google Search Results

I'm using selenium to first ask Google a question and then scrape the first few results. I'm trying to add all URLs, Titles, and Descriptions to a Dict which I can then access later. Unfortunately, I can't get it to work - returns 'No Data Found'. Does anyone have an idea of what may be the issue?
Here is what I'm doing:
options = Options()
options.add_argument("--headless")


def googleSearch(query):
    """Search Google for *query* in headless Chrome and collect result data.

    Probes ``s_len`` result blocks by XPath; a block whose elements cannot
    be located is skipped.

    Returns:
        dict: maps the block index (as a string) to a dict with the keys
        "domain", "url", "title" and "description".

    Raises:
        Exception: "No data found" when no block yielded any data.
    """
    # specifying browser web driver
    driver = webdriver.Chrome(options=options, executable_path='chromedriver')
    # search query
    search_engine = "https://www.google.com/search?q="
    query = query.replace(" ","+")
    driver.get(search_engine + query + "&start=" + "0")
    # stored data
    # which will be returned by this function
    data = {}
    # number of result blocks probed on the first page
    s_len = 5
    for s_block in range(s_len):
        # result block; XPath attribute tests are written with "@"
        # NOTE(review): range() starts at 0 and "yuRUbf" is tested as an id
        # here; both are kept exactly as in the question.
        content_block_xpath = f'''//*[@id="yuRUbf"]/div[{s_block}]/div/div'''
        # xpaths
        xpath_url = f"""{content_block_xpath}/div[1]/a"""
        xpath_title = f"""{content_block_xpath}/div[1]/a/h3"""
        xpath_description = f"""{content_block_xpath}/div[2]/span/span"""
        try:
            # store data collected of each s_block to block {}
            block = {}
            # find url of content
            url = driver.find_element(By.XPATH, xpath_url)
            url = url.get_attribute('href')
            # NOTE(review): `links` is not defined in this snippet, and `url`
            # already holds the href value at this point; kept as posted.
            links.append(url.get('href'))
            # find domain name of web having content
            pattern = r"""(https?:\/\/)?(([a-z0-9-_]+\.)?([a-z0-9-_]+\.[a-z0-9-_]+))"""
            domain = re.search(pattern, url)[0]
            print(links)
            # find title of content
            # title = driver.find_element_by_xpath(xpath_title)
            title = driver.find_element(By.XPATH, xpath_title)
            title = title.get_attribute("innerText")
            # find description of content
            # description = driver.find_element_by_xpath(xpath_description)
            description = driver.find_element(By.XPATH, xpath_description)
            description = description.get_attribute("innerText")
            # save all data to block {}
            block["domain"] = domain
            block["url"] = url
            block["title"] = title
            block["description"] = description
            # save block dictionary to main dictionary
            data[f'{s_block}'] = block
        except exceptions.NoSuchElementException:
            continue
    if len(data) == 0:
        raise Exception("No data found")
    driver.close()
    return data
def getQuery():
    """Run googleSearch() on a fixed sample question and print the result."""
    found = googleSearch(str('How to change a car tire'))
    print(found)


getQuery()
I see two problems:
a mix-up with class and id regarding the use of "yuRUbf"
indexing in xpath starts at 1 and not 0
I also don't get the same hierarchical structure as you, but that's just a tweak.
The following produces reasonable results for me:
# Pick the s_block-th element whose class is "yuRUbf"; XPath attribute tests
# use "@", and the other paths are built relative to that block.
content_block_xpath = f'''(//*[@class="yuRUbf"])[{s_block}]'''
xpath_url = f"""{content_block_xpath}/a"""
xpath_title = f"""{content_block_xpath}/a/h3"""
xpath_description = f"""{content_block_xpath}/a//cite/span"""
You can scrape Google Search using only the BeautifulSoup web scraping library, without the Selenium web driver, because the data is not rendered by JavaScript; this also speeds up the script.
Here's how you can extract title, link and a snippet (description) from Google search results using bs4 and requests packages:
params = {
    "q": "How to change a car tire",  # query example
    "hl": "en",                       # language
    "gl": "uk",                       # country of the search, UK -> United Kingdom
    "start": 0,                       # result offset; 0 is the first page
    #"num": 100                       # parameter defines the maximum number of results to return.
}

html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, 'lxml')

for result in soup.select(".tF2Cxc"):
    title = result.select_one(".DKV0Md").text
    # select_one() gives None when a result has no snippet; test for that
    # explicitly instead of hiding every possible error behind a bare except.
    snippet_tag = result.select_one(".lEBKkf span")
    snippet = snippet_tag.text if snippet_tag is not None else None
    links = result.select_one(".yuRUbf a")["href"]
You can also extract not only the first page but all the remaining pages, using pagination with an infinite while loop.
In this case, pagination is possible as long as the next button exists (determined by the presence of a button selector on the page, in our case the CSS selector .d6cvqb a[id=pnnext]). If it is present, you increase the value of ["start"] by 10 to access the next page (this may be called non-token pagination); otherwise, you exit the while loop:
if soup.select_one('.d6cvqb a[id=pnnext]'):
params["start"] += 10
else:
break
Check code in the online IDE
from bs4 import BeautifulSoup
import requests, json, lxml
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "How to change a car tire",  # query example
    "hl": "en",                       # language
    "gl": "uk",                       # country of the search, UK -> United Kingdom
    "start": 0,                       # result offset; 0 is the first page
    #"num": 100                       # parameter defines the maximum number of results to return.
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

page_num = 0
data = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')

    for result in soup.select(".tF2Cxc"):
        title = result.select_one(".DKV0Md").text
        # select_one() gives None when a result has no snippet; test for
        # that explicitly instead of swallowing every error with a bare except.
        snippet_tag = result.select_one(".lEBKkf span")
        snippet = snippet_tag.text if snippet_tag is not None else None
        links = result.select_one(".yuRUbf a")["href"]

        data.append({
            "title": title,
            "snippet": snippet,
            "links": links
        })

    # keep paginating while the "next" link is present
    if soup.select_one(".d6cvqb a[id=pnnext]"):
        params["start"] += 10
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "How Long Do Tires Last and When Should I Replace Them?",
"snippet": "As a general rule, we recommend every 5,000-7,000 miles, but it depends on numerous factors, including your car's alignment. You can read more on The Drive's ...",
"links": "https://www.thedrive.com/cars-101/35041/how-long-do-tires-last"
},
{
"title": "Car Tire Valve Stem Replacement - iFixit Repair Guide",
"snippet": "Step 1 Car Tire Valve Stem · Locate the stem valve and remove the cap. · Using the Schrader valve core bit in your 1/4\" driver, unscrew the valve core from the ...",
"links": "https://www.ifixit.com/Guide/Car+Tire+Valve+Stem+Replacement/121415"
},
other results ...
]
Also you can use Google Search Engine Results API from SerpApi. It's a paid API with the free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import json, os
params = {
    "api_key": "...",                 # serpapi key, https://serpapi.com/manage-api-key
    "engine": "google",               # serpapi parser engine
    "q": "How to change a car tire",  # search query
    "gl": "uk",                       # country of the search, UK -> United Kingdom
    "num": "100"                      # number of results per page (100 per page in this case)
    # other search parameters: https://serpapi.com/search-api#api-parameters
}

search = GoogleSearch(params)         # where data extraction happens

organic_results_data = []
page_num = 0

while True:
    results = search.get_dict()       # JSON -> Python dictionary

    # Report an error response and stop, as the eBay examples do, instead of
    # failing with KeyError on the missing "organic_results" key.
    if "error" in results:
        print(results["error"])
        break

    page_num += 1

    for result in results.get("organic_results", []):
        organic_results_data.append({
            "title": result.get("title"),
            "snippet": result.get("snippet"),
            "link": result.get("link")
        })

    next_link = results.get("serpapi_pagination", {}).get("next_link")
    if not next_link:
        break
    # copy the query parameters of the next page's URL into the search object
    search.params_dict.update(dict(parse_qsl(urlsplit(next_link).query)))

print(json.dumps(organic_results_data, indent=2, ensure_ascii=False))
Output:
[
{
"title": "Today: can you safely change a tire with passengers on board?",
"snippet": "RAY: In any case, the primary danger during a tire change is that the vehicle will slip off the jack and injure the tire changer.",
"link": "https://www.cartalk.com/content/today-can-you-safely-change-tire-passengers-board"
},
{
"title": "How to Change a Flat Tire - Mercedes-Benz Burlington",
"snippet": "How to Switch a Tire in 5 Steps · Secure the wheel wedges against the tires on the opposite side of the flat tire. · Remove the hubcap or wheel ...",
"link": "https://www.mercedes-benz-burlington.ca/how-to-change-a-flat-tire/"
},
other results...
]

Scrape eBay Sold Items Using Selenium Returns []

I have almost no webscraping experience, and wasn't able to solve this using BeautifulSoup, so I'm trying selenium (installed it today). I'm trying to scrape sold items on eBay. I'm trying to scrape:
https://www.ebay.com/sch/i.html?_from=R40&_nkw=oakley+sunglasses&_sacat=0&Brand=Oakley&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=200&_oaa=1&_fsrp=1&_dcat=79720
Here is my code where I load in html code and convert to selenium html:
# Search-results URL from the question (query string includes LH_Sold=1 and LH_Complete=1).
ebay_url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=oakley+sunglasses&_sacat=0&Brand=Oakley&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=200&_oaa=1&_fsrp=1&_dcat=79720'
# Fetch the page with requests; in this snippet the response is only used by
# the commented-out debug print below.
html = requests.get(ebay_url)
#print(html.text)
# Start Chrome through the local chromedriver binary and open the same URL.
driver = wd.Chrome(executable_path=r'/Users/mburley/Downloads/chromedriver')
driver.get(ebay_url)
Which correctly opens a new chrome session at the correct url. I'm working on getting the titles, prices, and date sold and then loading it into a csv file. Here is the code I have for those:
# Find all div tags and set equal to main_data
# Find the item containers and store them in all_items (the first one is skipped)
# NOTE(review): find_elements_by_class_name() is given two space-separated
# class names here; kept exactly as posted in the question.
all_items = driver.find_elements_by_class_name("s-item__info clearfix")[1:]
#print(main_data)
# Loop over all_items to extract title, price, and date.
# XPath attribute tests are written with "@", and every contains(...) is closed.
for item in all_items:
    date = item.find_element_by_xpath("//span[contains(@class, 'POSITIVE')]").text.strip()
    title = item.find_element_by_xpath("//h3[contains(@class, 's-item__title s-item__title--has-tags')]").text.strip()
    price = item.find_element_by_xpath("//span[contains(@class, 's-item__price')]").text.strip()
    print('title:', title)
    print('price:', price)
    print('date:', date)
    print('---')
    data.append( [title, price, date] )
Which just returns []. I think ebay may be blocking my IP, but the html code loads in and looks correct. Hopefully someone can help! Thanks!
It is not necessary to use Selenium for eBay scraping, as the data is not rendered by JavaScript and thus can be extracted from plain HTML. It is enough to use the BeautifulSoup web scraping library.
Keep in mind that problems with site parsing may arise when you request a site multiple times. eBay may decide that the requests are sent by a bot (not a real user).
One way to avoid this is to send headers that contain a user-agent in the request; the site will then assume that you're a user and display the information.
An additional step is to rotate those user-agents. The ideal scenario is to use proxies in combination with rotated user-agents (plus a CAPTCHA solver).
from bs4 import BeautifulSoup
import requests, json, lxml
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}

params = {
    '_nkw': 'oakley+sunglasses',  # search query
    'LH_Sold': '1',               # shows sold items
    '_pgn': 1                     # page number
}

data = []

while True:
    page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for products in soup.select(".s-item__info"):
        title = products.select_one(".s-item__title span").text
        price = products.select_one(".s-item__price").text
        link = products.select_one(".s-item__link")["href"]

        data.append({
            "title" : title,
            "price" : price,
            "link" : link
        })

    # go on to the next page while a "next" pagination element exists
    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

# the closing parenthesis of print() was missing here (SyntaxError)
print(json.dumps(data, indent=2, ensure_ascii=False))
Example output
Extracting page: 1
----------
[
{
"title": "Shop on eBay",
"price": "$20.00",
"link": "https://ebay.com/itm/123456?hash=item28caef0a3a:g:E3kAAOSwlGJiMikD&amdata=enc%3AAQAHAAAAsJoWXGf0hxNZspTmhb8%2FTJCCurAWCHuXJ2Xi3S9cwXL6BX04zSEiVaDMCvsUbApftgXEAHGJU1ZGugZO%2FnW1U7Gb6vgoL%2BmXlqCbLkwoZfF3AUAK8YvJ5B4%2BnhFA7ID4dxpYs4jjExEnN5SR2g1mQe7QtLkmGt%2FZ%2FbH2W62cXPuKbf550ExbnBPO2QJyZTXYCuw5KVkMdFMDuoB4p3FwJKcSPzez5kyQyVjyiIq6PB2q%7Ctkp%3ABlBMULq7kqyXYA"
},
{
"title": "Oakley X-metal Juliet Men's Sunglasses",
"price": "$280.00",
"link": "https://www.ebay.com/itm/265930582326?hash=item3deab2a936:g:t8gAAOSwMNhjRUuB&amdata=enc%3AAQAHAAAAoH76tlPncyxembf4SBvTKma1pJ4vg6QbKr21OxkL7NXZ5kAr7UvYLl2VoCPRA8KTqOumC%2Bl5RsaIpJgN2o2OlI7vfEclGr5Jc2zyO0JkAZ2Gftd7a4s11rVSnktOieITkfiM3JLXJM6QNTvokLclO6jnS%2FectMhVc91CSgZQ7rc%2BFGDjXhGyqq8A%2FoEyw4x1Bwl2sP0viGyBAL81D2LfE8E%3D%7Ctkp%3ABk9SR8yw1LH9YA"
},
{
"title": " Used Oakley PROBATION Sunglasses Polished Gold/Dark Grey (OO4041-03)",
"price": "$120.00",
"link": "https://www.ebay.com/itm/334596701765?hash=item4de7847e45:g:d5UAAOSw4YtjTfEE&amdata=enc%3AAQAHAAAAoItMbbzfQ74gNUiinmOVnzKlPWE%2Fc54B%2BS1%2BrZpy6vm5lB%2Bhvm5H43UFR0zeCU0Up6sPU2Wl6O6WR0x9FPv5Y1wYKTeUbpct5vFKu8OKFBLRT7Umt0yxmtLLMWaVlgKf7StwtK6lQ961Y33rf3YuQyp7MG7H%2Fa9fwSflpbJnE4A9rLqvf3hccR9tlWzKLMj9ZKbGxWT17%2BjyUp19XIvX2ZI%3D%7Ctkp%3ABk9SR8yw1LH9YA"
},
As an alternative, you can use Ebay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code that paginates through all pages:
from serpapi import EbaySearch
import os, json
params = {
    "api_key": os.getenv("API_KEY"),  # serpapi api key
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.com",        # ebay domain
    "_nkw": "oakley+sunglasses",      # search query
    "_pgn": 1,                        # page number
    "LH_Sold": "1"                    # shows sold items
}

search = EbaySearch(params)           # where data extraction happens

data = []
page_num = 0

while True:
    results = search.get_dict()       # JSON -> Python dict

    # an error response is printed and ends the pagination
    if "error" in results:
        print(results["error"])
        break

    # keep only price and link of each organic result
    data.extend(
        {"price": item.get("price"), "link": item.get("link")}
        for item in results.get("organic_results", [])
    )

    page_num += 1
    print(page_num)

    # no "next" entry in the pagination data means this was the last page
    if "next" not in results.get("pagination", {}):
        break
    params['_pgn'] += 1

print(json.dumps(data, indent=2))
Output:
[
{
"price": {
"raw": "$68.96",
"extracted": 68.96
},
"link": "https://www.ebay.com/itm/125360598217?epid=20030526224&hash=item1d3012ecc9:g:478AAOSwCt5iqgG5&amdata=enc%3AAQAHAAAA4Ls3N%2FEH5OR6w3uoTlsxUlEsl0J%2B1aYmOoV6qsUxRO1d1w3twg6LrBbUl%2FCrSTxNOjnDgIh8DSI67n%2BJe%2F8c3GMUrIFpJ5lofIRdEmchFDmsd2I3tnbJEqZjIkWX6wXMnNbPiBEM8%2FML4ljppkSl4yfUZSV%2BYXTffSlCItT%2B7ZhM1fDttRxq5MffSRBAhuaG0tA7Dh69ZPxV8%2Bu1HuM0jDQjjC4g17I3Bjg6J3daC4ZuK%2FNNFlCLHv97w2fW8tMaPl8vANMw8OUJa5z2Eclh99WUBvAyAuy10uEtB3NDwiMV%7Ctkp%3ABk9SR5DKgLD9YA"
},
{
"price": {
"raw": "$62.95",
"extracted": 62.95
},
"link": "https://www.ebay.com/itm/125368283608?epid=1567457519&hash=item1d308831d8:g:rnsAAOSw7PJiqMQz&amdata=enc%3AAQAHAAAA4AwZhKJZfTqrG8VskZL8rtfsuNtZrMdWYpndpFs%2FhfrIOV%2FAjLuzNzaMNIvTa%2B6QUTdkOwTLRun8n43cZizqtOulsoBLQIwy3wf19N0sHxGF5HaIDOBeW%2B2sobRnzGdX%2Fsmgz1PRiKFZi%2BUxaLQpWCoGBf9n8mjcsFXi3esxbmAZ8kenO%2BARbRBzA2Honzaleb2tyH5Tf8%2Bs%2Fm5goqbon%2FcEsR0URO7BROkBUUjDCdDH6fFi99m6anNMMC3yTBpzypaFWio0u2qu5TgjABUfO1wzxb4ofA56BNKjoxttb7E%2F%7Ctkp%3ABk9SR5DKgLD9YA"
},
# ...
]
Disclaimer, I work for SerpApi.
You can use the below code to scrape the details. also you can use pandas to store data in csv file.
Code :
ebay_url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=oakley+sunglasses&_sacat=0&Brand=Oakley&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=200&_oaa=1&_fsrp=1&_dcat=79720'
html = requests.get(ebay_url)
# print(html.text)
driver = wd.Chrome(executable_path=r'/Users/mburley/Downloads/chromedriver')
driver.maximize_window()
driver.implicitly_wait(30)
driver.get(ebay_url)
wait = WebDriverWait(driver, 20)

sold_date = []
title = []
price = []

# XPath of the element whose text is stored as the sold date; the title and
# price paths are built from it.  XPath attribute tests are written with "@".
sold_tag_xpath = "//div[contains(@class,'title--tagblock')]/span[@class='POSITIVE']"

# XPath positions are 1-based, so count from 1.
for i, item in enumerate(driver.find_elements(By.XPATH, sold_tag_xpath), start=1):
    sold_date.append(item.text)
    # Use find_element(By.XPATH, ...) throughout, matching the call above.
    title.append(driver.find_element(By.XPATH, f"({sold_tag_xpath}/ancestor::div[contains(@class,'tag')]/following-sibling::a/h3)[{i}]").text)
    price.append(item.find_element(By.XPATH, f"({sold_tag_xpath}/ancestor::div[contains(@class,'tag')]/following-sibling::div[contains(@class,'details')]/descendant::span[@class='POSITIVE'])[{i}]").text)

print(sold_date)
print(title)
print(price)

data = {
    'Sold_date': sold_date,
    'title': title,
    'price': price
}
df = pd.DataFrame.from_dict(data)
# write the rows without the DataFrame index column
df.to_csv('out.csv', index=False)
Imports :
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

.findAll() finding things not constantly

I tried to make a little Beautiful Soup Script, to analyze prices on eBay. So the problem is, that my soup.findAll() that should find the prices, is sometimes working, sometimes not and I am wondering why. So here is my code:
import requests
from bs4 import BeautifulSoup
from requests.models import encode_multipart_formdata
# Ask for the product name and turn it into a "+"-separated keyword string.
article = input("Product:")
keywords = article.strip().replace(" ", "+")
# ebay.de search URL with the keywords inserted as the _nkw parameter.
URL_s = "https://www.ebay.de/sch/i.html?_dmd=1&_fosrp=1&LH_SALE_CURRENCY=0&_sop=12&_ipg=50&LH_Complete=1&LH_Sold=1&_sadis=10&_from=R40&_sacat=0&_nkw=" + keywords + "&_dcat=139971&rt=nc&LH_ItemCondition=3"
# Download the page (no custom headers are sent) and parse it; no parser is named here.
source = requests.get(URL_s).text
soup = BeautifulSoup(source)
# Collect every span whose class attribute is 'bold bidsold'.
prices = soup.findAll('span', class_='bold bidsold')
# ^ this line sometimes finds the prices, sometimes it just produces an empty list ^
help would be very welcome, hope you are doing well, bye bye :)
If you look at the variable soup, and open the results as an html page you would see something like this:
This means the ebay has some sort of a filtering mechanism to prevent scraping, and requires you to somehow confirm your identity. This is why your query for prices returns empty.
Maybe the prices are rendered by JavaScript. Requests does not wait for the JavaScript to be loaded.
That's why you should use other modules, such as Selenium or DryScrape.
When using requests, the request may be blocked because the default user-agent of the requests library is python-requests. To make the website treat the request as coming from a real user rather than a bot or script, you need to pass your real User-Agent in the headers.
You can also read Reducing the chance of being blocked while web scraping blog post to learn about other options for solving this problem.
If you want to collect all the information from all pages, you can use a while loop that dynamically paginates all pages.
The while loop will be executed until the stop command appears, in our case, the loop termination command will be to check for the presence of the next page, for which the CSS selector “.pagination__next” is responsible.
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
}

query = input('Your query is: ')  # "shirt" for example

params = {
    '_nkw': query,   # search query
    '_pgn': 1,       # page number
    'LH_Sold': '1'   # shows sold items
}

data = []
page_limit = 10      # page limit (if you need)

while True:
    page = requests.get('https://www.ebay.de/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    # one entry per listing block on the current page
    for listing in soup.select(".s-item__info"):
        title = listing.select_one(".s-item__title span").text
        price = listing.select_one(".s-item__price").text
        data.append({"title" : title, "price" : price})

    # stop at the page limit or when there is no "next" pagination element
    reached_limit = params['_pgn'] == page_limit
    has_next_page = soup.select_one(".pagination__next")
    if reached_limit or not has_next_page:
        break
    params['_pgn'] += 1

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "CECIL Pullover Damen Hoodie Sweatshirt Gr. L (DE 44) Baumwolle flieder #7902fa2",
"price": "EUR 17,64"
},
{
"title": "Shirt mit Schlangendruck & Strass \"cyclam\" Gr. 40 UVP: 49,99€ 5.65",
"price": "EUR 6,50"
},
{
"title": "Fender Guitars Herren T-Shirt von Difuzed - Größe Medium blau auf blau - Sehr guter Zustand",
"price": "EUR 10,06"
},
other results ...
]
As an alternative, you can use Ebay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code with pagination:
from serpapi import EbaySearch
import os, json
query = input('Your query is: ')  # "shirt" for example

params = {
    "api_key": "...",             # serpapi key, https://serpapi.com/manage-api-key
    "engine": "ebay",             # search engine
    "ebay_domain": "ebay.com",    # ebay domain
    "_nkw": query,                # search query
    "_pgn": 1,                    # page number; must exist because the loop below increments it
    # "LH_Sold": "1"              # shows sold items
}

search = EbaySearch(params)       # where data extraction happens

page_num = 0
data = []

while True:
    results = search.get_dict()   # JSON -> Python dict

    # an error response is printed and ends the pagination
    if "error" in results:
        print(results["error"])
        break

    for organic_result in results.get("organic_results", []):
        title = organic_result.get("title")
        price = organic_result.get("price")

        data.append({
            "price" : price,
            "title" : title
        })

    page_num += 1
    print(page_num)

    # without the "_pgn" entry above this increment raised KeyError
    if "next" in results.get("pagination", {}):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2))
Output:
[
{
"price": {
"raw": "EUR 17,50",
"extracted": 17.5
},
"title": "Mensch zweiter Klasse Gesund und ungeimpft T-Shirt"
},
{
"price": {
"raw": "EUR 14,90",
"extracted": 14.9
},
"title": "Sprüche Shirt Lustige T-Shirts für Herren oder Unisex Kult Fun Gag Handwerker"
},
# ...
]
There's a blog post, 13 ways to scrape any public data from any website, if you want to know more about website scraping.

How to convert values into nested JSON in python?

I scrape certain values using beautifulsoup. I want to convert it into Nested JSON format.
Following is my value structure.
category = development
heading = Complete Python Bootcamp | Deep Learning Into Python Coding
image = https://i.udemycdn.com/course/750x422/3871500_3d01_3.jpg
link = https://www.udemy.com/course/complete-python-bootcamp-deep-learning-into-python-coding
category = development
heading = C++ Complete Course For Beginners
image = https://i.udemycdn.com/course/750x422/3847698_8547_2.jpg
link = https://www.udemy.com/course/c-complete-course-for-beginners/?couponCode=FREE2021
category = it-software
heading = TB0-116 TIBCO Enterprise Message Service 6 Practice Exam
image = https://i.udemycdn.com/course/750x422/2931054_d555.jpg
link = https://www.udemy.com/course/tb0-116-tibco-enterprise-message-service-6-practice-exam-t
Expected json output:
[
{
"development":[
{
"heading":" Complete Python Bootcamp | Deep Learning Into Python Coding",
"image":"https://i.udemycdn.com/course/750x422/3871500_3d01_3.jpg",
"courselink":"https://www.udemy.com/course/complete-python-bootcamp-deep-learning"
}
{
"heading":"C++ Complete Course For Beginners",
"image":"https://i.udemycdn.com/course/750x422/3871500_3d01_3.jpg",
"courselink":"https://www.udemy.com/course/complete-python-bootcamp-deep-learning"
}
],
"it-software":[
{
"heading" : "TB0-116 TIBCO Enterprise Message Service 6 Practice Exam",
"image" : "https://i.udemycdn.com/course/750x422/2931054_d555.jpg"
"courselink" : "https://www.udemy.com/course/tb0-116-tibco-enterprise-message-service"
}
],
]
BELOW I ATTACHED MY SCRAPING CODE
def scrapeData(category):
    """Print category, heading and enlarged image URL of each listed course.

    Downloads the coupon-category page for *category* (using a ``headers``
    dict defined outside this snippet) and walks every ``div`` with class
    ``item-holder``.  The coupon link element is looked up but not used.
    """
    base_url = "https://udemycoupon.learnviral.com/coupon-category/"+category+"/"
    print(base_url)
    source = requests.get(base_url, headers=headers).text
    soup = BeautifulSoup(source, 'lxml')
    contents = soup.find_all('div', class_="item-holder")
    print()
    # print(contents)
    for item in contents:
        print(category)
        title_tag = item.find("h3", {"class": "entry-title"})
        heading = title_tag.text.replace("[Free]", "")
        print(heading)
        thumbnail = item.find("div", {"class": "store-image"}).find("img")
        image = thumbnail['src']
        # swap the thumbnail size in the URL for the larger one
        imagelink = image.replace('240x135', '750x422')
        print(imagelink)
        courselink = item.find("a", {"class": "coupon-code-link btn promotion"})
Anyone help me to convert it into my expected format in python.Thanks in advance.
def scrape_category(name):
    """Return the courses found on the coupon-category page *name*.

    Each course is a dict with the keys 'heading', 'image' (with the
    240x135 size replaced by 750x422) and 'courselink' (href of the coupon
    link).
    """
    base_url = 'https://udemycoupon.learnviral.com/coupon-category/' + name + '/'
    soup = BeautifulSoup(requests.get(base_url).text, 'lxml')

    def to_course(item):
        # Turn one "item-holder" block into the course dict.
        heading = item.find('h3', {'class': 'entry-title'}).text.replace('[Free]', '')
        image = item.find('div', {'class': 'store-image'}).find('img')['src']
        course_link = item.find('a', {'class': 'coupon-code-link btn promotion'})
        return {
            'heading': heading,
            'image': image.replace('240x135', '750x422'),
            'courselink': course_link['href'],
        }

    return [to_course(item) for item in soup.find_all('div', class_='item-holder')]


# Map each category name to its list of courses.
result = {}
for category in ('development', 'it-software', ):
    result[category] = scrape_category(category)
print(result) # or print([result])
You can use defaultdict and update the scraping code to create the dictionary object for every new course:
from collections import defaultdict
# Courses grouped by category: category name -> list of course dicts.
main_d = defaultdict(list)
for item in contents:
    print(category)
    heading=item.find("h3",{"class":"entry-title"}).text.replace("[Free]","")
    print(heading)
    image=item.find("div",{"class":"store-image"}).find("img")['src']
    imagelink = image.replace('240x135', '750x422')
    print(imagelink)
    courselink = item.find("a", {"class":"coupon-code-link btn promotion"})
    # Store the resized image URL and the link's href, so the values are
    # plain strings like those in the expected output, not the original
    # thumbnail URL and a Tag object.
    d = {"heading": heading, "image": imagelink, "courselink": courselink['href']}
    main_d[category].append(d)
main_d will be a dictionary object with following structure:
{
"development":[
{
"heading":" Complete Python Bootcamp | Deep Learning Into Python Coding",
"image":"https://i.udemycdn.com/course/750x422/3871500_3d01_3.jpg",
"courselink":"https://www.udemy.com/course/complete-python-bootcamp-deep-learning"
}
{
"heading":"C++ Complete Course For Beginners",
"image":"https://i.udemycdn.com/course/750x422/3871500_3d01_3.jpg",
"courselink":"https://www.udemy.com/course/complete-python-bootcamp-deep-learning"
}
],
"it-software":[
{
"heading" : "TB0-116 TIBCO Enterprise Message Service 6 Practice Exam",
"image" : "https://i.udemycdn.com/course/750x422/2931054_d555.jpg"
"courselink" : "https://www.udemy.com/course/tb0-116-tibco-enterprise-message-service"
}
],
}
Note: This is not a tested code and might require some modifications to make it work correctly.

How to scrape all results from Google search results pages (Python/Selenium ChromeDriver)

I am working on a Python script using selenium chromedriver to scrape all google search results (link, header, text) off a specified number of results pages.
The code I have seems to only be scraping the first result from all pages after the first page.
I think this has something to do with how my for-loop is set up in the scrape function, but I have not been able to tweak it into working the way I'd like it to. Any suggestions for how to fix/ better approach this appreciated.
# create instance of webdriver and open the Google start page
driver = webdriver.Chrome()
url = 'https://www.google.com'
driver.get(url)
# set keyword
keyword = 'cars'
# find the search bar using its name attribute value
searchBar = driver.find_element_by_name('q')
# first send the keyword to the search bar, followed by a newline character
searchBar.send_keys(keyword)
searchBar.send_keys('\n')
def scrape():
    """Read header, link and text from the search results on the current page.

    Uses the module-level ``driver``. Waits up to 10 seconds for an element
    with class name "g" to be present; if the wait raises, the exception is
    printed and ``driver.quit()`` is called before execution continues.

    Returns:
        list[dict]: entries with the keys 'header', 'link' and 'text'.
    """
    pageInfo = []
    try:
        # wait for search results to be fetched
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "g"))
        )
    except Exception as e:
        print(e)
        driver.quit()
    # contains the search results
    searchResults = driver.find_elements_by_class_name('g')
    for result in searchResults:
        element = result.find_element_by_css_selector('a')
        link = element.get_attribute('href')
        header = result.find_element_by_css_selector('h3').text
        text = result.find_element_by_class_name('IsZvec').text
        pageInfo.append({
            'header' : header, 'link' : link, 'text': text
        })
        # NOTE(review): indentation reconstructed from the answer that follows
        # this question. With the return at this level the function exits
        # after the first result - this is the defect the question asks about.
        return pageInfo
# Number of pages to scrape
numPages = 5
# All the scraped data
infoAll = []
# Scraped data from page 1
infoAll.extend(scrape())
# Page 1 is already scraped above, so "Next" is clicked numPages - 1 times
for i in range(0 , numPages - 1):
    nextButton = driver.find_element_by_link_text('Next')
    nextButton.click()
    infoAll.extend(scrape())
print(infoAll)
You have an indentation problem:
You need to have return pageInfo outside the for loop; otherwise you return the results after the first loop iteration
for result in searchResults:
    element = result.find_element_by_css_selector('a')
    link = element.get_attribute('href')
    header = result.find_element_by_css_selector('h3').text
    text = result.find_element_by_class_name('IsZvec').text
    pageInfo.append({
        'header' : header, 'link' : link, 'text': text
    })
    return pageInfo
Like this:
for result in searchResults:
    element = result.find_element_by_css_selector('a')
    link = element.get_attribute('href')
    header = result.find_element_by_css_selector('h3').text
    text = result.find_element_by_class_name('IsZvec').text
    pageInfo.append({
        'header' : header, 'link' : link, 'text': text
    })
return pageInfo
I ran your code and got results:
[{'header': 'Cars (film) — Wikipédia', 'link': 'https://fr.wikipedia.org/wiki/Cars_(film)', 'text': "Cars : Quatre Roues, ou Les Bagnoles au Québec (Cars), est le septième long-métrage d'animation entièrement en images de synthèse des studios Pixar.\nPays d’origine : États-Unis\nDurée : 116 minutes\nSociétés de production : Pixar Animation Studios\nGenre : Animation\nCars 2 · Michel Fortin · Flash McQueen"}, {'header': 'Cars - Wikipedia, la enciclopedia libre', 'link': 'https://es.wikipedia.org/wiki/Cars', 'text': 'Cars es una película de animación por computadora de 2006, producida por Pixar Animation Studios y lanzada por Walt Disney Studios Motion Pictures.\nAño : 2006\nGénero : Animación; Aventuras; Comedia; Infa...\nHistoria : John Lasseter Joe Ranft Jorgen Klubi...\nProductora : Walt Disney Pictures; Pixar Animat...'}, {'header': '', 'link': 'https://fr.wikipedia.org/wiki/Flash_McQueen', 'text': ''}, {'header': '', 'link': 'https://www.allocine.fr/film/fichefilm-55774/secrets-tournage/', 'text': ''}, {'header': '', 'link': 'https://fr.wikipedia.org/wiki/Martin_(Cars)', 'text': ''},
Suggestions:
Use a timer to control your for loop, otherwise you could be banned by Google due to suspicious activity
Steps:
1.- Import sleep from time: from time import sleep
2.- On your last loop add a timer:
for i in range(0 , numPages - 1):
    sleep(5) #It'll wait 5 seconds for each iteration
    nextButton = driver.find_element_by_link_text('Next')
    nextButton.click()
    infoAll.extend(scrape())
Google Search can be parsed with BeautifulSoup web scraping library without selenium, since the data is not being loaded dynamically via JavaScript, and will execute much faster in comparison to selenium as there's no need to render the page and use browser.
In order to get information from all pages, you can use pagination using an infinite while loop. Try to avoid using for i in range() pagination as it is a hardcoded way of doing pagination thus not reliable. If the page number would change (from 5 to 20), pagination will be broken.
Since the while loop is infinite, you need to set conditions for exiting it. There are two possible conditions:
the exit condition will be the presence of a button to switch to the next page (it is not on the last page), the presence can be checked by its CSS selector (in our case - ".d6cvqb a[id=pnnext]")
# condition for exiting the loop in the absence of the next page button
if soup.select_one(".d6cvqb a[id=pnnext]"):
    params["start"] += 10
else:
    break
another solution would be to add a limit of pages available for scraping if there is no need to extract all the pages.
# condition for exiting the loop when the page limit is reached
if page_num == page_limit:
    break
When you request a site, it may decide that the request comes from a bot. To avoid this, send headers that contain a user-agent with the request; the site will then assume that you are a user and display the information.
Next step could be to rotate user-agent, for example, to switch between PC, mobile, and tablet, as well as between browsers e.g. Chrome, Firefox, Safari, Edge and so on. The most reliable way is to use rotating proxies, user-agents, and a captcha solver.
Check full code in the online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
# Script: page through Google search results with requests + BeautifulSoup,
# collecting title, snippet and link of every organic result into `data`,
# then print the collected list as JSON.

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "cars",  # query example
    "hl": "en",  # language
    "gl": "uk",  # country of the search, UK -> United Kingdom
    "start": 0,  # result offset: 0 for the first page, increased by 10 per page below
    #"num": 100 # parameter defines the maximum number of results to return.
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

page_limit = 10  # page limit for example
page_num = 0

data = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')

    for result in soup.select(".tF2Cxc"):
        title = result.select_one(".DKV0Md").text

        # select_one() returns None when nothing matches; a result without a
        # snippet is recorded with snippet=None instead of hiding every
        # possible error behind a bare `except:`.
        snippet_node = result.select_one(".lEBKkf span")
        snippet = snippet_node.text if snippet_node is not None else None

        links = result.select_one(".yuRUbf a")["href"]

        data.append({
            "title": title,
            "snippet": snippet,
            "links": links
        })

    # condition for exiting the loop when the page limit is reached
    if page_num == page_limit:
        break

    # condition for exiting the loop in the absence of the next page button
    if soup.select_one(".d6cvqb a[id=pnnext]"):
        params["start"] += 10
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Cars (2006) - IMDb",
"snippet": "On the way to the biggest race of his life, a hotshot rookie race car gets stranded in a rundown town, and learns that winning isn't everything in life.",
"links": "https://www.imdb.com/title/tt0317219/"
},
{
"title": "Cars (film) - Wikipedia",
"snippet": "Cars is a 2006 American computer-animated sports comedy film produced by Pixar Animation Studios and released by Walt Disney Pictures. The film was directed ...",
"links": "https://en.wikipedia.org/wiki/Cars_(film)"
},
{
"title": "Cars - Rotten Tomatoes",
"snippet": "Cars offers visual treats that more than compensate for its somewhat thinly written story, adding up to a satisfying diversion for younger viewers.",
"links": "https://www.rottentomatoes.com/m/cars"
},
other results ...
]
Also you can use Google Search Engine Results API from SerpApi. It's a paid API with a free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import json, os
# Script: page through SerpApi Google results, collecting title, snippet and
# link of every organic result into `organic_results_data`, then print the
# collected list as JSON.
params = {
    "api_key": "...",  # serpapi key from https://serpapi.com/manage-api-key
    "engine": "google",  # serpapi parser engine
    "q": "cars",  # search query
    "gl": "uk",  # country of the search, UK -> United Kingdom
    "num": "100"  # number of results per page (100 per page in this case)
    # other search parameters: https://serpapi.com/search-api#api-parameters
}

search = GoogleSearch(params)  # search object configured with the parameters above

page_limit = 10
organic_results_data = []
page_num = 0

while True:
    results = search.get_dict()  # JSON -> Python dictionary
    page_num += 1

    # .get() with an empty default: a response without "organic_results"
    # contributes nothing instead of raising KeyError.
    for result in results.get("organic_results", []):
        organic_results_data.append({
            "title": result.get("title"),
            "snippet": result.get("snippet"),
            "link": result.get("link")
        })

    # stop once the page limit is reached
    if page_num == page_limit:
        break

    # follow "next_link" while the response provides one; its query string
    # becomes the parameters of the next request
    pagination = results.get("serpapi_pagination", {})
    if "next_link" in pagination:
        search.params_dict.update(dict(parse_qsl(urlsplit(pagination.get("next_link")).query)))
    else:
        break

print(json.dumps(organic_results_data, indent=2, ensure_ascii=False))
Output:
[
{
"title": "Rally Cars - Page 30 - Google Books result",
"snippet": "Some people say rally car drivers are the most skilled racers in the world. Roger Clark, a British rally legend of the 1970s, describes sliding his car down ...",
"link": "https://books.google.co.uk/books?id=uIOlAgAAQBAJ&pg=PA30&lpg=PA30&dq=cars&source=bl&ots=9vDWFi0bHD&sig=ACfU3U1d4R-ShepjsTtWN-b9SDYkW1sTDQ&hl=en&sa=X&ved=2ahUKEwjPv9axu_b8AhX9LFkFHbBaB8c4yAEQ6AF6BAgcEAM"
},
{
"title": "Independent Sports Cars - Page 5 - Google Books result",
"snippet": "The big three American auto makers produced sports and sports-like cars beginning with GMs Corvette and Fords Thunderbird in 1954. Folowed by the Mustang, ...",
"link": "https://books.google.co.uk/books?id=HolUDwAAQBAJ&pg=PA5&lpg=PA5&dq=cars&source=bl&ots=yDaDtQSyW1&sig=ACfU3U11nHeRTwLFORGMHHzWjaVHnbLK3Q&hl=en&sa=X&ved=2ahUKEwjPv9axu_b8AhX9LFkFHbBaB8c4yAEQ6AF6BAgaEAM"
}
other results...
]

Categories

Resources