How to get Google-like URL excerpts in Python

I have written code like this:
# pip3 install google
from googlesearch import search

query = 'java'

for i in search(query,          # The query you want to run
                tld='com',      # The top-level domain
                lang='en',      # The language
                num=10,         # Number of results per page
                start=0,        # First result to retrieve
                stop=None,      # Last result to retrieve
                pause=0,        # Lapse between HTTP requests
                safe='high'
                ):
    print(i)
In the above, I am simply getting the URL link. How can I get Google-like excerpts for each URL, like in the attached screenshot?

I don't think it is possible with that package, but if you use the requests module together with a web-scraping library such as BeautifulSoup, you can get those descriptions from the HTML tag (<meta name="description" content="google description">).
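A minimal sketch of that approach (my own illustration, assuming the target page actually declares such a tag; many pages do not):

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Java_(programming_language)"  # any URL returned by search()
html = requests.get(url, timeout=30).text
soup = BeautifulSoup(html, "html.parser")

# look for the page's own meta description tag
meta = soup.find("meta", attrs={"name": "description"})
if meta and meta.get("content"):
    print(meta["content"])
else:
    print("No meta description found for", url)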

You can scrape it using the requests and bs4 libraries together with a user-agent header.
Make sure you pass a user-agent, because otherwise your request goes out with the library's default user-agent; Google then serves different markup with different class names, and you will get an empty output.
Here's the code, and an example on replit.com (a 'java' search result on repl.it):
from bs4 import BeautifulSoup
import requests
import lxml
import json

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

html = requests.get('https://www.google.com/search?hl=en-US&q=best lasagna recipe ever', headers=headers).text
soup = BeautifulSoup(html, 'lxml')

summary = []

for container in soup.findAll('div', class_='tF2Cxc'):
    heading = container.find('h3', class_='LC20lb DKV0Md').text
    article_summary = container.find('span', class_='aCOpRe').text
    link = container.find('a')['href']

    summary.append({
        'Heading': heading,
        'Article Summary': article_summary,
        'Link': link,
    })

print(json.dumps(summary, indent=2, ensure_ascii=False))
JSON output:
[
  {
    "Heading": "World's Best Lasagna Recipe - Allrecipes.com",
    "Article Summary": "Ingredients. 1 pound sweet Italian sausage. ¾ pound lean ground beef. ½ cup minced onion. 2 cloves garlic, crushed. 1 (28 ounce) can crushed tomatoes. 2 (6 ounce) cans tomato paste. 2 (6.5 ounce) cans canned tomato sauce. ½ cup water.",
    "Link": "https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/"
  },
  {
    "Heading": "The BEST Lasagna Recipe Ever! | The Recipe Critic",
    "Article Summary": "Dec 22, 2019 — The BEST Classic Lasagna Ever has layers of sautéed ground beef and Italian sausage that are cooked together, sweet tomato sauce, Italian ...",
    "Link": "https://therecipecritic.com/lasagna-recipe/"
  },
  {
    "Heading": "The Most Amazing Lasagna Recipe - The Stay At Home Chef",
    "Article Summary": "The Most Amazing Lasagna Recipe is the best recipe for homemade Italian-style lasagna. The balance ... This recipe is so good—it makes the kind of lasagna people write home about! ... Hands down absolutely the best lasagna recipe ever!",
    "Link": "https://thestayathomechef.com/amazing-lasagna-recipe/"
  },
  {
    "Heading": "Best Lasagna - Cafe Delites",
    "Article Summary": "My mama's famous lasagna recipe is hands down the best lasagna I have ever had in my life. She learnt her ways from her Italian friends when she lived in New ...",
    "Link": "https://cafedelites.com/best-lasagna/"
  },
  {
    "Heading": "The Best Lasagna Recipe EVER | Fail Proof Recipe | Lauren's ...",
    "Article Summary": "Start with a bit of meat sauce in the bottom of a large casserole dish or a plain 9×13 and line the bottom with pasta. Top with the cheese mixture and meat sauce.",
    "Link": "https://laurenslatest.com/best-lasagna-recipe/"
  },
  {
    "Heading": "Best Lasagna Recipe: How to Make It | Taste of Home",
    "Article Summary": "Want to know how to make lasagna for a casual holiday meal? You can't go wrong with this deliciously rich meat lasagna recipe. ... I made this lasagna for my fiance, he said this lasagna was the best he ever tasted, I will never buy frozen ...",
    "Link": "https://www.tasteofhome.com/recipes/best-lasagna/"
  },
  {
    "Heading": "Easy Homemade Lasagna {Classic Dinner!} - Spend With ...",
    "Article Summary": "May 19, 2020 — noodles – sauce (bake) – cheese. Spread about a cup of meat sauce into a 9×13 pan. Add a layer of noodles. Top the noodles with some of ...",
    "Link": "https://www.spendwithpennies.com/easy-homemade-lasagna/"
  },
  {
    "Heading": "The Best Lasagna Recipe {Simple & Classic} - Simply Recipes",
    "Article Summary": "Feb 19, 2019 — Ingredients · 1 pound lean ground beef (chuck); 1/2 onion, diced (about 3/4 cup); 1/2 large bell pepper (green, red, or yellow), diced (about 3/4 cup) ...",
    "Link": "https://www.simplyrecipes.com/recipes/lasagna/"
  },
  {
    "Heading": "Best Lasagna Recipe - How to Make Lasagna From Scratch",
    "Article Summary": "Dec 15, 2020 — The Best Lasagna. Ever. · Bring a large pot of water to a boil. · Meanwhile, in a large skillet or saucepan, combine ground beef, sausage, and garlic ...",
    "Link": "https://www.thepioneerwoman.com/food-cooking/recipes/a11728/best-lasagna-recipe/"
  }
]
Alternatively, you can use Google Search Engine Results API from SerpApi.
Part of JSON:
"organic_results": [
{
"position": 1,
"title": "World's Best Lasagna Recipe - Allrecipes.com",
"link": "https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/",
"displayed_link": "https://www.allrecipes.com › ... › European › Italian",
}
]
Code to integrate:
import os
from serpapi import GoogleSearch

params = {
    "engine": "google",
    "q": "best lasagna recipe ever",
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)
results = search.get_dict()

for result in results["organic_results"]:
    print(f"Title: {result['title']}\nLink: {result['link']}")
Output:
Title: World's Best Lasagna Recipe - Allrecipes.com
Link: https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/
Title: Best Lasagna - Cafe Delites
Link: https://cafedelites.com/best-lasagna/
Title: The Most Amazing Lasagna Recipe - The Stay At Home Chef
Link: https://thestayathomechef.com/amazing-lasagna-recipe/
Title: The BEST Lasagna Recipe Ever! | The Recipe Critic
Link: https://therecipecritic.com/lasagna-recipe/
Title: The Best Lasagna Recipe EVER | Fail Proof Recipe | Lauren's ...
Link: https://laurenslatest.com/best-lasagna-recipe/
Title: Best Lasagna Recipe - How to Make Lasagna From Scratch
Link: https://www.thepioneerwoman.com/food-cooking/recipes/a11728/best-lasagna-recipe/
Title: Best Lasagna Recipe: How to Make It | Taste of Home
Link: https://www.tasteofhome.com/recipes/best-lasagna/
Title: Easy Homemade Lasagna {Classic Dinner!} - Spend With ...
Link: https://www.spendwithpennies.com/easy-homemade-lasagna/
Title: The Best Lasagna Recipe {Simple & Classic} - Simply Recipes
Link: https://www.simplyrecipes.com/recipes/lasagna/
Disclaimer: I work for SerpApi.

Related

Pass text to a Python script and return the result using R JSON

I have a string in R that I would like to pass to Python in order to compute something and return the result back into R.
I have the following, which "works" but not the way I would like.
The code below passes a string from R to a Python file, uses OpenAI to generate the text, and then loads it back into R.
library(reticulate)

computePythonFunction <- "
def print_openai_response():
    import openai
    openai.api_key = 'ej-powurjf___OpenAI_API_KEY___HGAJjswe'  # you will need an API key
    prompt = 'write me a poem about the sea'
    response = openai.Completion.create(engine = 'text-davinci-003', prompt = prompt, max_tokens=1000)
    #response['choices'][0]['text']
    print(response)
"

py_run_string(computePythonFunction)
py$print_openai_response()

library("rjson")
fromJSON(as.character(py$print_openai_response()))
I would like to store the results in R objects. Here is one output from the Python script:
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n\nThe sea glitters like stars in the night \nWith hues, vibrant and bright\nThe waves flow gentle, serene, and divine \nLike the sun's most gentle shine\n\nAs the sea reaches, so wide, so vast \nAn adventure awaits, and a pleasure, not passed\nWhite sands, with seaweed green \nForms a kingdom of the sea\n\nConnecting different land and tide \nThe sea churns, dancing with the sun's pride\nAs a tempest forms, raging and wild \nThe sea turns, its colors so mild\n\nA raging storm, so wild and deep \nProtecting the creatures that no one can see \nThe sea is a living breathing soul \nA true and untouchable goal \n\nThe sea is a beauty that no one can describe \nAnd it's power, no one can deny \nAn ever-lasting bond, timeless and free \nThe love of the sea, is a love, to keep"
    }
  ],
  "created": 1670525403,
  "id": "cmpl-6LGG3hDNzeTZ5VFbkyjwfkHH7rDkE",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 210,
    "prompt_tokens": 7,
    "total_tokens": 217
  }
}
I am interested in the generated text, but I am also interested in the completion_tokens, prompt_tokens and total_tokens.
I thought about saving the Python code as a script and then passing the argument to it, such as:
myPython.py arg1
How can I return the JSON output from the model to an R object? The only input which changes/varies in the python code is the prompt variable.
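One possible sketch (an assumption on my part, keeping the reticulate and pre-1.0 openai setup shown above): have the Python function return the fields you need as a plain dict instead of printing the whole response; reticulate converts a returned dict into a named R list, so no JSON parsing is needed on the R side. The function name and selection of fields below are illustrative.

import openai

def get_openai_response(prompt):
    # illustrative helper to be called from R via reticulate as py$get_openai_response(...)
    openai.api_key = 'YOUR_API_KEY'  # placeholder, supply your own key
    response = openai.Completion.create(engine='text-davinci-003', prompt=prompt, max_tokens=1000)
    # return only the pieces the question asks about;
    # reticulate turns this dict into a named R list
    return {
        'text': response['choices'][0]['text'],
        'completion_tokens': response['usage']['completion_tokens'],
        'prompt_tokens': response['usage']['prompt_tokens'],
        'total_tokens': response['usage']['total_tokens'],
    }

In R you would then call, for example, result <- py$get_openai_response('write me a poem about the sea'), after which result$text and result$total_tokens are ordinary R values.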

I am trying to scrape multiple pages using Beautiful Soup but the code keeps returning the same data for every page

I am trying to scrape the special offers on the Steam website using Python and Beautiful Soup. I am trying to scrape data from multiple pages using a for loop. I have attached the Python code below. Any help is really appreciated. Thanks in advance.
game_lis = set([])

for page in range(0, 4):
    page_url = "https://store.steampowered.com/specials#p=" + str(page) + "&tab=TopSellers"
    #print(page_url)
    steam_games = requests.get(page_url)
    soup = BeautifulSoup(steam_games.text, 'lxml')
    s_game_offers = soup.findAll('a', class_='tab_item')
    print(page_url)
    for game in s_game_offers:
        title = game.find('div', class_='tab_item_name')
        discount = game.find('div', class_='discount_pct')
        game_lis.add(title.text)
        print(title.text + ":" + discount.text)
The page content is loaded from a different URL via JavaScript, so BeautifulSoup doesn't see it. You can use the following example to load the different pages:
import requests
from bs4 import BeautifulSoup

api_url = "https://store.steampowered.com/contenthub/querypaginated/specials/TopSellers/render/"

params = {
    "query": "",
    "start": "0",
    "count": "15",
    "cc": "SK",  # <-- probably change country code here
    "l": "english",
    "v": "4",
    "tag": "",
}

for page in range(0, 4):
    params["start"] = 15 * page
    steam_games = requests.get(api_url, params=params)
    soup = BeautifulSoup(steam_games.json()["results_html"], "lxml")
    s_game_offers = soup.findAll("a", class_="tab_item")
    for game in s_game_offers:
        title = game.find("div", class_="tab_item_name")
        discount = game.find("div", class_="discount_pct")
        print(title.text + ":" + discount.text)
    print("-" * 80)
Prints:
F.I.S.T.: Forged In Shadow Torch:-10%
HITMAN 2 - Gold Edition:-85%
NieR:Automata™:-50%
Horizon Zero Dawn™ Complete Edition:-40%
Need for Speed™ Heat:-86%
Middle-earth: Shadow of War Definitive Edition:-80%
Batman: Arkham Collection:-80%
No Man's Sky:-50%
Legion TD 2 - Multiplayer Tower Defense:-20%
NieR Replicant™ ver.1.22474487139...:-35%
Days Gone:-20%
Mortal Kombat 11 Ultimate:-65%
Human: Fall Flat:-66%
Muse Dash - Just as planned:-30%
The Elder Scrolls Online - Blackwood:-50%
--------------------------------------------------------------------------------
The Elder Scrolls Online - Blackwood:-50%
Football Manager 2022:-10%
Age of Empires II: Definitive Edition:-33%
OCTOPATH TRAVELER™:-50%
DRAGON QUEST® XI S: Echoes of an Elusive Age™ - Definitive Edition:-35%
Witch It:-70%
Monster Hunter: World:-34%
NARUTO SHIPPUDEN: Ultimate Ninja STORM 4:-77%
MADNESS: Project Nexus:-10%
Mad Max:-75%
Outer Wilds:-40%
Middle-earth: Shadow of Mordor Game of the Year Edition:-75%
Age of Empires III: Definitive Edition:-33%
Ghostrunner:-60%
The Elder Scrolls® Online:-60%
--------------------------------------------------------------------------------
...

.findAll() not finding things consistently

I tried to make a little Beautiful Soup script to analyze prices on eBay. The problem is that my soup.findAll() call that should find the prices sometimes works and sometimes doesn't, and I am wondering why. Here is my code:
import requests
from bs4 import BeautifulSoup
from requests.models import encode_multipart_formdata
article = input("Product:")
keywords = article.strip().replace(" ", "+")
URL_s = "https://www.ebay.de/sch/i.html?_dmd=1&_fosrp=1&LH_SALE_CURRENCY=0&_sop=12&_ipg=50&LH_Complete=1&LH_Sold=1&_sadis=10&_from=R40&_sacat=0&_nkw=" + keywords + "&_dcat=139971&rt=nc&LH_ItemCondition=3"
source = requests.get(URL_s).text
soup = BeautifulSoup(source)
prices = soup.findAll('span', class_='bold bidsold')
# ^ this line sometimes finds the prices, sometimes it just produces an empty list ^
help would be very welcome, hope you are doing well, bye bye :)
If you look at the variable soup and open the result as an HTML page, you will see a verification page rather than the listings.
This means eBay has some sort of filtering mechanism to prevent scraping and requires you to somehow confirm your identity. This is why your query for prices returns an empty list.
Maybe the prices are rendered by JavaScript; requests does not wait for the JavaScript to be loaded.
That's why you should use other modules, such as Selenium or DryScrape.
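If you go the Selenium route, a minimal sketch (my own illustration, not code from this answer) could look like the following; a real browser executes the JavaScript before the same span.bold.bidsold elements from the question are read:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

keywords = "iphone"  # example search term, URL built the same way as in the question
URL_s = ("https://www.ebay.de/sch/i.html?_dmd=1&_fosrp=1&LH_SALE_CURRENCY=0&_sop=12"
         "&_ipg=50&LH_Complete=1&LH_Sold=1&_sadis=10&_from=R40&_sacat=0&_nkw=" + keywords +
         "&_dcat=139971&rt=nc&LH_ItemCondition=3")

driver = webdriver.Chrome()
driver.get(URL_s)

# wait up to 10 seconds for at least one price element to appear in the DOM
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "span.bold.bidsold"))
)

prices = [el.text for el in driver.find_elements(By.CSS_SELECTOR, "span.bold.bidsold")]
print(prices)
driver.quit()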
When using requests, the request may be blocked because the default user-agent in the requests library is python-requests. In order for the website to understand that this is not a bot or script, you need to pass your real User-Agent in the headers.
You can also read the "Reducing the chance of being blocked while web scraping" blog post to learn about other options for solving this problem.
If you want to collect the information from all pages, you can use a while loop that dynamically paginates through them.
The while loop runs until a stop condition is met; in our case, the termination condition is a check for the presence of the next-page button, for which the CSS selector ".pagination__next" is responsible.
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
}

query = input('Your query is: ')  # "shirt" for example

params = {
    '_nkw': query,    # search query
    '_pgn': 1,        # page number
    'LH_Sold': '1'    # shows sold items
}

data = []
page_limit = 10  # page limit (if you need one)

while True:
    page = requests.get('https://www.ebay.de/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for products in soup.select(".s-item__info"):
        title = products.select_one(".s-item__title span").text
        price = products.select_one(".s-item__price").text

        data.append({
            "title": title,
            "price": price
        })

    if params['_pgn'] == page_limit:
        break

    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
  {
    "title": "CECIL Pullover Damen Hoodie Sweatshirt Gr. L (DE 44) Baumwolle flieder #7902fa2",
    "price": "EUR 17,64"
  },
  {
    "title": "Shirt mit Schlangendruck & Strass \"cyclam\" Gr. 40 UVP: 49,99€ 5.65",
    "price": "EUR 6,50"
  },
  {
    "title": "Fender Guitars Herren T-Shirt von Difuzed - Größe Medium blau auf blau - Sehr guter Zustand",
    "price": "EUR 10,06"
  },
  other results ...
]
As an alternative, you can use Ebay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code with pagination:
from serpapi import EbaySearch
import os, json

query = input('Your query is: ')  # "shirt" for example

params = {
    "api_key": "...",           # serpapi key, https://serpapi.com/manage-api-key
    "engine": "ebay",           # search engine
    "ebay_domain": "ebay.com",  # ebay domain
    "_nkw": query,              # search query
    "_pgn": 1,                  # page number (needed so the pagination below can increment it)
    # "LH_Sold": "1"            # shows sold items
}

search = EbaySearch(params)  # where data extraction happens

page_num = 0
data = []

while True:
    results = search.get_dict()  # JSON -> Python dict

    if "error" in results:
        print(results["error"])
        break

    for organic_result in results.get("organic_results", []):
        title = organic_result.get("title")
        price = organic_result.get("price")

        data.append({
            "price": price,
            "title": title
        })

    page_num += 1
    print(page_num)

    if "next" in results.get("pagination", {}):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2))
Output:
[
  {
    "price": {
      "raw": "EUR 17,50",
      "extracted": 17.5
    },
    "title": "Mensch zweiter Klasse Gesund und ungeimpft T-Shirt"
  },
  {
    "price": {
      "raw": "EUR 14,90",
      "extracted": 14.9
    },
    "title": "Sprüche Shirt Lustige T-Shirts für Herren oder Unisex Kult Fun Gag Handwerker"
  },
  # ...
]
There's a 13 ways to scrape any public data from any website blog post if you want to know more about website scraping.

How to get Fortnite stats in Python

So I was trying to find something to code, and I decided to use Python to get Fortnite stats. I came across the fortnite_python library and it works, but it displays item codes for items in the shop when I want it to display the names. Does anyone know how to convert them, or just display the name in the first place? This is my code:
fortnite = Fortnite('c954ed23-756d-4843-8f99-cfe850d2ed0c')
store = fortnite.store()
fortnite.store()
It outputs something like this
[<StoreItem 12511>,
To print out the attributes of a Python object you can use __dict__ e.g.
from fortnite_python import Fortnite
from json import dumps
fortnite = Fortnite('Your API Key')
# ninjas_account_id = fortnite.player('ninja')
# print(f'ninjas_account: {ninjas_account_id}') # ninjas_account: 4735ce91-3292-4caf-8a5b-17789b40f79c
store = fortnite.store()
example_store_item = store[0]
print(dumps(example_store_item.__dict__, indent=2))
Output:
{
  "_data": {
    "imageUrl": "https://trackercdn.com/legacycdn/fortnite/237112511_large.png",
    "manifestId": 12511,
    "name": "Dragacorn",
    "rarity": "marvel",
    "storeCategory": "BRSpecialFeatured",
    "vBucks": 0
  },
  "id": 12511,
  "image_url": "https://trackercdn.com/legacycdn/fortnite/237112511_large.png",
  "name": "Dragacorn",
  "rarity": "marvel",
  "store_category": "BRSpecialFeatured",
  "v_bucks": 0
}
So it looks like you want to use name attribute of StoreItem:
for store_item in store:
    print(store_item.name)
Output:
Dragacorn
Hulk Smashers
Domino
Unstoppable Force
Scootin'
Captain America
Cable
Probability Dagger
Chimichanga!
Daywalker's Kata
Psi-blade
Snap
Psylocke
Psi-Rider
The Devil's Wings
Daredevil
Meaty Mallets
Silver Surfer
Dayflier
Silver Surfer's Surfboard
Ravenpool
Silver Surfer Pickaxe
Grand Salute
Cuddlepool
Blade
Daredevil's Billy Clubs
Mecha Team
Tricera Ops
Combo Cleaver
Mecha Team Leader
Dino
Triassic
Rex
Cap Kick
Skully
Gold Digger
Windmill Floss
Bold Stance
Jungle Scout
It seems that the library doesn't contain a function to get the names. Also, this is what the class of an item from the store looks like:
class StoreItem(Domain):
    """Object containing store items attributes"""
and that's it.

How to scrape all results from Google search results pages (Python/Selenium ChromeDriver)

I am working on a Python script using selenium chromedriver to scrape all google search results (link, header, text) off a specified number of results pages.
The code I have seems to only be scraping the first result from all pages after the first page.
I think this has something to do with how my for-loop is set up in the scrape function, but I have not been able to tweak it into working the way I'd like. Any suggestions on how to fix this, or a better approach, are appreciated.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# create instance of webdriver
driver = webdriver.Chrome()
url = 'https://www.google.com'
driver.get(url)

# set keyword
keyword = 'cars'
# we find the search bar using its name attribute value
searchBar = driver.find_element_by_name('q')
# first we send our keyword to the search bar followed by the enter key
searchBar.send_keys(keyword)
searchBar.send_keys('\n')

def scrape():
    pageInfo = []
    try:
        # wait for search results to be fetched
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "g"))
        )
    except Exception as e:
        print(e)
        driver.quit()
    # contains the search results
    searchResults = driver.find_elements_by_class_name('g')
    for result in searchResults:
        element = result.find_element_by_css_selector('a')
        link = element.get_attribute('href')
        header = result.find_element_by_css_selector('h3').text
        text = result.find_element_by_class_name('IsZvec').text
        pageInfo.append({
            'header': header, 'link': link, 'text': text
        })
        return pageInfo  # note: inside the for loop -- this is the bug the answer below points out
# Number of pages to scrape
numPages = 5
# All the scraped data
infoAll = []
# Scraped data from page 1
infoAll.extend(scrape())

for i in range(0, numPages - 1):
    nextButton = driver.find_element_by_link_text('Next')
    nextButton.click()
    infoAll.extend(scrape())

print(infoAll)
You have an indentation problem:
You should have return pageInfo outside the for loop, otherwise you're returning results after the first loop iteration:
for result in searchResults:
    element = result.find_element_by_css_selector('a')
    link = element.get_attribute('href')
    header = result.find_element_by_css_selector('h3').text
    text = result.find_element_by_class_name('IsZvec').text
    pageInfo.append({
        'header': header, 'link': link, 'text': text
    })
    return pageInfo
Like this:
for result in searchResults:
    element = result.find_element_by_css_selector('a')
    link = element.get_attribute('href')
    header = result.find_element_by_css_selector('h3').text
    text = result.find_element_by_class_name('IsZvec').text
    pageInfo.append({
        'header': header, 'link': link, 'text': text
    })
return pageInfo
I've run your code and got results:
[{'header': 'Cars (film) — Wikipédia', 'link': 'https://fr.wikipedia.org/wiki/Cars_(film)', 'text': "Cars : Quatre Roues, ou Les Bagnoles au Québec (Cars), est le septième long-métrage d'animation entièrement en images de synthèse des studios Pixar.\nPays d’origine : États-Unis\nDurée : 116 minutes\nSociétés de production : Pixar Animation Studios\nGenre : Animation\nCars 2 · Michel Fortin · Flash McQueen"}, {'header': 'Cars - Wikipedia, la enciclopedia libre', 'link': 'https://es.wikipedia.org/wiki/Cars', 'text': 'Cars es una película de animación por computadora de 2006, producida por Pixar Animation Studios y lanzada por Walt Disney Studios Motion Pictures.\nAño : 2006\nGénero : Animación; Aventuras; Comedia; Infa...\nHistoria : John Lasseter Joe Ranft Jorgen Klubi...\nProductora : Walt Disney Pictures; Pixar Animat...'}, {'header': '', 'link': 'https://fr.wikipedia.org/wiki/Flash_McQueen', 'text': ''}, {'header': '', 'link': 'https://www.allocine.fr/film/fichefilm-55774/secrets-tournage/', 'text': ''}, {'header': '', 'link': 'https://fr.wikipedia.org/wiki/Martin_(Cars)', 'text': ''},
Suggestions:
Use a timer to control your for loop, otherwise you could be banned by Google due to suspicious activity
Steps:
1.- Import sleep from time: from time import sleep
2.- On your last loop add a timer:
for i in range(0, numPages - 1):
    sleep(5)  # it'll wait 5 seconds on each iteration
    nextButton = driver.find_element_by_link_text('Next')
    nextButton.click()
    infoAll.extend(scrape())
Google Search can be parsed with the BeautifulSoup web scraping library without selenium, since the data is not loaded dynamically via JavaScript, and it will execute much faster than selenium because there's no need to render the page in a browser.
In order to get information from all pages, you can paginate using an infinite while loop. Try to avoid for i in range() pagination, as it is a hardcoded and therefore unreliable way of paginating: if the number of pages changes (from 5 to 20), the pagination breaks.
Since the while loop is infinite, you need to set conditions for exiting it. You can use two conditions:
the first exit condition is the presence of a button to switch to the next page (it is absent on the last page); the presence can be checked by its CSS selector (in our case ".d6cvqb a[id=pnnext]"):
# condition for exiting the loop in the absence of the next page button
if soup.select_one(".d6cvqb a[id=pnnext]"):
    params["start"] += 10
else:
    break
another solution would be to add a limit of pages available for scraping if there is no need to extract all the pages.
# condition for exiting the loop when the page limit is reached
if page_num == page_limit:
    break
When you request a site, it may decide that you are a bot; to prevent this, send headers that contain your user-agent in the request, and the site will assume that you are a user and display the information.
The next step could be to rotate the user-agent, for example, to switch between PC, mobile, and tablet, as well as between browsers, e.g. Chrome, Firefox, Safari, Edge and so on. The most reliable way is to use rotating proxies, user-agents, and a captcha solver.
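A minimal sketch of what rotating the user-agent could look like (the list of strings here is just an illustration, not part of the original answer):

import random
import requests

# small pool of desktop browser user-agents to rotate through (example strings)
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:107.0) Gecko/20100101 Firefox/107.0",
]

# pick a different user-agent for each request
headers = {"User-Agent": random.choice(user_agents)}
html = requests.get("https://www.google.com/search", params={"q": "cars"}, headers=headers, timeout=30)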
Check full code in the online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "cars",   # query example
    "hl": "en",    # language
    "gl": "uk",    # country of the search, UK -> United Kingdom
    "start": 0,    # results offset, 0 = first page
    # "num": 100   # parameter defines the maximum number of results to return
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

page_limit = 10  # page limit, for example
page_num = 0
data = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')

    for result in soup.select(".tF2Cxc"):
        title = result.select_one(".DKV0Md").text
        try:
            snippet = result.select_one(".lEBKkf span").text
        except:
            snippet = None
        links = result.select_one(".yuRUbf a")["href"]

        data.append({
            "title": title,
            "snippet": snippet,
            "links": links
        })

    # condition for exiting the loop when the page limit is reached
    if page_num == page_limit:
        break

    # condition for exiting the loop in the absence of the next page button
    if soup.select_one(".d6cvqb a[id=pnnext]"):
        params["start"] += 10
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
  {
    "title": "Cars (2006) - IMDb",
    "snippet": "On the way to the biggest race of his life, a hotshot rookie race car gets stranded in a rundown town, and learns that winning isn't everything in life.",
    "links": "https://www.imdb.com/title/tt0317219/"
  },
  {
    "title": "Cars (film) - Wikipedia",
    "snippet": "Cars is a 2006 American computer-animated sports comedy film produced by Pixar Animation Studios and released by Walt Disney Pictures. The film was directed ...",
    "links": "https://en.wikipedia.org/wiki/Cars_(film)"
  },
  {
    "title": "Cars - Rotten Tomatoes",
    "snippet": "Cars offers visual treats that more than compensate for its somewhat thinly written story, adding up to a satisfying diversion for younger viewers.",
    "links": "https://www.rottentomatoes.com/m/cars"
  },
  other results ...
]
Also you can use Google Search Engine Results API from SerpApi. It's a paid API with a free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import json, os

params = {
    "api_key": "...",    # serpapi key from https://serpapi.com/manage-api-key
    "engine": "google",  # serpapi parser engine
    "q": "cars",         # search query
    "gl": "uk",          # country of the search, UK -> United Kingdom
    "num": "100"         # number of results per page (100 per page in this case)
    # other search parameters: https://serpapi.com/search-api#api-parameters
}

search = GoogleSearch(params)  # where data extraction happens

page_limit = 10
organic_results_data = []
page_num = 0

while True:
    results = search.get_dict()  # JSON -> Python dictionary

    page_num += 1

    for result in results["organic_results"]:
        organic_results_data.append({
            "title": result.get("title"),
            "snippet": result.get("snippet"),
            "link": result.get("link")
        })

    if page_num == page_limit:
        break

    if "next_link" in results.get("serpapi_pagination", []):
        search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next_link")).query)))
    else:
        break

print(json.dumps(organic_results_data, indent=2, ensure_ascii=False))
Output:
[
  {
    "title": "Rally Cars - Page 30 - Google Books result",
    "snippet": "Some people say rally car drivers are the most skilled racers in the world. Roger Clark, a British rally legend of the 1970s, describes sliding his car down ...",
    "link": "https://books.google.co.uk/books?id=uIOlAgAAQBAJ&pg=PA30&lpg=PA30&dq=cars&source=bl&ots=9vDWFi0bHD&sig=ACfU3U1d4R-ShepjsTtWN-b9SDYkW1sTDQ&hl=en&sa=X&ved=2ahUKEwjPv9axu_b8AhX9LFkFHbBaB8c4yAEQ6AF6BAgcEAM"
  },
  {
    "title": "Independent Sports Cars - Page 5 - Google Books result",
    "snippet": "The big three American auto makers produced sports and sports-like cars beginning with GMs Corvette and Fords Thunderbird in 1954. Folowed by the Mustang, ...",
    "link": "https://books.google.co.uk/books?id=HolUDwAAQBAJ&pg=PA5&lpg=PA5&dq=cars&source=bl&ots=yDaDtQSyW1&sig=ACfU3U11nHeRTwLFORGMHHzWjaVHnbLK3Q&hl=en&sa=X&ved=2ahUKEwjPv9axu_b8AhX9LFkFHbBaB8c4yAEQ6AF6BAgaEAM"
  }
  other results...
]
