Web scraping Google search results with Python

I am trying to obtain both the title and link, in some form of list or dictionary, for the search results on the first page. However, my output shows Element 'a' href= before the link, and my title shows Element 'h3' class=('LC20lb', 'MBeuO', 'DKV0Md') instead of the actual title. I have read through many examples, but most rely on API subscriptions, which aren't preferable, and none of the BeautifulSoup methods I have found work either. This is the furthest I've got with this project so far. My code is below:
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

def get_source(url):
    """Return the source code for the provided URL.

    Args:
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html.
    """
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        print(e)
def scrape_google(query):
    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)

    links = list(response.html.absolute_links)
    google_domains = ('https://www.google.',
                      'https://google.',
                      'https://webcache.googleusercontent.',
                      'http://webcache.googleusercontent.',
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.')

    for url in links[:]:
        if url.startswith(google_domains):
            links.remove(url)

    return links
def get_results(query):
    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)
    return response

def parse_results(response):
    css_identifier_result = ".MjjYud"
    css_identifier_title = "h3.LC20lb.MBeuO.DKV0Md"
    css_identifier_link = ".yuRUbf a"

    results = response.html.find(css_identifier_result)

    output = []
    for result in results:
        item = {
            'title': result.find(css_identifier_title, first=True),
            'link': result.find(css_identifier_link, first=True)
        }
        output.append(item)

    return output

def google_search(query):
    response = get_results(query)
    return parse_results(response)

results = google_search("Elon Musk twitter")
results
Output:
[{'title': <Element 'h3' class=('LC20lb', 'MBeuO', 'DKV0Md')>,
'link': <Element 'a' href='https://www.theguardian.com/technology/2022/oct/30/twitter-trolls-bombard-platform-after-elon-musk-takeover' data-jsarwt='1' data-usg='AOvVaw3lmUr0p6yL70Nhr1Y5jurH' data-ved='2ahUKEwjA9-TFw4j7AhX3QzABHVyyBwcQFnoECBgQAQ'>}]

Your dictionary is storing the requests_html Element objects themselves, which is why their reprs are printed; take .text from the title element and .attrs['href'] from the link element instead. Alternatively, you can use the BeautifulSoup library to parse the response.
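For example, a minimal sketch of parse_results rewritten with BeautifulSoup, assuming the same CSS classes as in your question (Google changes these class names frequently):

from bs4 import BeautifulSoup

def parse_results(response):
    # Parse the raw HTML of the requests_html response with BeautifulSoup.
    soup = BeautifulSoup(response.text, "html.parser")
    output = []
    for result in soup.select(".MjjYud"):
        title = result.select_one("h3.LC20lb.MBeuO.DKV0Md")
        link = result.select_one(".yuRUbf a")
        if title and link:
            # Store the plain text and the href attribute, not the elements.
            output.append({"title": title.get_text(), "link": link["href"]})
    return output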

Related

Python web scraping script does not find element by css selector

I'm trying to get this web scraper to fetch the current electricity price from this website. The site is in Finnish, but the price is right under "Hinta nyt" ("price now"): https://sahko.tk/
Here's my code:
import requests
from bs4 import BeautifulSoup

url = "https://sahko.tk/"
element_selector = ""

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
elements = soup.find_all(element_selector)

if len(elements) == 0:
    print("No element found with selector '%s'" % element_selector)
else:
    element_text = elements[0].text
    print(element_text)
I left element_selector empty because whatever I tried just did not work. I'm not even sure if I'm on the right track.
The data you see is embedded inside a <script> tag in that page. To parse the current price you can use the following example:
import re
import json
import requests
url = "https://sahko.tk/"
data = requests.get(url).text
data = re.search(r"function prices_today\(\)\{var t= (.*?});", data).group(1)
data = json.loads(data)
print("Hinta nyt", data["now"], "snt/kWh")
Prints:
Hinta nyt 33.27 snt/kWh
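Since the page markup can change, a slightly more defensive variant of the same idea checks that the pattern matched before decoding the JSON:

import re
import json
import requests

data = requests.get("https://sahko.tk/").text
m = re.search(r"function prices_today\(\)\{var t= (.*?});", data)
if m is None:
    # The inline prices_today() payload was not found; the page layout
    # has probably changed and the pattern needs updating.
    raise RuntimeError("price data not found in page source")

prices = json.loads(m.group(1))
print("Hinta nyt", prices["now"], "snt/kWh")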

How to download images from a website without the 'img' tag?

Recently I've been trying to learn how to web scrape in order to download all the images from my school directory. However, the site does not store the images under an img tag; instead they ALL appear as: background-image: url("/common/pages/GalleryPhoto.aspx?photoId=323070&width=180&height=180");
Is there any way to bypass this?
Here is my current code, which downloads images from a targeted website:
import os, requests, bs4, webbrowser, random

url = 'https://jhs.lsc.k12.in.us/staff_directory'
res = requests.get(url)
try:
    res.raise_for_status()
except Exception as exc:
    print('Sorry an error occurred:', exc)

soup = bs4.BeautifulSoup(res.text, 'html.parser')
element = soup.select('background-image')

for i in range(len(element)):
    url = element[i].get('img')
    name = random.randrange(1, 25)
    file = open(str(name) + '.jpg', 'wb')
    res = requests.get(url)
    for chunk in res.iter_content(10000):
        file.write(chunk)
    file.close()
print('done')
You can use the internal API this site is using to get the data, including the image URL. It first gets the list of staff groups using the /Settings endpoint, then calls the /Search API with all the group IDs.
The flow is the following:
1. Get the portletInstanceId value from a div tag with the attribute data-portlet-instance-id.
2. Call the Settings API to get the group IDs:
POST https://jhs.lsc.k12.in.us/Common/controls/StaffDirectory/ws/StaffDirectoryWS.asmx/Settings
3. Call the Search API with pagination parameters; you can choose how many people to request and how many per page:
POST https://jhs.lsc.k12.in.us/Common/controls/StaffDirectory/ws/StaffDirectoryWS.asmx/Search
The following script gets the first 20 people and puts the result in a pandas DataFrame:
import requests
from bs4 import BeautifulSoup
import pandas as pd

r = requests.get("https://jhs.lsc.k12.in.us/staff_directory")
soup = BeautifulSoup(r.content, "lxml")
portletInstanceId = soup.select('div[data-portlet-instance-id].staffDirectoryComponent')[0]["data-portlet-instance-id"]

r = requests.post("https://jhs.lsc.k12.in.us/Common/controls/StaffDirectory/ws/StaffDirectoryWS.asmx/Settings",
                  json={"portletInstanceId": portletInstanceId})
groupIds = [t["groupID"] for t in r.json()["d"]["groups"]]
print(groupIds)

payload = {
    "firstRecord": 0,
    "groupIds": groupIds,
    "lastRecord": 20,
    "portletInstanceId": portletInstanceId,
    "searchByJobTitle": True,
    "searchTerm": "",
    "sortOrder": "LastName,FirstName ASC"
}
r = requests.post("https://jhs.lsc.k12.in.us/Common/controls/StaffDirectory/ws/StaffDirectoryWS.asmx/Search",
                  json=payload)
results = r.json()["d"]["results"]

# add image url based on userID
for t in results:
    t["imageURL"] = f'https://jhs.lsc.k12.in.us/{t["imageURL"]}' if t["imageURL"] else ''

df = pd.DataFrame(results)

# whole data
print(df)

# only image url
with pd.option_context('display.max_colwidth', 400):
    print(df["imageURL"])
You need to update the firstRecord and lastRecord fields accordingly to page through more results.
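For example, a hypothetical pagination loop reusing the payload from the script above, assuming the API returns an empty results list once firstRecord is past the last record:

page_size = 20
first = 0
all_results = []
while True:
    # Request the next window of records from the Search endpoint.
    payload["firstRecord"] = first
    payload["lastRecord"] = first + page_size
    r = requests.post("https://jhs.lsc.k12.in.us/Common/controls/StaffDirectory/ws/StaffDirectoryWS.asmx/Search",
                      json=payload)
    page = r.json()["d"]["results"]
    if not page:
        break
    all_results.extend(page)
    first += page_size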

Web scraping: not able to scrape text and href for a given div, class

Trying to get the text and href for the top news, but not able to parse it.
Website: https://www.gujaratsamachar.com/
import requests
from bs4 import BeautifulSoup

def checkResponse(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        return None

def getTitleURL():
    url = 'https://www.gujaratsamachar.com/'
    response = checkResponse(url)
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        for values in html.find_all('div', class_='main-news'):
            print(values.a.href)

if __name__ == '__main__':
    print('Getting the list of names....')
    names = getTitleURL()
    print('... done.\n')
The output is empty.
(The original question included screenshots highlighting the headline block to scrape and the corresponding HTML elements.)
The headlines are available from the site's internal JSON API, which returns each story's heading, category slug and article URL, so you can build the links directly:

import requests

data = ["heading", "categorySlug", "articleUrl"]

def main(url):
    r = requests.get(url).json()
    for item in r['data']:
        goal = [item[d] for d in data]
        # The first 31 characters of the API URL are the site root.
        print(goal[0], f"{url[:31]}/news/{'/'.join(goal[1:])}")

main("https://www.gujaratsamachar.com/api/stories/5993f2835b03ab694185ad25?type=top-stories")

Script fails to generate results

I've written a script in Python to scrape the result populated upon filling in two input boxes, zipcode and distance, with 66109 and 10000. When I try the inputs manually, the site does display results, but when I try the same using the script I get nothing. The script throws no error either. What might be the issue here?
Website: https://www.sart.org/clinic-pages/find-a-clinic/
I've tried with:
import requests
from bs4 import BeautifulSoup

url = 'https://www.sart.org/clinic-pages/find-a-clinic/'

payload = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}

def get_clinics(link):
    session = requests.Session()
    response = session.post(link, data=payload, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, "lxml")
    item = soup.select_one(".clinics__search-meta").text
    print(item)

if __name__ == '__main__':
    get_clinics(url)
I'm only after the line "Within 10000 miles of 66109 there are 383 clinics." that is generated when the search is made.
I changed the URL and the request method to GET, and it worked for me:
def get_clinics(link):
    session = requests.Session()
    response = session.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, "lxml")
    item = soup.select_one(".clinics__search-meta").text
    print(item)

url = 'https://www.sart.org/clinic-pages/find-a-clinic?zip=66109&strdistance=10000&SelectedState=Select+State+or+Region'
get_clinics(url)
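Equivalently, you can keep the base URL and let requests build the query string; a sketch using the params argument with the same fields as the original payload:

import requests

params = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}
# requests URL-encodes the dict into ?zip=...&strdistance=...&SelectedState=...
response = requests.get('https://www.sart.org/clinic-pages/find-a-clinic',
                        params=params, headers={"User-Agent": "Mozilla/5.0"})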
Including cookies is one of the main concerns here. If you do it the right way, you can get a valid response following the approach you started with. Here is the working code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.sart.org/clinic-pages/find-a-clinic/'

payload = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}

def get_clinics(link):
    with requests.Session() as s:
        res = s.get(link)
        req = s.post(link, data=payload, cookies=res.cookies.get_dict())
        soup = BeautifulSoup(req.text, "lxml")
        item = soup.select_one(".clinics__search-meta").get_text(strip=True)
        print(item)

if __name__ == '__main__':
    get_clinics(url)

Python - parsing html response

In my code, a user inputs a search term and get_all_links parses the HTML response and extracts the links that start with 'http'. When req is replaced with a hard-coded URL such as:
content = urllib.request.urlopen("http://www.ox.ac.uk")
the program returns a list of properly formatted links. However, when passing in req, no links are returned. I suspect this may be a formatting blip.
Here is my code:
import urllib.request

def get_all_links(s):  # function to get all the links
    d = 0
    links = []  # collect all links into a list
    while d != -1:  # until d is -1, i.e. no more links in the page
        d = s.find('<a href=', d)  # look for the next <a href
        start = s.find('"', d)  # start will be the next character
        end = s.find('"', start + 1)  # end will be up to the closing "
        if d != -1:  # d is not -1
            d += 1
            if s[start + 1] == 'h':  # add only links which start with http
                links.append(s[start + 1:end])  # to the link list
    return links  # return the list
def main():
    term = input('Enter a search term: ')
    url = 'http://www.google.com/search'
    value = {'q': term}
    user_agent = 'Mozilla/5.0'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(value)
    print(data)
    url = url + '?' + data
    print(url)
    req = urllib.request.Request(url, None, headers)
    content = urllib.request.urlopen(req)
    s = content.read()
    print(s)
    links = get_all_links(s.decode('utf-8'))
    for i in links:  # print the returned list
        print(i)

main()
You should use an HTML parser, as suggested in the comments. A library like BeautifulSoup is perfect for this.
I have adapted your code to use BeautifulSoup:
import urllib.request
from bs4 import BeautifulSoup

def get_all_links(s):
    soup = BeautifulSoup(s, "html.parser")
    return soup.select("a[href^=\"http\"]")  # select all anchor tags whose href attribute starts with 'http'

def main():
    term = input('Enter a search term: ')
    url = 'http://www.google.com/search'
    value = {'q': term}
    user_agent = 'Mozilla/5.0'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(value)
    print(data)
    url = url + '?' + data
    print(url)
    req = urllib.request.Request(url, None, headers)
    content = urllib.request.urlopen(req)
    s = content.read()
    print(s)
    links = get_all_links(s.decode('utf-8'))
    for i in links:  # print the returned list
        print(i)

main()
It uses the select method of the BeautifulSoup library and returns a list of the selected elements (in your case, anchor tags).
Using a library like BeautifulSoup not only makes it easier, it also allows much more complex selections. Imagine how you would have to change your code if you wanted to select all links whose href attribute contains the word "google" or "code".
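For instance, with the soup object from the adapted code, BeautifulSoup's CSS attribute selectors handle that in one line each (the *= operator matches a substring):

# Anchor tags whose href contains "google" or "code" respectively
google_links = soup.select('a[href*="google"]')
code_links = soup.select('a[href*="code"]')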
You can read more in the BeautifulSoup documentation.
