I use Python to download images from a website, and sometimes the image's Content-Length is zero even though the image can be opened normally in a web browser.
I have tried three methods and get the same result, so how can I resolve this problem?
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 13:51:42 2017
"""
import urllib
import urllib2
import re
import uuid
import os
import requests
from lxml import etree
from multiprocessing import Pool
url = 'https://www.sina.com.cn/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
request = urllib2.Request(url)
request.add_header('User-Agent', user_agent)
response = urllib2.urlopen(request)
content = response.read()
tree = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
node = tree.xpath("//img/@src")  # attribute axis is @src
dic1={}
dic2={}
localPath='E:\\pictures\\'
def generateFileName():
    return str(uuid.uuid1())

def createFileWithFileName(localPathParam, fileName):
    totalPath = localPathParam + '\\' + fileName
    if not os.path.exists(totalPath):
        file = open(totalPath, 'wb')
        file.close()
    return totalPath
def worker(i):
    path = node[i]
    if not dic1.has_key(path):
        dic1[path] = 1
        index = path.rfind('/')
        suffix = path[index+1:]
        filename = suffix
        #filename = generateFileName()+'.'+suffix
        if re.search(r'^(https?:)?\/\/', path):
            #print('save picture %s as %s' % (path, filename))
            '''
            #this code gets the same result too
            try:
                urllib.urlretrieve(path, createFileWithFileName(localPath, filename))
            except Exception, ex:
                print(ex.message)
            '''
            with open(localPath + filename, 'wb') as handle:
                response = requests.get(path, timeout=60)
                if not response.ok:
                    print 'wrong when get ' + path
                    print response
                else:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            '''
            #this code gets the same result too
            try:
                req = urllib2.Request(path)
                req.add_header('User-Agent', user_agent)
                picture = urllib2.urlopen(url=path, timeout=5).read()
                document = open(localPath + filename, 'wb')
                document.write(picture)
                document.close()
            except Exception, ex:
                print(ex.message)
            '''
if __name__ == '__main__':
    p = Pool()
    for i in range(len(node)):
        p.apply_async(worker, args=(i,))
    print 'Waiting for all subprocesses done...'
    p.close()
    p.join()
    print 'All subprocesses done.'
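One thing worth ruling out, which the code above does not do, is the server rejecting requests that lack browser-like headers: some image hosts return an empty body (and therefore Content-Length: 0) unless a User-Agent and Referer are sent, and protocol-relative src values (which the regex ^(https?:)?\/\/ in the code already anticipates) must be given a scheme before requests can fetch them at all. A minimal sketch along those lines; the Referer value and the helper name are assumptions, not part of the original code:

import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    # Assumption: send the page the <img> came from; some hosts check the Referer.
    'Referer': 'https://www.sina.com.cn/',
}

def download(img_url, dest_path):
    # Give protocol-relative URLs ("//host/pic.jpg") a scheme first.
    if img_url.startswith('//'):
        img_url = 'https:' + img_url
    resp = requests.get(img_url, headers=HEADERS, timeout=60)
    resp.raise_for_status()
    if not resp.content:
        print('empty body for %s (Content-Length: %s)'
              % (img_url, resp.headers.get('Content-Length')))
        return
    with open(dest_path, 'wb') as f:
        f.write(resp.content)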
Related
I have a list of URLs in file.txt, all of which are online without problems.
I expect to get response 200 for all of them, but only about 20 percent of them return 200.
How can I get this right?
I tried dividing the work into several threads to get the responses in less time, and I set a header on each GET request.
import threading
import requests
import eventlet
def foo():
    data = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3'}
    with open('c.txt', 'r') as f:
        line = f.readlines()
    for l in line[0:50]:
        with eventlet.Timeout(2):
            try:
                r = requests.get(l, headers=data, timeout=2)
                print(r, t.getName())
            except:
                pass

if __name__ == "__main__":
    t = threading.Thread(target=foo, name="Siyamak")
    t.start()

def bar():
    data = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3'}
    with open('c.txt', 'r') as f:
        line = f.readlines()
    for l in line[50:100]:
        with eventlet.Timeout(2):
            try:
                r = requests.get(l, headers=data, timeout=2)
                print(r, t2.getName())
            except:
                pass

if __name__ == "__main__":
    t2 = threading.Thread(target=bar, name="diyana")
    t2.start()
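One thing to check in the code above is whether the 2-second timeout is simply too tight for many of the URLs, because every timeout is silently swallowed by the bare except. A sketch of the same job with concurrent.futures, so the list does not have to be split into hard-coded functions; c.txt and the iPhone User-Agent are taken from the code above, while the 10-second timeout is an assumed value to adjust:

import concurrent.futures
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) '
                         'AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 '
                         'Mobile/9B179 Safari/7534.48.3'}

def check(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        return url, r.status_code
    except requests.RequestException as exc:
        return url, exc

with open('c.txt') as f:
    # readlines() keeps the trailing newline, so strip each URL.
    urls = [line.strip() for line in f if line.strip()]

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    for url, result in executor.map(check, urls):
        print(url, result)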
I wrote a script that parses all the domains from a file. After launch everything works as it should, but when only a few domains are left at the end it gets stuck, and sometimes it takes a very long time to finish the last couple of domains. I can't figure out what the problem is. Has anyone run into this? How can I fix it?
Right after launch everything runs very quickly (as it should) until near the end, where it stalls with a few domains remaining. It makes no difference whether there are 1,000 domains or 10,000.
Complete code:
import re
import sys
import json
import requests
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
pool = 100
with open("Rules.json") as file:
REGEX = json.loads(file.read())
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'}
def Domain_checker(domain):
try:
r = requests.get("http://" + domain, verify=False, headers=ua)
r.encoding = "utf-8"
for company in REGEX.keys():
for type in REGEX[company]:
check_entry = 0
for ph_regex in REGEX[company][type]:
if bool(re.search(ph_regex, r.text)) is True:
check_entry += 1
if check_entry == len(REGEX[company][type]):
title = BeautifulSoup(r.text, "lxml")
Found_domain = "\nCompany: {0}\nRule: {1}\nURL: {2}\nTitle: {3}\n".format(company, type, r.url, title.title.text)
print(Found_domain)
with open("/tmp/__FOUND_DOMAINS__.txt", "a", encoding='utf-8', errors = 'ignore') as file:
file.write(Found_domain)
except requests.exceptions.ConnectionError:
pass
except requests.exceptions.TooManyRedirects:
pass
except requests.exceptions.InvalidSchema:
pass
except requests.exceptions.InvalidURL:
pass
except UnicodeError:
pass
except requests.exceptions.ChunkedEncodingError:
pass
except requests.exceptions.ContentDecodingError:
pass
except AttributeError:
pass
except ValueError:
pass
return domain
if __name__ == '__main__':
    with open(sys.argv[1], "r", encoding='utf-8', errors='ignore') as file:
        Domains = file.read().split()

    pool = 100
    print("Pool = ", pool)
    results = ThreadPool(pool).imap_unordered(Domain_checker, Domains)

    string_num = 0
    for result in results:
        print("{0} => {1}".format(string_num, result))
        string_num += 1

    with open("/tmp/__FOUND_DOMAINS__.txt", encoding='utf-8', errors='ignore') as found_domains:
        found_domains = found_domains.read()

    print("{0}\n{1}".format("#" * 40, found_domains))
requests.get("http://" + domain, headers=ua, verify=False, timeout=10)
The problem is resolved after installing timeout
Thank you to the user with the nickname "eri" (https://ru.stackoverflow.com/users/16574/eri) :)
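For reference (not part of the original answer), requests also accepts a (connect, read) timeout tuple, which separates a slow connection from a server that never finishes sending data; a small sketch of the same call in that form, with 5 and 10 seconds as assumed values:

# Assumed values: 5 s to establish the connection, 10 s to wait for data once connected.
r = requests.get("http://" + domain, headers=ua, verify=False, timeout=(5, 10))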
I need the counter variable (list_counter) inside my scraper function to increment on each iteration through list1.
The problem is that each individual process gets its own copy of the counter.
I want each process to increment a single global list_counter at the end of the loop, not for each process to keep its own counter.
I tried passing the variable as an argument but couldn't get it to work that way either.
What do you think? Is it even possible to have a global counter work with multiple processes, specifically using Pool, map, and Lock?
from multiprocessing import Lock, Pool
from time import sleep
from bs4 import BeautifulSoup
import re
import requests

exceptions = []
lock = Lock()
list_counter = 0

def scraper(url):  # url is tied to the individual list items
    """
    Testing multiprocessing and requests
    """
    global list_counter
    lock.acquire()
    try:
        scrape = requests.get(url,
                              headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"},
                              timeout=10)
        if scrape.status_code == 200:
            # ---------------------------------------------------
            # --> SCRAPE ALEXA RANK: <--
            # ---------------------------------------------------
            sleep(0.1)
            scrape = requests.get("http://data.alexa.com/data?cli=10&dat=s&url=" + url,
                                  headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"})
            html = scrape.content
            soup = BeautifulSoup(html, 'lxml')
            rank = re.findall(r'<popularity[^>]*text="(\d+)"', str(soup))
            print("Server Status:", scrape.status_code, '-', u"\u2713", '-', list_counter, '-', url, '-', "Rank:", rank[0])
            list_counter = list_counter + 1
        else:
            print("Server Status:", scrape.status_code)
            list_counter = list_counter + 1
            print(list_counter)
    except BaseException as e:
        exceptions.append(e)
        print()
        print(e)
        print()
        list_counter = list_counter + 1
        print(list_counter)
    finally:
        lock.release()
if __name__ == '__main__':
    list1 = ["http://www.wallstreetinvestorplace.com/2018/04/cvs-health-corporation-cvs-to-touch-7-54-earnings-growth-for-next-year/",
"https://macondaily.com/2018/04/06/cetera-advisors-llc-lowers-position-in-cvs-health-cvs.html",
"http://www.thesportsbank.net/football/liverpool/jurgen-klopp-very-positive-about-mo-salah-injury/",
"https://www.moneyjournals.com/trump-wasting-time-trying-bring-amazon/",
"https://www.pmnewsnigeria.com/2018/04/06/fcta-targets-800000-children-for-polio-immunisation/",
"http://toronto.citynews.ca/2018/04/06/officials-in-canada-braced-for-another-spike-in-illegal-border-crossings/",
"https://www.pmnewsnigeria.com/2018/04/04/pdp-describes-looters-list-as-plot-to-divert-attention/",
"https://beyondpesticides.org/dailynewsblog/2018/04/epa-administrator-pruitt-colluding-regulated-industry/",
"http://thyblackman.com/2018/04/06/robert-mueller-is-searching-for/",
"https://www.theroar.com.au/2018/04/06/2018-commonwealth-games-swimming-night-2-finals-live-updates-results-blog/",
"https://medicalresearch.com/pain-research/migraine-linked-to-increased-risk-of-heart-disease-and-stroke/40858/",
"http://www.investingbizz.com/2018/04/amazon-com-inc-amzn-stock-creates-investors-concerns/",
"https://stocknewstimes.com/2018/04/06/convergence-investment-partners-llc-grows-position-in-amazon-com-inc-amzn.html",
"https://factsherald.com/old-food-rules-needs-to-be-updated/",
"https://www.nextadvisor.com/blog/2018/04/06/the-facebook-scandal-evolves/",
"http://sacramento.cbslocal.com/2018/04/04/police-family-youtube-shooter/",
"http://en.brinkwire.com/245768/why-does-stress-lead-to-weight-gain-study-sheds-light/",
"https://www.marijuana.com/news/2018/04/monterey-bud-jeff-sessions-is-on-the-wrong-side-of-history-science-and-public-opinion/",
"http://www.stocksgallery.com/2018/04/06/jpmorgan-chase-co-jpm-noted-a-price-change-of-0-80-and-amazon-com-inc-amzn-closes-with-a-move-of-2-92/",
"https://stocknewstimes.com/2018/04/06/front-barnett-associates-llc-has-2-41-million-position-in-cvs-health-corp-cvs.html",
"http://www.liveinsurancenews.com/colorado-mental-health-insurance-bill-to-help-consumers-navigate-the-system/",
"http://newyork.cbslocal.com/2018/04/04/youtube-headquarters-shooting-suspect/",
"https://ledgergazette.com/2018/04/06/liberty-interactive-co-series-a-liberty-ventures-lvnta-shares-bought-by-brandywine-global-investment-management-llc.html",
"http://bangaloreweekly.com/2018-04-06-city-holding-co-invests-in-cvs-health-corporation-cvs-shares/",
"https://www.thenewsguru.com/didnt-know-lawyer-paid-prostitute-130000-donald-trump/",
"http://www.westlondonsport.com/chelsea/football-wls-conte-gives-two-main-reasons-chelseas-loss-tottenham",
"https://registrarjournal.com/2018/04/06/amazon-com-inc-amzn-shares-bought-by-lenox-wealth-management-inc.html",
"http://www.businessdayonline.com/1bn-eca-withdrawal-commence-action-president-buhari-pdp-tasks-nass/",
"http://www.thesportsbank.net/football/manchester-united/pep-guardiola-asks-for-his-fans-help-vs-united-in-manchester-derby/",
"https://www.pakistantoday.com.pk/2018/04/06/three-palestinians-martyred-as-new-clashes-erupt-along-gaza-border/",
"http://www.nasdaqfortune.com/2018/04/06/risky-factor-of-cvs-health-corporation-cvs-is-observed-at-1-03/",
"https://stocknewstimes.com/2018/04/06/cetera-advisor-networks-llc-decreases-position-in-cvs-health-cvs.html",
"http://nasdaqjournal.com/index.php/2018/04/06/planet-fitness-inc-nyseplnt-do-analysts-think-you-should-buy/",
"http://www.tv360nigeria.com/apc-to-hold-national-congress/",
"https://www.pmnewsnigeria.com/2018/04/03/apc-governors-keep-sealed-lips-after-meeting-with-buhari/",
"https://www.healththoroughfare.com/diet/healthy-lifestyle-best-foods-you-should-eat-for-weight-loss/7061",
"https://stocknewstimes.com/2018/04/05/amazon-com-inc-amzn-shares-bought-by-west-oak-capital-llc.html",
"http://www.current-movie-reviews.com/48428/dr-oz-could-you-be-a-victim-of-sexual-assault-while-on-vacation/",
"https://www.brecorder.com/2018/04/07/410124/world-health-day-to-be-observed-on-april-7/",
"http://www.coloradoindependent.com/169637/trump-pruitt-emissions-epa-pollution",
"https://thecrimereport.org/2018/04/05/will-sessions-new-justice-strategy-turn-the-clock-back-on-civil-rights/",
"http://en.brinkwire.com/245490/pasta-unlikely-to-cause-weight-gain-as-part-of-a-healthy-diet/"]
    p = Pool(15)  # worker process count
    p.map(scraper, list1)  # (function, iterable)
    p.terminate()
    p.join()
You can use concurrent.futures
import concurrent.futures
import urllib.request
from time import sleep
from bs4 import BeautifulSoup
import re
import requests
exceptions = []  # needed because scraper() appends to it

def scraper(url):
    list_counter = 0
    try:
        scrape = requests.get(url,
                              headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"},
                              timeout=10)
        if scrape.status_code == 200:
            sleep(0.1)
            scrape = requests.get("http://data.alexa.com/data?cli=10&dat=s&url=" + url,
                                  headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"})
            html = scrape.content
            soup = BeautifulSoup(html, 'lxml')
            rank = re.findall(r'<popularity[^>]*text="(\d+)"', str(soup))
            print("Server Status:", scrape.status_code, '-', u"\u2713", '-', list_counter, '-', url, '-', "Rank:", rank[0])
            list_counter = list_counter + 1
        else:
            print("Server Status:", scrape.status_code)
            list_counter = list_counter + 1
            print(list_counter)
    except BaseException as e:
        exceptions.append(e)
        print()
        print(e)
        print()
        list_counter = list_counter + 1
        print(list_counter)
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()
list1 = [...]  # copy the URL list from the question here (omitted to save space)
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    future_to_url = {executor.submit(load_url, url, 50): url for url in list1}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))

with concurrent.futures.ProcessPoolExecutor() as executor:
    for n, p in zip(list1, executor.map(scraper, list1)):
        print(n, p)
You will get output like this (only a few lines shown):
http://www.coloradoindependent.com/169637/trump-pruitt-emissions-epa-pollution None
Server Status: 200 - ✓ - 0 - https://thecrimereport.org/2018/04/05/will-sessions-new-justice-strategy-turn-the-clock-back-on-civil-rights/ - Rank: 381576
https://thecrimereport.org/2018/04/05/will-sessions-new-justice-strategy-turn-the-clock-back-on-civil-rights/ None
Server Status: 200 - ✓ - 0 - http://en.brinkwire.com/245490/pasta-unlikely-to-cause-weight-gain-as-part-of-a-healthy-diet/ - Rank: 152818
http://en.brinkwire.com/245490/pasta-unlikely-to-cause-weight-gain-as-part-of-a-healthy-diet/ None
Processes do not share memory with each other, but you can use the Manager from the multiprocessing module so that the processes manipulate the same object:
manager = multiprocessing.Manager()
list_counter = manager.list()
You will have to pass the list_counter to the scraper function.
Note the list created by the manager is thread/process safe.
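As a rough sketch of that idea (not the poster's exact code), the shared state can also be a Manager().Value protected by a Manager().Lock, both handed to the worker via functools.partial, since Pool workers only receive pickled arguments; the URLs and pool size below are placeholders:

import functools
from multiprocessing import Pool, Manager

def scraper(url, counter, lock):
    # ... do the request work here ...
    with lock:
        counter.value += 1          # safe: only one process updates at a time
        print(counter.value, url)

if __name__ == '__main__':
    manager = Manager()
    counter = manager.Value('i', 0)  # shared integer counter
    lock = manager.Lock()            # shared lock that can be passed to workers
    urls = ["https://example.com/a", "https://example.com/b"]  # placeholder list
    worker = functools.partial(scraper, counter=counter, lock=lock)
    with Pool(4) as pool:
        pool.map(worker, urls)
    print("total:", counter.value)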
I'm using Python to save images to a folder; see the code:
def save_image(raw_image, image_type, save_directory):
    extension = image_type if image_type else 'jpg'
    file_name = str(uuid.uuid4().hex) + "." + extension
    save_path = os.path.join(save_directory, file_name)
    with open(save_path, 'wb+') as image_file:
        image_file.write(raw_image)
I also want to save the same file names, line by line, in a text file. Can someone tell me how to do this?
Complete code:
#thanks https://gist.github.com/genekogan/ebd77196e4bf0705db51f86431099e57
import argparse
import json
import itertools
import logging
import re
import os
import uuid
import sys
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

def configure_logging():
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter('[%(asctime)s %(levelname)s %(module)s]: %(message)s'))
    logger.addHandler(handler)
    return logger

logger = configure_logging()

REQUEST_HEADER = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}

def get_soup(url, header):
    response = urlopen(Request(url, headers=header))
    return BeautifulSoup(response, 'html.parser')

def get_query_url(query):
    return "https://www.google.co.in/search?q=%s&tbm=isch&source=lnt&tbs=isz:l&sa=X&ved=0ahUKEwjpx6rng6faAhWIzlMKHSHBB8oQpwUIHg&biw=1366&bih=702&dpr=1" % query

def extract_images_from_soup(soup):
    image_elements = soup.find_all("div", {"class": "rg_meta"})
    metadata_dicts = (json.loads(e.text) for e in image_elements)
    link_type_records = ((d["ou"], d["ity"]) for d in metadata_dicts)
    return link_type_records

def extract_images(query, num_images):
    url = get_query_url(query)
    logger.info("Souping")
    soup = get_soup(url, REQUEST_HEADER)
    logger.info("Extracting image urls")
    link_type_records = extract_images_from_soup(soup)
    return itertools.islice(link_type_records, num_images)

def get_raw_image(url):
    req = Request(url, headers=REQUEST_HEADER)
    resp = urlopen(req)
    return resp.read()

def save_image(raw_image, image_type, save_directory):
    extension = image_type if image_type else 'jpg'
    file_name = str(uuid.uuid4().hex) + "." + extension
    save_path = os.path.join(save_directory, file_name)
    with open(save_path, 'wb+') as image_file:
        image_file.write(raw_image)

def download_images_to_dir(images, save_directory, num_images):
    for i, (url, image_type) in enumerate(images):
        try:
            logger.info("Making request (%d/%d): %s", i, num_images, url)
            raw_image = get_raw_image(url)
            save_image(raw_image, image_type, save_directory)
        except Exception as e:
            logger.exception(e)

def run(query, save_directory, num_images=100):
    query = '+'.join(query.split())
    logger.info("Extracting image links")
    images = extract_images(query, num_images)
    logger.info("Downloading images")
    download_images_to_dir(images, save_directory, num_images)
    logger.info("Finished")

def main():
    parser = argparse.ArgumentParser(description='Scrape Google images')
    parser.add_argument('-s', '--search', default='bananas', type=str, help='search term')
    parser.add_argument('-n', '--num_images', default=1, type=int, help='num images to save')
    parser.add_argument('-d', '--directory', default='C:/Users/user/desktop/potter/images', type=str, help='save directory')
    args = parser.parse_args()
    run(args.search, args.directory, args.num_images)

if __name__ == '__main__':
    main()
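To answer the question above about also recording each saved file name, one option is to append the name inside save_image and return it so callers can use it too. A minimal sketch, where names.txt is an assumed log-file name placed next to the images:

import os
import uuid

def save_image(raw_image, image_type, save_directory):
    extension = image_type if image_type else 'jpg'
    file_name = str(uuid.uuid4().hex) + "." + extension
    save_path = os.path.join(save_directory, file_name)
    with open(save_path, 'wb') as image_file:
        image_file.write(raw_image)
    # Assumption: keep a running log of saved names, one per line.
    with open(os.path.join(save_directory, 'names.txt'), 'a') as name_file:
        name_file.write(file_name + '\n')
    return file_name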
This is the code I am using to download images from a Google results page. It takes a long time evaluating and downloading the images, so I thought of using the BeautifulSoup library for faster evaluation and download. Here is the original code:
import time
import sys
import os
import urllib2

search_keyword = ['Australia']
keywords = [' high resolution']

def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        page = response.read()
        return page
    except:
        return "Page Not found"

def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line+1)
        end_content = s.find(',"ow"', start_content+1)
        content_raw = str(s[start_content+6:end_content-1])
        return content_raw, end_content

def _images_get_all_items(page):
    items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            items.append(item)
            time.sleep(0.1)
            page = page[end_content:]
    return items

t0 = time.time()
i = 0
while i < len(search_keyword):
    items = []
    iteration = "Item no.: " + str(i+1) + " -->" + " Item name = " + str(search_keyword[i])
    print (iteration)
    print ("Evaluating...")
    search_keywords = search_keyword[i]
    search = search_keywords.replace(' ', '%20')
    try:
        os.makedirs(search_keywords)
    except OSError, e:
        if e.errno != 17:
            raise
        pass
    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html = (download_page(url))
        time.sleep(0.1)
        items = items + (_images_get_all_items(raw_html))
        j = j + 1
    print ("Total Image Links = " + str(len(items)))
    print ("\n")

    info = open('output.txt', 'a')
    info.write(str(i) + ': ' + str(search_keyword[i-1]) + ": " + str(items) + "\n\n\n")
    info.close()

    t1 = time.time()
    total_time = t1 - t0
    print("Total time taken: " + str(total_time) + " Seconds")
    print ("Starting Download...")

    k = 0
    errorCount = 0
    while k < len(items):
        from urllib2 import Request, urlopen
        from urllib2 import URLError, HTTPError
        try:
            req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
            response = urlopen(req, None, 15)
            output_file = open(search_keywords + "/" + str(k+1) + ".jpg", 'wb')
            data = response.read()
            output_file.write(data)
            response.close()
            print("completed ====> " + str(k+1))
            k = k + 1
        except IOError:
            errorCount += 1
            print("IOError on image " + str(k+1))
            k = k + 1
        except HTTPError as e:
            errorCount += 1
            print("HTTPError " + str(k))
            k = k + 1
        except URLError as e:
            errorCount += 1
            print("URLError " + str(k))
            k = k + 1
    i = i + 1

print("\n")
print("Everything downloaded!")
print("\n" + str(errorCount) + " ----> total Errors")
I thought editing the code below would make it work with the BeautifulSoup library and finish my task faster:
def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers=headers)
        #response = urllib2.urlopen(req)
        #page = response.read()
        return BeautifulSoup(urlopen(Request(req)), 'html.parser')
    except:
        return "Page Not found"
But the above code returns blank. Kindly let me know what I can do to make the code work properly with BeautifulSoup.
You can't just pass Google headers like that. The search engine is a lot more complex than simply substituting some keywords into a GET URL.
HTML is a markup language that is only useful for one-way rendering of human-readable information. For your application you need machine-readable markup rather than trying to decipher human-readable text. Google already has a very comprehensive API, https://developers.google.com/custom-search/, which is easy to use and a much better way of achieving this than using BeautifulSoup.
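As a rough illustration of that suggestion (not code from the answer), the Custom Search JSON API can return image results directly; API_KEY and CX below are placeholders for your own API key and Programmable Search Engine ID:

import requests

API_KEY = "YOUR_API_KEY"        # placeholder: key from the Google Cloud console
CX = "YOUR_SEARCH_ENGINE_ID"    # placeholder: your search engine ID

def image_search(query):
    params = {
        "key": API_KEY,
        "cx": CX,
        "q": query,
        "searchType": "image",  # restrict results to images
    }
    resp = requests.get("https://www.googleapis.com/customsearch/v1",
                        params=params, timeout=15)
    resp.raise_for_status()
    # Each result item carries the direct image URL in "link".
    return [item["link"] for item in resp.json().get("items", [])]

print(image_search("Australia high resolution"))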