Scrapy recursive link crawler - python

It starts with a URL on the web (e.g. http://python.org), fetches the web page corresponding to that URL, and parses all the links on that page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links from this new content into the repository, and continues this process for all links in the repository until it is stopped or a given number of links have been fetched.
How can I do that using Python and Scrapy? I am able to scrape all the links on a webpage, but how do I do this recursively, in depth?
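For reference, a minimal sketch of how a depth-limited recursive link crawler might look with Scrapy's CrawlSpider; the spider name, depth limit and page count below are illustrative:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class LinkSpider(CrawlSpider):
    name = "linkspider"                      # hypothetical spider name
    start_urls = ["http://python.org"]
    custom_settings = {
        "DEPTH_LIMIT": 2,                    # stop recursing past this depth
        "CLOSESPIDER_PAGECOUNT": 100,        # or stop after this many pages
    }
    # Follow every link found and hand each fetched page to parse_item
    rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),)

    def parse_item(self, response):
        # Record every link found on the page (the "repository" is Scrapy's output feed)
        for href in response.css("a::attr(href)").getall():
            yield {"page": response.url, "link": response.urljoin(href)}

Run it with, for example, scrapy runspider linkspider.py -o links.json.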

Several remarks:
You don't need Scrapy for such a simple task. urllib2 (or Requests) and an HTML parser (Beautiful Soup, etc.) can do the job.
I don't recall where I heard it, but I think it's better to crawl using a BFS algorithm: it makes it easy to avoid circular references (a BFS sketch follows after the output below).
Below is a simple implementation: it does not fetch internal links (only absolute, fully formed hyperlinks), it has no error handling (403, 404, no links, ...), and it is abysmally slow (the multiprocessing module can help a lot in this case).
import BeautifulSoup
import urllib2
import itertools
import random


class Crawler(object):
    """docstring for Crawler"""

    def __init__(self):
        self.soup = None                               # Beautiful Soup object
        self.current_page = "http://www.python.org/"   # Current page's address
        self.links = set()                             # Queue with every links fetched
        self.visited_links = set()
        self.counter = 0                               # Simple counter for debug purpose

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every links
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception:  # Magnificent exception handling
            pass

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all url has been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()
Output:
In [48]: run BFScrawler.py
0 : http://www.python.org/
1 : http://twistedmatrix.com/trac/
2 : http://www.flowroute.com/
http://www.egenix.com/files/python/mxODBC.html
http://wiki.python.org/moin/PyQt
http://wiki.python.org/moin/DatabaseProgramming/
http://wiki.python.org/moin/CgiScripts
http://wiki.python.org/moin/WebProgramming
http://trac.edgewall.org/
http://www.facebook.com/flowroute
http://www.flowroute.com/
http://www.opensource.org/licenses/mit-license.php
http://roundup.sourceforge.net/
http://www.zope.org/
http://www.linkedin.com/company/flowroute
http://wiki.python.org/moin/TkInter
http://pypi.python.org/pypi
http://pycon.org/#calendar
http://dyn.com/
http://www.google.com/calendar/ical/j7gov1cmnqr9tvg14k621j7t5c%40group.calendar.google.com/public/basic.ics
http://www.pygame.org/news.html
http://www.turbogears.org/
http://www.openbookproject.net/pybiblio/
http://wiki.python.org/moin/IntegratedDevelopmentEnvironments
http://support.flowroute.com/forums
http://www.pentangle.net/python/handbook/
http://dreamhost.com/?q=twisted
http://www.vrplumber.com/py3d.py
http://sourceforge.net/projects/mysql-python
http://wiki.python.org/moin/GuiProgramming
http://software-carpentry.org/
http://www.google.com/calendar/ical/3haig2m9msslkpf2tn1h56nn9g%40group.calendar.google.com/public/basic.ics
http://wiki.python.org/moin/WxPython
http://wiki.python.org/moin/PythonXml
http://www.pytennessee.org/
http://labs.twistedmatrix.com/
http://www.found.no/
http://www.prnewswire.com/news-releases/voip-innovator-flowroute-relocates-to-seattle-190011751.html
http://www.timparkin.co.uk/
http://docs.python.org/howto/sockets.html
http://blog.python.org/
http://docs.python.org/devguide/
http://www.djangoproject.com/
http://buildbot.net/trac
http://docs.python.org/3/
http://www.prnewswire.com/news-releases/flowroute-joins-voxbones-inum-network-for-global-voip-calling-197319371.html
http://www.psfmember.org
http://docs.python.org/2/
http://wiki.python.org/moin/Languages
http://sip-trunking.tmcnet.com/topics/enterprise-voip/articles/341902-grandstream-ip-voice-solutions-receive-flowroute-certification.htm
http://www.twitter.com/flowroute
http://wiki.python.org/moin/NumericAndScientific
http://www.google.com/calendar/ical/b6v58qvojllt0i6ql654r1vh00%40group.calendar.google.com/public/basic.ics
http://freecode.com/projects/pykyra
http://www.xs4all.com/
http://blog.flowroute.com
http://wiki.python.org/moin/PyGtk
http://twistedmatrix.com/trac/
http://wiki.python.org/moin/
http://wiki.python.org/moin/Python2orPython3
http://stackoverflow.com/questions/tagged/twisted
http://www.pycon.org/
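
For comparison, here is a minimal BFS sketch of the same idea, written for Python 3 with Requests and Beautiful Soup (both assumed installed); the visited set is what prevents circular references, and the page limit is illustrative:

from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def bfs_crawl(start_url, max_pages=3):
    queue = deque([start_url])          # FIFO queue gives breadth-first order
    visited = set()
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:              # skip already-fetched pages (no cycles)
            continue
        visited.add(url)
        print(len(visited), ":", url)
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for a in soup.find_all("a", href=True):
            link = urljoin(url, a["href"])          # resolve relative links too
            if link.startswith("http") and link not in visited:
                queue.append(link)
    return visited

bfs_crawl("http://www.python.org/")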

Here is the main crawl method, written to scrape links recursively from a webpage. This method crawls a URL and puts all the crawled URLs into a buffer; multiple threads then wait to pop URLs from this global buffer and call this crawl method again (see the sketch after the code below).
def crawl(self, urlObj):
    '''Main function to crawl URL's '''
    try:
        if ((urlObj.valid) and (urlObj.url not in CRAWLED_URLS.keys())):
            rsp = urlcon.urlopen(urlObj.url, timeout=2)
            hCode = rsp.read()
            soup = BeautifulSoup(hCode)
            links = self.scrap(soup)
            boolStatus = self.checkmax()
            if boolStatus:
                CRAWLED_URLS.setdefault(urlObj.url, "True")
            else:
                return
            for eachLink in links:
                if eachLink not in VISITED_URLS:
                    parsedURL = urlparse(eachLink)
                    if parsedURL.scheme and "javascript" in parsedURL.scheme:
                        #print("***************Javascript found in scheme " + str(eachLink) + "**************")
                        continue
                    '''Handle internal URLs '''
                    try:
                        if not parsedURL.scheme and not parsedURL.netloc:
                            #print("No scheme and host found for " + str(eachLink))
                            newURL = urlunparse(parsedURL._replace(**{"scheme": urlObj.scheme, "netloc": urlObj.netloc}))
                            eachLink = newURL
                        elif not parsedURL.scheme:
                            #print("Scheme not found for " + str(eachLink))
                            newURL = urlunparse(parsedURL._replace(**{"scheme": urlObj.scheme}))
                            eachLink = newURL
                        if eachLink not in VISITED_URLS:  # Check again for internal URL's
                            #print(" Found child link " + eachLink)
                            CRAWL_BUFFER.append(eachLink)
                            with self._lock:
                                self.count += 1
                                #print(" Count is =================> " + str(self.count))
                            boolStatus = self.checkmax()
                            if boolStatus:
                                VISITED_URLS.setdefault(eachLink, "True")
                            else:
                                return
                    except TypeError:
                        print("Type error occured ")
        else:
            print("URL already present in visited " + str(urlObj.url))
    except socket.timeout as e:
        print("**************** Socket timeout occured*******************")
    except URLError as e:
        if isinstance(e.reason, ConnectionRefusedError):
            print("**************** Conn refused error occured*******************")
        elif isinstance(e.reason, socket.timeout):
            print("**************** Socket timed out error occured***************")
        elif isinstance(e.reason, OSError):
            print("**************** OS error occured*************")
        elif isinstance(e, HTTPError):
            print("**************** HTTP Error occured*************")
        else:
            print("**************** URL Error occured***************")
    except Exception as e:
        print("Unknown exception occured while fetching HTML code" + str(e))
        traceback.print_exc()
The complete source code and instructions are available at https://github.com/tarunbansal/crawler
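
The crawl method above only fills CRAWL_BUFFER; the consuming side is not shown here. A minimal sketch of what the worker threads might look like, assuming a thread-safe queue.Queue is used instead of a plain list (the worker function and thread count are hypothetical):

import queue
import threading

CRAWL_BUFFER = queue.Queue()                 # thread-safe buffer of URLs to crawl

def worker(crawler):
    # `crawler` is assumed to be an instance of the class that defines crawl()
    # Each thread repeatedly pops a URL object from the shared buffer and crawls it
    while True:
        url_obj = CRAWL_BUFFER.get()         # blocks until something is available
        if url_obj is None:                  # sentinel value: time to shut down
            break
        crawler.crawl(url_obj)               # may push newly found links back into the buffer
        CRAWL_BUFFER.task_done()

threads = [threading.Thread(target=worker, args=(crawler,)) for _ in range(4)]
for t in threads:
    t.start()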

Related

Unable to scrape emails from some websites maybe due to r.html.render() not working properly

I have some website links as samples for extracting any email available on their internal pages.
However, even though I try to render JS-driven websites via r.html.render() within the scrape_email(url) method, some of the websites, such as arken.trygge.dk, gronnebakken.dk, dagtilbud.ballerup.dk/boernehuset-bispevangen, etc., do not return any email, which might be due to a rendering issue.
I have attached the sample file for convenience of running.
I don't want to use Selenium, as there can be thousands or even millions of webpages I want to extract emails from.
So far this is my code:
import os
import time
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
import pandas as pd
from gtts import gTTS
import winsound

# For convenience of seeing console output in the script
pd.options.display.max_colwidth = 180

# Get the start time of script execution
startTime = time.time()

# Paste file name inside ''
input_file_name = 'sample'
input_df = pd.read_excel(input_file_name + '.xlsx', engine='openpyxl')
input_df = input_df.dropna(how='all')

internal_urls = set()
emails = set()
total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_internal_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    print("Domain name -- ", domain_name)
    try:
        soup = BeautifulSoup(requests.get(url, timeout=5).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            # join the URL if it's relative (not absolute link)
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            if parsed_href.netloc != domain_name:
                # if the link is not of same domain pass
                continue
            if parsed_href.path.endswith((".csv", ".xlsx", ".txt", ".pdf", ".mp3", ".png", ".jpg", ".jpeg", ".svg", ".mov", ".js", ".gif", ".mp4", ".avi", ".flv", ".wav")):
                # Overlook site images, pdf and other files rather than webpages
                continue
            print(f"Internal link: {href}")
            urls.add(href)
            internal_urls.add(href)
        return urls
    except requests.exceptions.Timeout as err:
        print("The website is not loading within 5 seconds... Continuing crawling the next one")
        pass
    except:
        print("The website is unavailable. Continuing crawling the next one")
        pass


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"Crawling: {url}")
    links = get_internal_links(url)
    # for link in links:
    #     if total_urls_visited > max_urls:
    #         break
    #     crawl(link, max_urls=max_urls)


def scrape_email(url):
    EMAIL_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    # EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")#(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
    try:
        # initiate an HTTP session
        session = HTMLSession()
        # get the HTTP Response
        r = session.get(url, timeout=10)
        # for JAVA-Script driven websites
        r.html.render()
        single_url_email = []
        for re_match in re.finditer(EMAIL_REGEX, r.html.raw_html.decode()):
            single_url_email.append(re_match.group().lower())
        r.session.close()
        return set(single_url_email)
    except:
        pass


def crawl_website_scrape_email(url, max_internal_url_no=20):
    crawl(url, max_urls=max_internal_url_no)
    each_url_emails = []
    global internal_urls
    global emails
    for each_url in internal_urls:
        each_url_emails.append(scrape_email(each_url))
    URL_WITH_EMAILS = {'main_url': url, 'emails': each_url_emails}
    emails = {}
    internal_urls = set()
    return URL_WITH_EMAILS


def list_check(emails_list, email_match):
    match_indexes = [i for i, s in enumerate(emails_list) if email_match in s]
    return [emails_list[index] for index in match_indexes]


URL_WITH_EMAILS_LIST = [crawl_website_scrape_email(x) for x in input_df['Website'].values]
URL_WITH_EMAILS_DF = pd.DataFrame(data=URL_WITH_EMAILS_LIST)
URL_WITH_EMAILS_DF.to_excel(f"{input_file_name}_email-output.xlsx", index=False)
How can I solve the issue of not being able to scrape emails from the above-mentioned and similar websites?
Is there also any way to detect and print a message if my GET request is refused by a bot detector or a similar protection mechanism?
Also, how can I make this code more robust?
Thank you in advance.
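
One thing that can be tried before reaching for Selenium is to give render() more time on JavaScript-heavy pages; timeout, sleep and retries are real requests_html parameters of render(), though the values below are only illustrative:

# inside scrape_email(), in place of the bare r.html.render()
r.html.render(timeout=20, sleep=2, retries=3)   # allow slow pages more time to execute JS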

Web Scraping Python BeautifulSoup get elements for each webpage in website

I am in my infancy of Python coding. What I am trying to do is build a web scraper which gets all the links from a website and then returns the elements from each page. The code I started with is from https://www.thepythoncode.com/article/extract-all-website-links-python
and it works really nicely to get all the links from a website.
As I am only interested in the internal links, I have added some extra code to try and get the elements (title, h1, some other bits which I haven't added yet). The issue I am running into is that I think the href sometimes returns an email; the code then tries to extract the elements from it, so obviously it bugs out. I have tried to avoid it picking up the email (which I also thought the is_valid function would handle), but I am obviously missing something. Any help would be really appreciated.
import re
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

internal_urls = set()
external_urls = set()
title_urls = set()


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # is_internal_link == True:
    title_check = soup.find_all('title')
    if title_check != " " or title_check != None:
        get_title(url)
        get_heading_tags(url)
    for a_tag in soup.findAll("a"):
        # is_internal_link = False
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                # print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        if re.search('#', href) == True:
            continue
        urls.add(href)
        internal_urls.add(href)
    return urls


# number of urls visited so far will be stored here
total_urls_visited = 0


def get_title(url):  # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # print("Title of the website is : ")
    for title in soup.find_all('title'):
        if title == "" and title == None:
            continue
        title_text = title.get_text()
        title_urls.add(title_text)
        print(title_text)
        print((len(title_text)))


def get_heading_tags(url):
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    heading_tags = ['h1', 'h2', 'h3']
    i = 0
    for tags in soup.find_all(heading_tags):
        if tags == " " or tags == None:
            continue
        tags_text = tags.get_text()
        letters_in_tags = len(tags_text) - tags_text.count(" ")
        i += 1
        print(f'{tags.name} {i} -> {tags_text} -> Length ->{letters_in_tags} ')


def crawl(url, max_urls=80):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET}")
    links = get_all_website_links(url)
    for link in links:
        if re.search('#', link) != True:
            if total_urls_visited > max_urls:
                break
            crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    crawl("https://website.com/")  # put website here.
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
The loop in question is:
for link in links:
    if re.search('#', link) != True:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
You are only checking whether # is present in the link (and even that check isn't correct) to decide whether it's an email or not. Also note that ordinary links can contain # in them (fragments).
Basically, emails inside <a> tags will be of the form <a href="mailto:someone@example.com">.
So to differentiate emails from links, you can use the check below.
for link in links:
    if not link.startswith('mailto:'):
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
This will ignore all the emails and only scrape links.
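A slightly more general sketch is to check the URL scheme instead, which also skips tel: and javascript: links:

from urllib.parse import urlparse

for link in links:
    if urlparse(link).scheme in ("http", "https"):   # skips mailto:, tel:, javascript:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)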

Web Crawler not working with Python

I'm having issues with a simple web crawler: when I run the following script, it does not iterate through the sites and it does not give me any results.
This is what I get:
1 Visiting: https://www.mongodb.com/
Word never found
Process finished with exit code 0
Any tips as to why this is not working correctly? I'm using the following example: http://www.netinstructions.com/how-to-make-a-web-crawler-in-under-50-lines-of-python-code/
Here is the code:
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse


class LinkParser(HTMLParser):
    # This is a function that HTMLParser normally has
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        """ We are looking for the beginning of a link.
        Links normally look like <a href="https://example.com">link text</a> """
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our colection of links:
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        self.links = []
        # Remember the base URL which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or .PDFs for example)
        if response.getheader('Content-Type') == 'text/html':
            htmlBytes = response.read()
            # Note that feed() handles Strings well, but not bytes
            # (A change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "", []


# And finally here is our spider. It takes in an URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string
    # In our getLinks function we return the web page
    # (this is useful for searching for the word)
    # and we return a set of links from that web page
    # (this is useful for where to go next)
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited + 1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
            # Add the pages that we visited to the end of our collection
            # of pages to visit:
            pagesToVisit = pagesToVisit + links
            print(" **Success!**")
        except:
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")


if __name__ == "__main__":
    spider("https://www.mongodb.com/", "MongoDB", 400)
First, edit the Content-Type checker line to:
if response.getheader('Content-Type') == 'text/html; charset=utf-8':
as suggested by @glibdud.
If you would like your program to check all links until maxPages is reached or pagesToVisit == [], simply remove the foundWord condition from the line:
while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
so that it reads:
while numberVisited < maxPages and pagesToVisit != []:
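Note that comparing against one exact header value is fragile, since servers report different charsets. A looser check (a sketch, not from the linked tutorial) only looks at the media-type prefix:

# in getLinks(), instead of comparing against a single exact header value
content_type = response.getheader('Content-Type') or ''
if content_type.startswith('text/html'):   # matches 'text/html' with or without a charset
    htmlBytes = response.read()
    htmlString = htmlBytes.decode("utf-8")
    self.feed(htmlString)
    return htmlString, self.links
else:
    return "", []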

Web Crawler problems in Python

I've been working on creating a single-threaded Web Crawler in Python that will group the assets of each page and output a JSON array of the form:
[
    {
        url: 'http://url.com/',
        assets: [
            'http://url.com/imgs/img1.jpg',
            'http://url.com/css/style.css',
        ]
    },
    {
        url: 'http://url.com/services',
        assets: [
            'http://url.com/imgs/services.jpg',
            'http://url.com/css/style.css',
        ]
    },
    ...
]
To quickly summarise the functionality:
Using BeautifulSoup to parse HTML and extract links
Using urlparse to:
build absolute URLs from relative URLs
check whether a URL is local using netloc
add visited URLs/assets to dictionaries via their paths
Using robotparser to check whether I can crawl each page I find by looking at the robots.txt file
To do this I pass the root of the website to the crawler, i.e. ./crawl.py http://sitename.com/ (including the final slash).
I've made the assumption that if the URL ends in .html, or the resource path doesn't contain a '.', I will be able to crawl it as an HTML page.
I've been having problems with a few things, including:
locales - Is there a smart way to detect and avoid crawling the same pages in different locales?
When trying to crawl particular sites I'll end up with a "maximum recursion depth exceeded" message from Python (see the sketch after the code below).
I tried to avoid this by checking whether a link's rel attribute contains alternate, but this doesn't seem to have a big impact.
An example of this is crawling http://url.com/ but also having to crawl http://url.com/en-us, http://url.com/en-au, etc.
angular/react - Is it possible to crawl sites that use Angular/React/similar frameworks?
I've been trying to find useful resources to help me with this but so far haven't found anything concrete.
Any info/feedback is greatly appreciated.
Code below:
#!/usr/bin/python
import sys
import json
import urlparse
import robotparser
import urllib2
from queue import Queue
from bs4 import BeautifulSoup


class Crawler:
    def gethtml(self, url):
        try:
            return urllib2.urlopen(url)
        except urllib2.HTTPError, e:
            print 'We failed with error code - %s.' % e.code
            if e.code == 404:
                print('404 File Not Found: ' + url)
            else:
                print('e code not 404')
            return None

    def __init__(self):
        url = sys.argv[1]
        sys.setrecursionlimit(100000)
        parsedurl = urlparse.urlparse(url)
        print('Crawling from URL: ' + url)
        self.parser = robotparser.RobotFileParser()
        self.parser.set_url(url + 'robots.txt')
        self.parser.read()
        if parsedurl.netloc.startswith('www.'):  # compare netlocs without www.
            self.netloc = parsedurl.netloc[4:]
        else:
            self.netloc = parsedurl.netloc
        html = self.gethtml(url)
        if html is not None:
            self.visited = {}
            self.current = {}
            self.currentassets = {}
            self.output = []
            self.queue = Queue()
            if len(parsedurl.path) < 1:
                self.visited['/index.html'] = True
            self.crawlhtml(url, html)
        else:
            print("Sorry, couldn't find HTML at that URL!")

    def isabsolute(self, url):
        return bool(urlparse.urlparse(url).netloc)

    def checkifhtml(self, url):
        parsedurl = urlparse.urlparse(url)
        path = parsedurl.path
        if url.endswith('.html') or '.' not in path:  # path is a html file
            if not self.visited.has_key(path):
                self.queue.enqueue(url)
            return True
        else:
            return False

    def getasseturl(self, current_url, url):
        if not self.isabsolute(url):  # make our relative url absolute
            url = urlparse.urljoin(current_url, url)
        parsedurl = urlparse.urlparse(url)
        path = parsedurl.path
        netloc = parsedurl.netloc
        local = False
        if netloc.startswith('www.'):  # check if is local url
            netloc = netloc.replace('www.', '', 1)
        if netloc == self.netloc:
            local = True
        if self.currentassets.get(path) is None:
            self.currentassets[path] = True
            if local:
                if self.checkifhtml(url) is False:
                    self.current['assets'].append(url)

    def checkqueue(self):
        print('Checking queue. Queue Size: ' + str(self.queue.size()))
        if self.queue.size() == 0:
            print('\n------------------------------------------------------\n')
            print(json.dumps(self.output, indent=4))
            print('\n------------------------------------------------------\n')
            print(self.visited)
        else:
            url = self.queue.dequeue()
            parsedurl = urlparse.urlparse(url)
            path = parsedurl.path
            if self.visited.get(path) is None:
                self.visited[path] = True
                html = self.gethtml(url)
                if html is not None:
                    self.crawlhtml(url, html)
                else:
                    self.checkqueue()
            else:
                self.checkqueue()

    def crawlhtml(self, url, html):
        print('---------------------------------------\nLooking at url: ' + url)
        if self.parser.can_fetch('*', url):
            self.current['url'] = url
            self.current['assets'] = []
            parsedhtml = BeautifulSoup(html, 'lxml')  # use lxml for speed
            for link in parsedhtml.find_all(['a', 'link', 'area', 'base', 'image']):
                if link.get('href') is not None:
                    if link.get('rel') is None:
                        self.getasseturl(url, link.get('href'))
                    else:
                        if not 'alternate' in link.get('rel'):
                            self.getasseturl(url, link.get('href'))
            for link in parsedhtml.find_all(['script', 'img', 'frame', 'iframe', 'input', 'audio', 'embed', 'source', 'video']):
                if link.get('src') is not None:
                    self.getasseturl(url, link.get('src'))
            self.output.append(self.current)
            self.current = {}
            self.currentassets = {}
        self.checkqueue()


c = Crawler()
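
On the "maximum recursion depth exceeded" issue mentioned above: checkqueue() calls crawlhtml(), which calls checkqueue() again, so the call stack grows by one frame per crawled page. A sketch of an iterative alternative that drains the queue in a loop instead; it assumes crawlhtml() is changed to no longer call checkqueue() itself, and it reuses the size()/dequeue() queue API from the code above:

def run(self):
    # Drain the queue iteratively instead of recursing once per page
    while self.queue.size() > 0:
        url = self.queue.dequeue()
        path = urlparse.urlparse(url).path
        if self.visited.get(path) is None:
            self.visited[path] = True
            html = self.gethtml(url)
            if html is not None:
                self.crawlhtml(url, html)   # now only collects assets and enqueues links
    print(json.dumps(self.output, indent=4))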

Gevent link crawler

Here I have written code using Python and Beautiful Soup to parse all the links on a page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links from this new content into the repository, and continues this process for all links in the repository until stopped or until a given number of links have been fetched.
But this code is very slow. How can I improve it by using asynchronous programming with gevent in Python?
Code
class Crawler(object):
    def __init__(self):
        self.soup = None                               # Beautiful Soup object
        self.current_page = "http://www.python.org/"   # Current page's address
        self.links = set()                             # Queue with every links fetched
        self.visited_links = set()
        self.counter = 0                               # Simple counter for debug purpose

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every links
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception as e:  # Magnificent exception handling
            print 'Error: ', e
            pass

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all url has been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()
Update 1
import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys
import gevent.monkey; gevent.monkey.patch_all(thread=False)


class Crawler(object):
    def __init__(self):
        self.soup = None                               # Beautiful Soup object
        self.current_page = "http://www.python.org/"   # Current page's address
        self.links = set()                             # Queue with every links fetched
        self.visited_links = set()
        self.counter = 0                               # Simple counter for debug purpose

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every links
        self.soup = BeautifulSoup(html_code)
        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    print 'entered in if link: ', link
                    page_links.append(link)
                    print "Adding link" + link + "\n"
                elif link.startswith('/'):
                    print 'entered in elif link: ', link
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    print 'entered in else link: ', link
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"
        except Exception, ex:  # Magnificent exception handling
            print ex

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all url has been fetched)
        crawling_greenlets = []
        for i in range(3):
            crawling_greenlets.append(gevent.spawn(self.open))
        gevent.joinall(crawling_greenlets)
        # while len(self.visited_links) < 4 or (self.visited_links == self.links):
        #     self.open()
        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()
import gevent and make sure monkey-patching is done to make standard library calls non-blocking and aware of gevent:
import gevent
from gevent import monkey; monkey.patch_all()
(you can selectively decide what has to be monkey-patched, but let's say it is not your problem at the moment)
In your run method, call your open function inside a greenlet. run can return the greenlet object, so you can wait for it whenever you need the results, using gevent.joinall for example. Something like this:
def run(self):
    return gevent.spawn(self.open)

c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1, c2, c3)]
gevent.joinall(crawling_tasks)
print [c.links for c in (c1, c2, c3)]
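
As noted above, the patching can also be done selectively; for example, patching just the socket and ssl modules is usually enough to make urllib2 calls cooperative (a minimal sketch):

from gevent import monkey
monkey.patch_socket()   # make plain socket operations yield to other greenlets
monkey.patch_ssl()      # needed so https requests are cooperative too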
