Gevent link crawler - python

Here I have written code using Python and Beautiful Soup to parse all the links on a page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links from this new content into the repository, and continues this process for all links in the repository until stopped or until a given number of links has been fetched.
But this code is very slow. How can I improve it by using asynchronous programming with gevent in Python?
Code
import BeautifulSoup
import urllib2
import itertools
import random

class Crawler(object):
    def __init__(self):
        self.soup = None                                # Beautiful Soup object
        self.current_page = "http://www.python.org/"    # Current page's address
        self.links = set()                              # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                                # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)
        # Fetch every link
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception as e:               # Magnificent exception handling
            print 'Error: ', e
        # Update links
        self.links = self.links.union(set(page_links))
        # Choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all urls have been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link

if __name__ == '__main__':
    C = Crawler()
    C.run()
Update 1
import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys
import gevent.monkey; gevent.monkey.patch_all(thread=False)

class Crawler(object):
    def __init__(self):
        self.soup = None                                # Beautiful Soup object
        self.current_page = "http://www.python.org/"    # Current page's address
        self.links = set()                              # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                                # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)
        # Fetch every link
        self.soup = BeautifulSoup(html_code)
        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    print 'entered in if link: ', link
                    page_links.append(link)
                    print "Adding link" + link + "\n"
                elif link.startswith('/'):
                    print 'entered in elif link: ', link
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    print 'entered in else link: ', link
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"
        except Exception as ex:                          # Magnificent exception handling
            print ex
        # Update links
        self.links = self.links.union(set(page_links))
        # Choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all urls have been fetched)
        crawling_greenlets = []
        for i in range(3):
            crawling_greenlets.append(gevent.spawn(self.open))
        gevent.joinall(crawling_greenlets)
        # while len(self.visited_links) < 4 or (self.visited_links == self.links):
        #     self.open()
        for link in self.links:
            print link

if __name__ == '__main__':
    C = Crawler()
    C.run()

import gevent and make sure monkey-patching is done to make standard library calls non-blocking and aware of gevent:
import gevent
from gevent import monkey; monkey.patch_all()
(you can selectively decide what has to be monkey-patched, but let's say it is not
your problem at the moment)
In your run, have your open function called inside a greenlet. run can
return the greenlet object, so you can wait for it whenever you need the
results, using gevent.joinall for example. Something like this:
def run(self):
    return gevent.spawn(self.open)

c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1, c2, c3)]
gevent.joinall(crawling_tasks)
print [c.links for c in (c1, c2, c3)]
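If you later want a single crawler to fetch many pages concurrently, rather than one greenlet per Crawler instance, gevent's Pool gives you bounded concurrency. The sketch below only illustrates that idea and is not part of the answer above; the fetch helper and the URL list are made up for the example:

from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool
import urllib2

def fetch(url):
    # Runs in its own greenlet; urlopen cooperatively yields while waiting on the network.
    return url, urllib2.urlopen(url).read()

pool = Pool(10)   # at most 10 requests in flight at once
urls = ["http://www.python.org/", "http://docs.python.org/2/", "http://pypi.python.org/pypi"]
for url, html in pool.imap_unordered(fetch, urls):
    print url, len(html)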

Related

returning a scraped variable from a function in python

I want to create a function that returns a variable I can write to a CSV.
If I write:
from makesoup import make_soup

def get_links(soupbowl):
    linkname = ""
    for boot in soupbowl.findAll('tbody'):
        for record in boot.findAll('tr', {"row0", "row1"}):
            for link in record.find_all('a'):
                if link.has_attr('href'):
                    linkname = linkname + "\n" + (link.attrs['href'])[1:]
                    print(linkname)

soup = make_soup("https://www.footballdb.com/teams/index.html")
pyt = get_links(soup)
print(pyt)
It prints what I want (all links on the page) inside the function, and None with print(pyt).
Instead of print(linkname) in the function, I want to return(linkname).
But when I do, I only get the first link on the page. Is there a way to pass all the links to the variable pyt, which is outside of the function?
Thank you in advance.
Try the following, to get all the links in one go:
from makesoup import make_soup

def get_links(soupbowl):
    links_found = []
    linkname = ""
    for boot in soupbowl.findAll('tbody'):
        for record in boot.findAll('tr', {"row0", "row1"}):
            for link in record.find_all('a'):
                if link.has_attr('href'):
                    linkname = linkname + "\n" + (link.attrs['href'])[1:]
                    links_found.append(linkname)
    return links_found

soup = make_soup("https://www.footballdb.com/teams/index.html")
pyt = get_links(soup)
print(pyt)
Or use yield to return them one by one, while you process the output for something else:
from makesoup import make_soup

def get_links(soupbowl):
    linkname = ""
    for boot in soupbowl.findAll('tbody'):
        for record in boot.findAll('tr', {"row0", "row1"}):
            for link in record.find_all('a'):
                if link.has_attr('href'):
                    linkname = linkname + "\n" + (link.attrs['href'])[1:]
                    yield linkname

soup = make_soup("https://www.footballdb.com/teams/index.html")
pyt = get_links(soup)
for link in pyt:
    do_something()
Or collect the links into a list inside the loop and return it:
from makesoup import make_soup

def get_links(soupbowl):
    links = []
    linkname = ""
    for boot in soupbowl.findAll('tbody'):
        for record in boot.findAll('tr', {"row0", "row1"}):
            for link in record.find_all('a'):
                if link.has_attr('href'):
                    linkname = linkname + "\n" + (link.attrs['href'])[1:]
                    links.append(linkname)
    return links

soup = make_soup("https://www.footballdb.com/teams/index.html")
pyt = get_links(soup)
print(pyt)

Class crawler written in python throws attribute error

After writing some code in Python, I've got stuck in deep trouble. I'm a newbie at writing code following OOP design in Python. The xpaths I've used in my code are flawless. I'm getting lost when it comes to running the "passing_links" method of my "Info_grabber" class through an instance of the "page_crawler" class. Every time I run my code I get the error "'page_crawler' object has no attribute 'passing_links'". Perhaps the way I've written my crawler class is not how it should be; however, as I've spent a few hours on it, I hope I might get some suggestions as to which lines I should rectify to make it work. Thanks in advance for taking a look into it:
from lxml import html
import requests

class page_crawler(object):
    main_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    base_link = "https://www.yellowpages.com"

    def __init__(self):
        self.links = [self.main_link]

    def crawler(self):
        for link in self.links:
            self.get_link(link)

    def get_link(self, link):
        print("Running page " + link)
        page = requests.get(link)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            return self.base_link + item_link
        links = tree.xpath('//div[@class="pagination"]//li/a/@href')
        for url in links:
            if not self.base_link + url in self.links:
                self.links += [self.base_link + url]

class Info_grabber(page_crawler):
    def __init__(self, plinks):
        page_crawler.__init__(self)
        self.plinks = [plinks]

    def passing_links(self):
        for nlink in self.plinks:
            print(nlink)
            self.crawling_deep(nlink)

    def crawling_deep(self, uurl):
        page = requests.get(uurl)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        try:
            email = tree.xpath('//div[@class="business-card-footer"]/a[@class="email-business"]/@href')[0]
        except IndexError:
            email = ""
        print(name, phone, email)

if __name__ == '__main__':
    crawl = Info_grabber(page_crawler)
    crawl.crawler()
    crawl.passing_links()
Now upon execution I get a new error "raise MissingSchema(error)" when it hits the line "self.crawling_deep(nlink)"
I'm not sure I understand what you're trying to do in page_crawler.get_link, but I think you should have a different method for collecting "pagination" links.
I renamed Info_grabber.plinks to Info_grabber.links so that page_crawler.crawler can access them, and managed to extract info from several pages; however, the code is far from ideal.
class page_crawler(object):
    main_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    base_link = "https://www.yellowpages.com"

    def __init__(self):
        self.links = []
        self.pages = []

    def crawler(self):
        for link in self.links:
            self.get_link(link)

    def get_link(self, link):
        print("Running page " + link)
        page = requests.get(link)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            if not self.base_link + item_link in self.links:
                self.links += [self.base_link + item_link]

    def get_pages(self, link):
        page = requests.get(link)
        tree = html.fromstring(page.text)
        links = tree.xpath('//div[@class="pagination"]//li/a/@href')
        for url in links:
            if not self.base_link + url in self.pages:
                self.pages += [self.base_link + url]

class Info_grabber(page_crawler):
    def __init__(self, plinks):
        page_crawler.__init__(self)
        self.links += [plinks]

    def passing_links(self):
        for nlink in self.links:
            print(nlink)
            self.crawling_deep(nlink)

    def crawling_deep(self, uurl):
        page = requests.get(uurl)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        try:
            email = tree.xpath('//div[@class="business-card-footer"]/a[@class="email-business"]/@href')[0]
        except IndexError:
            email = ""
        print(name, phone, email)

if __name__ == '__main__':
    url = page_crawler.main_link
    crawl = Info_grabber(url)
    crawl.crawler()
    crawl.passing_links()
You'll notice that I added a pages property and a get_pages method to page_crawler; I'll leave wiring them in to you.
You might need to add more methods to page_crawler later on, as they could be of use if you develop more child classes. Finally, consider looking into composition, as it is also a strong OOP feature.
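As a rough illustration of that last point, a composition-based variant could hold a page_crawler instead of inheriting from it. This is only a sketch built on the classes above; the name InfoGrabberComposed and its run method are invented for the example:

class InfoGrabberComposed(object):
    """Has a page_crawler (composition) rather than being one (inheritance)."""

    def __init__(self, start_url):
        self.crawler = page_crawler()        # the collaborator that collects links
        self.crawler.links.append(start_url)

    def run(self):
        self.crawler.crawler()               # delegate link collection to page_crawler
        for link in self.crawler.links:
            print(link)
            self.crawling_deep(link)

    def crawling_deep(self, url):
        page = requests.get(url)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        print(name, phone)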
Your crawl is an instance of the page_crawler class but not of the Info_grabber class, which is the class that has the passing_links method. I think what you want to do is make crawl an instance of Info_grabber instead.
Then, I believe, before doing self.crawling_deep you must do:
if n_link:
    page = requests.get(n_link).text
    tel = re.findall(r'\d{10}', page)[0] if re.findall(r'\d{10}', page) else ""
    print(tel)
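As a small aside, that snippet runs the regular expression twice; storing the match list first avoids the duplicate work. A sketch of the same logic (n_link is the variable from the snippet above):

import re
import requests

if n_link:
    page = requests.get(n_link).text
    matches = re.findall(r'\d{10}', page)   # every 10-digit sequence on the page
    tel = matches[0] if matches else ""
    print(tel)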

Web Crawler not working with Python

I'm having issues with a simple web crawler: when I run the following script, it does not iterate through the sites and does not give me any results.
This is what I get:
1 Visiting: https://www.mongodb.com/
Word never found
Process finished with exit code 0
Any tips as to why this is not working correctly? I'm using the following example (http://www.netinstructions.com/how-to-make-a-web-crawler-in-under-50-lines-of-python-code/).
Here is the code:
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

class LinkParser(HTMLParser):
    # This is a function that HTMLParser normally has
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        """We are looking for the beginning of a link.
        Links normally look like <a href="...">"""
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        self.links = []
        # Remember the base URL which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or .PDFs for example)
        if response.getheader('Content-Type') == 'text/html':
            htmlBytes = response.read()
            # Note that feed() handles Strings well, but not bytes
            # (A change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "", []

# And finally here is our spider. It takes in a URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string
    # In our getLinks function we return the web page
    # (this is useful for searching for the word)
    # and we return a set of links from that web page
    # (this is useful for where to go next)
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited + 1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
            # Add the pages that we visited to the end of our collection
            # of pages to visit:
            pagesToVisit = pagesToVisit + links
            print(" **Success!**")
        except:
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")

if __name__ == "__main__":
    spider("https://www.mongodb.com/", "MongoDB", 400)
First, edit the content-type checker line to:
if response.getheader('Content-Type') == 'text/html; charset=utf-8':
as suggested by @glibdud.
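A slightly more forgiving variant is to compare only the media type and ignore the charset parameter, so the check keeps working when a site reports a different charset. This is a sketch of the same idea, not the answer's exact code:

# Inside LinkParser.getLinks(), instead of comparing the full header value:
content_type = response.getheader('Content-Type') or ''
if content_type.split(';')[0].strip() == 'text/html':   # ignore any "; charset=..." suffix
    htmlBytes = response.read()
    htmlString = htmlBytes.decode("utf-8")
    self.feed(htmlString)
    return htmlString, self.links
else:
    return "", []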
If you would like your program to check all links until maxPages is reached or pagesToVisit is empty, simply remove the foundWord condition from the loop, changing the line:
while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
to:
while numberVisited < maxPages and pagesToVisit != []:

Web Crawler problems in Python

I've been working on creating a single-threaded Web Crawler in Python that will group the assets of each page and output a JSON array of the form:
[
{
url: 'http://url.com/',
assets: [
'http://url.com/imgs/img1.jpg',
'http://url.com/css/style.css',
]
},
{
url: 'http://url.com/services',
assets: [
'http://url.com/imgs/services.jpg',
'http://url.com/css/style.css',
]
},
...
]
To quickly summarise the functionality:
Using BeautifulSoup to parse HTML and extract links
Using urlparse to:
Build absolute urls from relative urls
Check if url is local using netloc
Add visited urls/assets to dictionaries via their paths
Using robotparser to check if I can crawl each page I find by looking at the robots.txt file
In order to do this I pass the root of the website to the crawler, i.e. ./crawl.py http://sitename.com/ (including the final slash)
I've made the assumption that if the url ends in .html, or if the resource path doesn't contain a '.', then I will be able to crawl it as an HTML page.
I've been having some problems with a few things, including:
locales - Is there a smart way to detect and avoid crawling the same pages in different locales?
When trying to crawl particular sites I'll end up with a maximum recursion depth exceeded message from Python.
I tried to avoid this by checking whether a link's rel attribute contained alternate, but this doesn't seem to have a big impact.
An example of this is crawling http://url.com/ but also having to crawl http://url.com/en-us, http://url.com/en-au, etc.
angular/react - Is it possible to crawl sites that are using angular/react/similar frameworks?
I've been trying to find useful resources to help me with this part, but so far haven't found anything concrete.
Any info/feedback is greatly appreciated.
Code below:
#!/usr/bin/python
import sys
import json
import urlparse
import robotparser
import urllib2
from queue import Queue
from bs4 import BeautifulSoup

class Crawler:
    def gethtml(self, url):
        try:
            return urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print 'We failed with error code - %s.' % e.code
            if e.code == 404:
                print('404 File Not Found: ' + url)
            else:
                print('e code not 404')
            return None

    def __init__(self):
        url = sys.argv[1]
        sys.setrecursionlimit(100000)
        parsedurl = urlparse.urlparse(url)
        print('Crawling from URL: ' + url)
        self.parser = robotparser.RobotFileParser()
        self.parser.set_url(url + 'robots.txt')
        self.parser.read()
        if parsedurl.netloc.startswith('www.'):  # compare netlocs without www.
            self.netloc = parsedurl.netloc[4:]
        else:
            self.netloc = parsedurl.netloc
        html = self.gethtml(url)
        if html is not None:
            self.visited = {}
            self.current = {}
            self.currentassets = {}
            self.output = []
            self.queue = Queue()
            if len(parsedurl.path) < 1:
                self.visited['/index.html'] = True
            self.crawlhtml(url, html)
        else:
            print("Sorry, couldn't find HTML at that URL!")

    def isabsolute(self, url):
        return bool(urlparse.urlparse(url).netloc)

    def checkifhtml(self, url):
        parsedurl = urlparse.urlparse(url)
        path = parsedurl.path
        if url.endswith('.html') or '.' not in path:  # path is a html file
            if not self.visited.has_key(path):
                self.queue.enqueue(url)
            return True
        else:
            return False

    def getasseturl(self, current_url, url):
        if not self.isabsolute(url):  # make our relative url absolute
            url = urlparse.urljoin(current_url, url)
        parsedurl = urlparse.urlparse(url)
        path = parsedurl.path
        netloc = parsedurl.netloc
        local = False
        if netloc.startswith('www.'):  # check if is local url
            netloc = netloc.replace('www.', '', 1)
        if netloc == self.netloc:
            local = True
        if self.currentassets.get(path) is None:
            self.currentassets[path] = True
            if local:
                if self.checkifhtml(url) is False:
                    self.current['assets'].append(url)

    def checkqueue(self):
        print('Checking queue. Queue Size: ' + str(self.queue.size()))
        if self.queue.size() == 0:
            print('\n------------------------------------------------------\n')
            print(json.dumps(self.output, indent=4))
            print('\n------------------------------------------------------\n')
            print(self.visited)
        else:
            url = self.queue.dequeue()
            parsedurl = urlparse.urlparse(url)
            path = parsedurl.path
            if self.visited.get(path) is None:
                self.visited[path] = True
                html = self.gethtml(url)
                if html is not None:
                    self.crawlhtml(url, html)
                else:
                    self.checkqueue()
            else:
                self.checkqueue()

    def crawlhtml(self, url, html):
        print('---------------------------------------\nLooking at url: ' + url)
        if self.parser.can_fetch('*', url):
            self.current['url'] = url
            self.current['assets'] = []
            parsedhtml = BeautifulSoup(html, 'lxml')  # use lxml for speed
            for link in parsedhtml.find_all(['a', 'link', 'area', 'base', 'image']):
                if link.get('href') is not None:
                    if link.get('rel') is None:
                        self.getasseturl(url, link.get('href'))
                    else:
                        if not 'alternate' in link.get('rel'):
                            self.getasseturl(url, link.get('href'))
            for link in parsedhtml.find_all(['script', 'img', 'frame', 'iframe', 'input', 'audio', 'embed', 'source', 'video']):
                if link.get('src') is not None:
                    self.getasseturl(url, link.get('src'))
            self.output.append(self.current)
            self.current = {}
            self.currentassets = {}
        self.checkqueue()

c = Crawler()

Scrapy recursive link crawler

It starts with a url on the web (ex: http://python.org), fetches the web-page corresponding to that url, and parses all the links on that page into a repository of links. Next, it fetches the contents of any of the url from the repository just created, parses the links from this new content into the repository and continues this process for all links in the repository until stopped or after a given number of links are fetched.
How can I do that using Python and Scrapy? I am able to scrape all the links on a webpage, but how do I perform this recursively, in depth?
Several remarks:
you don't need Scrapy for such a simple task. Urllib (or Requests) and an HTML parser (Beautiful Soup, etc.) can do the job
I don't recall where I've heard it, but I think it's better to crawl using a BFS algorithm. You can easily avoid circular references.
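That BFS idea can be sketched with a FIFO deque plus a visited set, which is what avoids circular references. This is only a sketch using the same urllib2/BeautifulSoup 3 stack, separate from the implementation below:

from collections import deque
import urllib2
import BeautifulSoup

def bfs_crawl(start_url, max_pages=3):
    # Classic BFS: a FIFO queue of pages to visit plus a set of already-visited pages.
    queue, visited = deque([start_url]), set()
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        soup = BeautifulSoup.BeautifulSoup(urllib2.urlopen(url).read())
        for a in soup.findAll('a'):
            href = a.get('href')
            if href and href.startswith('http://') and href not in visited:
                queue.append(href)
    return visited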
Below is a simple implementation: it does not fetch internal links (only absolute, fully formed hyperlinks), nor does it have any error handling (403, 404, no links, ...), and it is abysmally slow (the multiprocessing module can help a lot in this case).
import BeautifulSoup
import urllib2
import itertools
import random

class Crawler(object):
    """docstring for Crawler"""

    def __init__(self):
        self.soup = None                                # Beautiful Soup object
        self.current_page = "http://www.python.org/"    # Current page's address
        self.links = set()                              # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                                # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)
        # Fetch every link
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception:                    # Magnificent exception handling
            pass
        # Update links
        self.links = self.links.union(set(page_links))
        # Choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all urls have been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link

if __name__ == '__main__':
    C = Crawler()
    C.run()
Output:
In [48]: run BFScrawler.py
0 : http://www.python.org/
1 : http://twistedmatrix.com/trac/
2 : http://www.flowroute.com/
http://www.egenix.com/files/python/mxODBC.html
http://wiki.python.org/moin/PyQt
http://wiki.python.org/moin/DatabaseProgramming/
http://wiki.python.org/moin/CgiScripts
http://wiki.python.org/moin/WebProgramming
http://trac.edgewall.org/
http://www.facebook.com/flowroute
http://www.flowroute.com/
http://www.opensource.org/licenses/mit-license.php
http://roundup.sourceforge.net/
http://www.zope.org/
http://www.linkedin.com/company/flowroute
http://wiki.python.org/moin/TkInter
http://pypi.python.org/pypi
http://pycon.org/#calendar
http://dyn.com/
http://www.google.com/calendar/ical/j7gov1cmnqr9tvg14k621j7t5c%40group.calendar.google.com/public/basic.ics
http://www.pygame.org/news.html
http://www.turbogears.org/
http://www.openbookproject.net/pybiblio/
http://wiki.python.org/moin/IntegratedDevelopmentEnvironments
http://support.flowroute.com/forums
http://www.pentangle.net/python/handbook/
http://dreamhost.com/?q=twisted
http://www.vrplumber.com/py3d.py
http://sourceforge.net/projects/mysql-python
http://wiki.python.org/moin/GuiProgramming
http://software-carpentry.org/
http://www.google.com/calendar/ical/3haig2m9msslkpf2tn1h56nn9g%40group.calendar.google.com/public/basic.ics
http://wiki.python.org/moin/WxPython
http://wiki.python.org/moin/PythonXml
http://www.pytennessee.org/
http://labs.twistedmatrix.com/
http://www.found.no/
http://www.prnewswire.com/news-releases/voip-innovator-flowroute-relocates-to-seattle-190011751.html
http://www.timparkin.co.uk/
http://docs.python.org/howto/sockets.html
http://blog.python.org/
http://docs.python.org/devguide/
http://www.djangoproject.com/
http://buildbot.net/trac
http://docs.python.org/3/
http://www.prnewswire.com/news-releases/flowroute-joins-voxbones-inum-network-for-global-voip-calling-197319371.html
http://www.psfmember.org
http://docs.python.org/2/
http://wiki.python.org/moin/Languages
http://sip-trunking.tmcnet.com/topics/enterprise-voip/articles/341902-grandstream-ip-voice-solutions-receive-flowroute-certification.htm
http://www.twitter.com/flowroute
http://wiki.python.org/moin/NumericAndScientific
http://www.google.com/calendar/ical/b6v58qvojllt0i6ql654r1vh00%40group.calendar.google.com/public/basic.ics
http://freecode.com/projects/pykyra
http://www.xs4all.com/
http://blog.flowroute.com
http://wiki.python.org/moin/PyGtk
http://twistedmatrix.com/trac/
http://wiki.python.org/moin/
http://wiki.python.org/moin/Python2orPython3
http://stackoverflow.com/questions/tagged/twisted
http://www.pycon.org/
Here is the main crawl method, written to scrape links recursively from a webpage. This method will crawl a URL and put all the crawled URLs into a buffer. Multiple threads will then be waiting to pop URLs from this global buffer and call this crawl method again.
def crawl(self, urlObj):
    '''Main function to crawl URL's '''
    try:
        if ((urlObj.valid) and (urlObj.url not in CRAWLED_URLS.keys())):
            rsp = urlcon.urlopen(urlObj.url, timeout=2)
            hCode = rsp.read()
            soup = BeautifulSoup(hCode)
            links = self.scrap(soup)
            boolStatus = self.checkmax()
            if boolStatus:
                CRAWLED_URLS.setdefault(urlObj.url, "True")
            else:
                return
            for eachLink in links:
                if eachLink not in VISITED_URLS:
                    parsedURL = urlparse(eachLink)
                    if parsedURL.scheme and "javascript" in parsedURL.scheme:
                        #print("***************Javascript found in scheme " + str(eachLink) + "**************")
                        continue
                    '''Handle internal URLs '''
                    try:
                        if not parsedURL.scheme and not parsedURL.netloc:
                            #print("No scheme and host found for " + str(eachLink))
                            newURL = urlunparse(parsedURL._replace(**{"scheme": urlObj.scheme, "netloc": urlObj.netloc}))
                            eachLink = newURL
                        elif not parsedURL.scheme:
                            #print("Scheme not found for " + str(eachLink))
                            newURL = urlunparse(parsedURL._replace(**{"scheme": urlObj.scheme}))
                            eachLink = newURL
                        if eachLink not in VISITED_URLS:  # Check again for internal URL's
                            #print(" Found child link " + eachLink)
                            CRAWL_BUFFER.append(eachLink)
                            with self._lock:
                                self.count += 1
                                #print(" Count is =================> " + str(self.count))
                            boolStatus = self.checkmax()
                            if boolStatus:
                                VISITED_URLS.setdefault(eachLink, "True")
                            else:
                                return
                    except TypeError:
                        print("Type error occured ")
        else:
            print("URL already present in visited " + str(urlObj.url))
    except socket.timeout as e:
        print("**************** Socket timeout occured*******************")
    except URLError as e:
        if isinstance(e.reason, ConnectionRefusedError):
            print("**************** Conn refused error occured*******************")
        elif isinstance(e.reason, socket.timeout):
            print("**************** Socket timed out error occured***************")
        elif isinstance(e.reason, OSError):
            print("**************** OS error occured*************")
        elif isinstance(e, HTTPError):
            print("**************** HTTP Error occured*************")
        else:
            print("**************** URL Error occured***************")
    except Exception as e:
        print("Unknown exception occured while fetching HTML code" + str(e))
        traceback.print_exc()
The complete source code and instructions are available at https://github.com/tarunbansal/crawler
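The worker side described above (threads popping URLs from the shared buffer and calling crawl again) is not shown in the snippet. A very rough sketch might look like the following, where crawler is an instance of the class that owns crawl and make_url_obj stands in for however the project turns a URL string into its urlObj; both names are assumptions, not the project's actual code:

import threading

def worker(crawler):
    # Each thread keeps popping a URL from the shared buffer and crawling it.
    while CRAWL_BUFFER:
        try:
            url = CRAWL_BUFFER.pop(0)
        except IndexError:
            break                                # another thread emptied the buffer first
        crawler.crawl(make_url_obj(url))         # make_url_obj is hypothetical, see note above

threads = [threading.Thread(target=worker, args=(crawler,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()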
