Web Crawler not working with Python - python

I'm having issues with a simple web crawler, when I run the following script, it is not iterating through the sites and it does not give me any results.
This is what I get:
1 Visiting: https://www.mongodb.com/
Word never found
Process finished with exit code 0
Any tips as why this is not working correctly? I'm using the following example (http://www.netinstructions.com/how-to-make-a-web-crawler-in-under-50-lines-of-python-code/)
Here is the code:
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
class LinkParser(HTMLParser):
# This is a function that HTMLParser normally has
# but we are adding some functionality to it
def handle_starttag(self, tag, attrs):
""" We are looking for the begining of a link.
Links normally look
like """
if tag == 'a':
for (key,value) in attrs:
if key == 'href':
# We are grabbing the new URL. We are also adding the
# base URL to it. For example:
# www.netinstructions.com is the base and
# somepage.html is the new URL (a relative URL)
#
# We combine a relative URL with the base URL to create
# an absolute URL like:
# www.netinstructions.com/somepage.html
newUrl = parse.urljoin(self.baseUrl, value)
# And add it to our colection of links:
self.links = self.links + [newUrl]
def getLinks(self, url):
self.links = []
# Remember the base URL which will be important when creating
# absolute URLs
self.baseUrl = url
# Use the urlopen function from the standard Python 3 library
response = urlopen(url)
# Make sure that we are looking at HTML and not other things that
# are floating around on the internet (such as
# JavaScript files, CSS, or .PDFs for example)
if response.getheader('Content-Type') == 'text/html':
htmlBytes = response.read()
# Note that feed() handles Strings well, but not bytes
# (A change from Python 2.x to Python 3.x)
htmlString = htmlBytes.decode("utf-8")
self.feed(htmlString)
return htmlString, self.links
else:
return "", []
# And finally here is our spider. It takes in an URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):
pagesToVisit = [url]
numberVisited = 0
foundWord = False
# The main loop. Create a LinkParser and get all the links on the page.
# Also search the page for the word or string
# In our getLinks function we return the web page
# (this is useful for searching for the word)
# and we return a set of links from that web page
# (this is useful for where to go next)
while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
numberVisited = numberVisited +1
# Start from the beginning of our collection of pages to visit:
url = pagesToVisit[0]
pagesToVisit = pagesToVisit[1:]
try:
print(numberVisited, "Visiting:", url)
parser = LinkParser()
data, links = parser.getLinks(url)
if data.find(word)>-1:
foundWord = True
# Add the pages that we visited to the end of our collection
# of pages to visit:
pagesToVisit = pagesToVisit + links
print(" **Success!**")
except:
print(" **Failed!**")
if foundWord:
print("The word", word, "was found at", url)
else:
print("Word never found")
if __name__ == "__main__":
spider("https://www.mongodb.com/", "MongoDB" ,400)

First, edit the content-type checker line to:
if response.getheader('Content-Type') == 'text/html; charset=utf-8':
as suggested by #glibdud.
If you would like your program to check all links until maxPages is reached or pagesTovisit = [], simply remove the and condition for found word on the line:
while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
to:
while numberVisited < maxPages and pagesToVisit != []:

Related

Unable to scrape emails from some websites maybe due to r.html.render() not working properly

I have some website links as samples for extracting any email available in their internal sites.
However, even I am trying to render any JS driven website via r.html.render() within scrape_email(url) method, some of the websites like arken.trygge.dk, gronnebakken.dk, dagtilbud.ballerup.dk/boernehuset-bispevangen etc. does not return any email which might be due to rendering issue.
I have attached the sample file for convenience of running
I dont want to use selenium as there can be thousands or millions of webpage I want to extract emails from.
So far this is my code:
import os
import time
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
import pandas as pd
from gtts import gTTS
import winsound
# For convenience of seeing console output in the script
pd.options.display.max_colwidth = 180
#Get the start time of script execution
startTime = time.time()
#Paste file name inside ''
input_file_name = 'sample'
input_df = pd.read_excel(input_file_name+'.xlsx', engine='openpyxl')
input_df = input_df.dropna(how='all')
internal_urls = set()
emails = set()
total_urls_visited = 0
def is_valid(url):
"""
Checks whether `url` is a valid URL.
"""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
def get_internal_links(url):
"""
Returns all URLs that is found on `url` in which it belongs to the same website
"""
# all URLs of `url`
urls = set()
# domain name of the URL without the protocol
domain_name = urlparse(url).netloc
print("Domain name -- ",domain_name)
try:
soup = BeautifulSoup(requests.get(url, timeout=5).content, "html.parser")
for a_tag in soup.findAll("a"):
href = a_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if parsed_href.netloc != domain_name:
# if the link is not of same domain pass
continue
if parsed_href.path.endswith((".csv",".xlsx",".txt", ".pdf", ".mp3", ".png", ".jpg", ".jpeg", ".svg", ".mov", ".js",".gif",".mp4",".avi",".flv",".wav")):
# Overlook site images,pdf and other file rather than webpages
continue
print(f"Internal link: {href}")
urls.add(href)
internal_urls.add(href)
return urls
except requests.exceptions.Timeout as err:
print("The website is not loading within 5 seconds... Continuing crawling the next one")
pass
except:
print("The website is unavailable. Continuing crawling the next one")
pass
def crawl(url, max_urls=30):
"""
Crawls a web page and extracts all links.
You'll find all links in `external_urls` and `internal_urls` global set variables.
params:
max_urls (int): number of max urls to crawl, default is 30.
"""
global total_urls_visited
total_urls_visited += 1
print(f"Crawling: {url}")
links = get_internal_links(url)
# for link in links:
# if total_urls_visited > max_urls:
# break
# crawl(link, max_urls=max_urls)
def scrape_email(url):
EMAIL_REGEX = r'\b[A-Za-z0-9._%+-]+#[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
# EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")#(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
try:
# initiate an HTTP session
session = HTMLSession()
# get the HTTP Response
r = session.get(url, timeout=10)
# for JAVA-Script driven websites
r.html.render()
single_url_email = []
for re_match in re.finditer(EMAIL_REGEX, r.html.raw_html.decode()):
single_url_email.append(re_match.group().lower())
r.session.close()
return set(single_url_email)
except:
pass
def crawl_website_scrape_email(url, max_internal_url_no=20):
crawl(url,max_urls=max_internal_url_no)
each_url_emails = []
global internal_urls
global emails
for each_url in internal_urls:
each_url_emails.append(scrape_email(each_url))
URL_WITH_EMAILS={'main_url': url, 'emails':each_url_emails}
emails = {}
internal_urls = set()
return URL_WITH_EMAILS
def list_check(emails_list, email_match):
match_indexes = [i for i, s in enumerate(emails_list) if email_match in s]
return [emails_list[index] for index in match_indexes]
URL_WITH_EMAILS_LIST = [crawl_website_scrape_email(x) for x in input_df['Website'].values]
URL_WITH_EMAILS_DF = pd.DataFrame(data = URL_WITH_EMAILS_LIST)
URL_WITH_EMAILS_DF.to_excel(f"{input_file_name}_email-output.xlsx", index=False)
How can I solve the issue of not being able to scrape email from some of those above-mentioned and similar type of websites?
Is there also any way to detect and print strings if my get request is refused by bot detector or related protocols?
Also how can I make this code more robust?
Thank you in advance

Web Scraping Python BeautifulSoup get elements for each webpage in website

I am in my infancy of python coding. What I am trying to do is build a web scraper which gets all the links from a website and then returns the elements form each site. The code I started with is from https://www.thepythoncode.com/article/extract-all-website-links-python
this works really nicely to get all the links from a website.
As I am only interested in the internal links I have added some extra code to try and get the elements (tile, h1, some other bits which I haven't added yet) to the code. The issue I am running into is I think the href returns an email, then the code tries and extracts the elements from this so obviously this bugs out. I have tried to avoid it picking the email (which i also thought would be in the def_valid function) but i am obviously missing something. Any help would be really appreciated.
import re
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW
internal_urls = set()
external_urls = set()
title_urls = set()
def is_valid(url):
"""
Checks whether `url` is a valid URL.
"""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
def get_all_website_links(url):
"""
Returns all URLs that is found on `url` in which it belongs to the same website
"""
# all URLs of `url`
urls = set()
# domain name of the URL without the protocol
domain_name = urlparse(url).netloc
soup = BeautifulSoup(requests.get(url).content, "html.parser")
# is_internal_link == True:
title_check = soup.find_all('title')
if title_check != " " or title_check != None:
get_title(url)
get_heading_tags(url)
for a_tag in soup.findAll("a"):
# is_internal_link = False
href = a_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
#print(f"{GRAY}[!] External link: {href}{RESET}")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET}")
if re.search('#',href) == True:
continue
urls.add(href)
internal_urls.add(href)
return urls
# number of urls visited so far will be stored here
total_urls_visited = 0
def get_title(url): # domain name of the URL without the protocol
domain_name = urlparse(url).netloc
soup = BeautifulSoup(requests.get(url).content, "html.parser")
#print("Title of the website is : ")
for title in soup.find_all('title'):
if title == "" and title == None:
continue
title_text = title.get_text()
title_urls.add(title_text)
print(title_text)
print((len(title_text)))
def get_heading_tags(url):
soup = BeautifulSoup(requests.get(url).content, "html.parser")
heading_tags = ['h1', 'h2', 'h3']
i = 0
for tags in soup.find_all(heading_tags):
if tags == " " or tags == None:
continue
tags_text = tags.get_text()
letters_in_tags = len(tags_text) - tags_text.count(" ")
i += 1
print(f'{tags.name} {i} -> {tags_text} -> Length ->{letters_in_tags} ')
def crawl(url, max_urls=80):
"""
Crawls a web page and extracts all links.
You'll find all links in `external_urls` and `internal_urls` global set variables.
params:
max_urls (int): number of max urls to crawl, default is 30.
"""
global total_urls_visited
total_urls_visited += 1
print(f"{YELLOW}[*] Crawling: {url}{RESET}")
links = get_all_website_links(url)
for link in links:
if re.search('#',link) != True:
if total_urls_visited > max_urls:
break
crawl(link, max_urls=max_urls)
if __name__ == "__main__":
crawl("https://website.com/") #put website here.
print("[+] Total Internal links:", len(internal_urls))
print("[+] Total External links:", len(external_urls))
print("[+] Total URLs:", len(external_urls) + len(internal_urls))
for link in links:
if re.search('#',link) != True:
if total_urls_visited > max_urls:
break
crawl(link, max_urls=max_urls)
You are only checking if # is present in the link (and that too not Correct!) to know if it's an email or not. Also note that links can also have # in them.
Basically, emails inside <a> will be of the form:
So to differentiate emails from links, you can use the below check.
for link in links:
if not link.startswith('mailto:'):
if total_urls_visited > max_urls:
break
crawl(link, max_urls=max_urls)
This will ignore all the emails and only scrape links.

Web Crawler problems in Python

I've been working on creating a single-threaded Web Crawler in Python that will group the assets of each page and output a JSON array of the form:
[
{
url: 'http://url.com/',
assets: [
'http://url.com/imgs/img1.jpg',
'http://url.com/css/style.css',
]
},
{
url: 'http://url.com/services',
assets: [
'http://url.com/imgs/services.jpg',
'http://url.com/css/style.css',
]
},
...
]
To quickly summarise the functionality:
Using BeautifulSoup to parse HTML and extract links
Using urlparse to:
Build absolute urls from relative urls
Check if url is local using netloc
Add visited urls/assets to dictionaries via their paths
Using robotparser to check if I can crawl each page I find by looking at the robots.txt file
In order to do this I pass the root of the website to the crawler, i.e. ./crawl.py http://sitename.com/ (including the final slash)
I've made the assumption that if the url ends in .html or the resource path doesn't contain a . that I will be able to crawl it as a HTML page.
I've been having some problem with a few things including:
locales - Is there a smart way to detect and avoid crawling the same pages in different locales?
When trying to crawl particular sites I'll end up with a maximum recursion depth exceeded message from Python.
I tried to avoid this by checking if a links rel attribute contained alternate but this doesn't seem to have a big impact.
An example of this is crawling http://url.com/ but also having to crawl http://url.com/en-us, http://url.com/en-au, etc.
angular/react - Is it possible to crawl sites that are using angular/react/similar frameworks?
I've been trying to search for useful resources to use to help me in this section but so far haven't found anything concrete.
Any info/feedback is greatly appreciated
Code below:
#!/usr/bin/python
import sys
import json
import urlparse
import robotparser
import urllib2
from queue import Queue
from bs4 import BeautifulSoup
class Crawler:
def gethtml(self, url):
try:
return urllib2.urlopen(url)
except urllib2.HTTPError, e:
print 'We failed with error code - %s.' % e.code
if e.code == 404:
print('404 File Not Found: ' + url)
else:
print('e code not 404')
return None
def __init__(self):
url = sys.argv[1]
sys.setrecursionlimit(100000)
parsedurl = urlparse.urlparse(url)
print('Crawling from URL: ' + url)
self.parser = robotparser.RobotFileParser()
self.parser.set_url(url + 'robots.txt')
self.parser.read()
if parsedurl.netloc.startswith('www.'): # compare netlocs without www.
self.netloc = parsedurl.netloc[4:]
else:
self.netloc = parsedurl.netloc
html = self.gethtml(url)
if html is not None:
self.visited = {}
self.current = {}
self.currentassets = {}
self.output = []
self.queue = Queue()
if len(parsedurl.path) < 1:
self.visited['/index.html'] = True
self.crawlhtml(url, html)
else:
print("Sorry, couldn't find HTML at that URL!")
def isabsolute(self, url):
return bool(urlparse.urlparse(url).netloc)
def checkifhtml(self, url):
parsedurl = urlparse.urlparse(url)
path = parsedurl.path
if url.endswith('.html') or '.' not in path: # path is a html file
if not self.visited.has_key(path):
self.queue.enqueue(url)
return True
else:
return False
def getasseturl(self, current_url, url):
if not self.isabsolute(url): # make our relative url absolute
url = urlparse.urljoin(current_url, url)
parsedurl = urlparse.urlparse(url)
path = parsedurl.path
netloc = parsedurl.netloc
local = False
if netloc.startswith('www.'): # check if is local url
netloc = netloc.replace('www.', '', 1)
if netloc == self.netloc:
local = True
if self.currentassets.get(path) is None:
self.currentassets[path] = True
if local:
if self.checkifhtml(url) is False:
self.current['assets'].append(url)
def checkqueue(self):
print('Checking queue. Queue Size: ' + str(self.queue.size()))
if self.queue.size() == 0:
print('\n------------------------------------------------------\n')
print(json.dumps(self.output, indent=4))
print('\n------------------------------------------------------\n')
print(self.visited)
else:
url = self.queue.dequeue()
parsedurl = urlparse.urlparse(url)
path = parsedurl.path
if self.visited.get(path) is None:
self.visited[path] = True
html = self.gethtml(url)
if html is not None:
self.crawlhtml(url, html)
else:
self.checkqueue()
else:
self.checkqueue()
def crawlhtml(self, url, html):
print('---------------------------------------\nLooking at url: ' + url)
if self.parser.can_fetch('*', url):
self.current['url'] = url
self.current['assets'] = []
parsedhtml = BeautifulSoup(html, 'lxml') # use lxml for speed
for link in parsedhtml.find_all(['a', 'link', 'area', 'base', 'image']):
if link.get('href') is not None:
if link.get('rel') is None:
self.getasseturl(url, link.get('href'))
else:
if not 'alternate' in link.get('rel'):
self.getasseturl(url, link.get('href'))
for link in parsedhtml.find_all(['script', 'img', 'frame', 'iframe', 'input', 'audio', 'embed', 'source', 'video']):
if link.get('src') is not None:
self.getasseturl(url, link.get('src'))
self.output.append(self.current)
self.current = {}
self.currentassets = {}
self.checkqueue()
c = Crawler()

Recursive function gives no output

I'm scraping all the URL of my domain with recursive function.
But it outputs nothing, without any error.
#usr/bin/python
from bs4 import BeautifulSoup
import requests
import tldextract
def scrape(url):
for links in url:
main_domain = tldextract.extract(links)
r = requests.get(links)
data = r.text
soup = BeautifulSoup(data)
for href in soup.find_all('a'):
href = href.get('href')
if not href:
continue
link_domain = tldextract.extract(href)
if link_domain.domain == main_domain.domain :
problem.append(href)
elif not href == '#' and link_domain.tld == '':
new = 'http://www.'+ main_domain.domain + '.' + main_domain.tld + '/' + href
problem.append(new)
return len(problem)
return scrape(problem)
problem = ["http://xyzdomain.com"]
print(scrape(problem))
When I create a new list, it works, but I don't want to make a list every time for every loop.
You need to structure your code so that it meets the pattern for recursion as your current code doesn't - you also should not call variables the same name as libraries, e.g. href = href.get() because this will usually stop the library working as it becomes the variable, your code as it currently is will only ever return the len() as this return is unconditionally reached before: return scrap(problem).:
def Recursive(Factorable_problem)
if Factorable_problem is Simplest_Case:
return AnswerToSimplestCase
else:
return Rule_For_Generating_From_Simpler_Case(Recursive(Simpler_Case))
for example:
def Factorial(n):
""" Recursively Generate Factorials """
if n < 2:
return 1
else:
return n * Factorial(n-1)
Hello I've made a none recursive version of this that appears to get all the links on the same domain.
The code below I've tested using the problem included in the code. When I'd solved the problems with the recursive version the next problem was hitting the recursion depth limit so I rewrote it so it ran in an iterative fashion, the code and result below:
from bs4 import BeautifulSoup
import requests
import tldextract
def print_domain_info(d):
print "Main Domain:{0} \nSub Domain:{1} \nSuffix:{2}".format(d.domain,d.subdomain,d.suffix)
SEARCHED_URLS = []
problem = [ "http://Noelkd.neocities.org/", "http://youpi.neocities.org/"]
while problem:
# Get a link from the stack of links
link = problem.pop()
# Check we haven't been to this address before
if link in SEARCHED_URLS:
continue
# We don't want to come back here again after this point
SEARCHED_URLS.append(link)
# Try and get the website
try:
req = requests.get(link)
except:
# If its not working i don't care for it
print "borked website found: {0}".format(link)
continue
# Now we get to this point worth printing something
print "Trying to parse:{0}".format(link)
print "Status Code:{0} Thats: {1}".format(req.status_code, "A-OK" if req.status_code == 200 else "SOMTHINGS UP" )
# Get the domain info
dInfo = tldextract.extract(link)
print_domain_info(dInfo)
# I like utf-8
data = req.text.encode("utf-8")
print "Lenght Of Data Retrived:{0}".format(len(data)) # More info
soup = BeautifulSoup(data) # This was here before so i left it.
print "Found {0} link{1}".format(len(soup.find_all('a')),"s" if len(soup.find_all('a')) > 1 else "")
FOUND_THIS_ITERATION = [] # Getting the same links over and over was boring
found_links = [x for x in soup.find_all('a') if x.get('href') not in SEARCHED_URLS] # Find me all the links i don't got
for href in found_links:
href = href.get('href') # You wrote this seems to work well
if not href:
continue
link_domain = tldextract.extract(href)
if link_domain.domain == dInfo.domain: # JUST FINDING STUFF ON SAME DOMAIN RIGHT?!
if href not in FOUND_THIS_ITERATION: # I'ma check you out next time
print "Check out this link: {0}".format(href)
print_domain_info(link_domain)
FOUND_THIS_ITERATION.append(href)
problem.append(href)
else: # I got you already
print "DUPE LINK!"
else:
print "Not on same domain moving on"
# Count down
print "We have {0} more sites to search".format(len(problem))
if problem:
continue
else:
print "Its been fun"
print "Lets see the URLS we've visited:"
for url in SEARCHED_URLS:
print url
Which prints, after a lot of other logging loads of neocities websites!
What's happening is the script is popping a value of the list of websites yet to visit, it then gets all the links on the page which are on the same domain. If those links are to pages we haven't visited we add the link to the list of links to be visited. After we do that we pop the next page and do the same thing again until there are no pages left to visit.
Think this is what your looking for, get back to us in the comments if this doesn't work in the way that you want or if anyone can improve please leave a comment.

Should not visit the same url

I am new to python, and i am developing a web crawler below is the program which get the links from given url, but the problem is i dont want it to visit the same url which is already visited. please help me.
import re
import urllib.request
import sqlite3
db = sqlite3.connect('test2.db')
db.row_factory = sqlite3.Row
db.execute('drop table if exists test')
db.execute('create table test(id INTEGER PRIMARY KEY,url text)')
#linksList = []
#module to vsit the given url and get the all links in that page
def get_links(urlparse):
try:
if urlparse.find('.msi') ==-1: #check whether the url contains .msi extensions
htmlSource = urllib.request.urlopen(urlparse).read().decode("iso-8859-1")
#parsing htmlSource and finding all anchor tags
linksList = re.findall('<a href=(.*?)>.*?</a>',htmlSource) #returns href and other attributes of a tag
for link in linksList:
start_quote = link.find('"') # setting start point in the link
end_quote = link.find('"', start_quote + 1) #setting end point in the link
url = link[start_quote + 1:end_quote] # get the string between start_quote and end_quote
def concate(url): #since few href may return only /contact or /about so concatenating its baseurl
if url.find('http://'):
url = (urlparse) + url
return url
else:
return url
url_after_concate = concate(url)
# linksList.append(url_after_concate)
try:
if url_after_concate.find('.tar.bz') == -1: # skipping links which containts link to some softwares or downloads page
db.execute('insert or ignore into test(url) values (?)', [url_after_concate])
except:
print("insertion failed")
else:
return True
except:
print("failed")
get_links('http://www.python.org')
cursor = db.execute('select * from test')
for row in cursor: # retrieve the links stored in database
print (row['id'],row['url'])
urlparse = row['url']
# print(linksList)
# if urlparse in linksList == -1:
try:
get_links(urlparse) # again parse the link from database
except:
print ("url error")
Please suggest me the way how to solve the problem.
You should have a list of 'visited' pages. When you come to request the next url you can check whether the list already contains the url and if so skip it. I'm not a python programmer so here's some peusdo-code
Create listOfVisitedUrls
...
Start Loop
Get nextUrl
If nextUrl IsNotIn listOfVisitedUrls Then
Request nextUrl
Add nextUrl to listOfVisitedUrls
End If
Loop
You can use the following code:
import re
from urllib import urlopen
# Since few href may return only /contact or /about, concatenate to baseurl.
def concat(url, baseurl):
if url.find('http://'):
url = baseurl + url
return url
else:
return url
def get_links(baseurl):
resulting_urls = set()
try:
# Check whether the url contains .msi extensions.
if baseurl.find('.msi') == -1:
# Parse htmlSource and find all anchor tags.
htmlSource = urlopen(baseurl).read()
htmlSource = htmlSource.decode("iso-8859-1")
# Returns href and other attributes of a tag.
linksList = re.findall('<a href=(.*?)>.*?</a>',htmlSource)
for link in linksList:
# Setting start and end points in the link.
start_quote = link.find('"')
end_quote = link.find('"', start_quote + 1)
# Get the string between start_quote and end_quote.
url = link[start_quote + 1:end_quote]
url_after_concat = concat(url, baseurl)
resulting_urls.add(url_after_concat)
else:
return True
except:
print("failed")
return resulting_urls
get_links('http://www.python.org')
It will return a set() containing unique URLs for your baseurl; for `http://www.python.org', you should get:
set([u'http://www.python.org/download/',
u'http://docs.python.org/',
u'http://www.python.org#left-hand-navigation',
u'http://wiki.python.org/moin/PyQt',
u'http://wiki.python.org/moin/DatabaseProgramming/',
u'http://roundup.sourceforge.net/',
u'http://www.python.org/ftp/python/3.2.3/Python-3.2.3.tar.bz2',
u'http://www.python.org/about/website',
u'http://www.python.org/about/quotes',
u'http://www.python.org/community/jobs/',
u'http://www.python.org/psf/donations/',
u'http://www.python.org/about/help/',
u'http://wiki.python.org/moin/CgiScripts',
u'http://www.zope.org/',
u'http://www.pygame.org/news.html',
u'http://pypi.python.org/pypi',
u'http://wiki.python.org/moin/Python2orPython3',
u'http://www.python.org/download/releases/2.7.3/',
u'http://www.python.org/ftp/python/3.2.3/python-3.2.3.msi',
u'http://www.python.org/community/',
u'http://www.python.org/ftp/python/2.7.3/Python-2.7.3.tar.bz2',
u'http://wiki.python.org/moin/WebProgramming',
u'http://www.openbookproject.net/pybiblio/',
u'http://twistedmatrix.com/trac/',
u'http://wiki.python.org/moin/IntegratedDevelopmentEnvironments',
u'http://www.pentangle.net/python/handbook/',
u'http://wiki.python.org/moin/TkInter',
u'http://www.vrplumber.com/py3d.py',
u'http://sourceforge.net/projects/mysql-python',
u'http://wiki.python.org/moin/GuiProgramming',
u'http://www.python.org/about/',
u'http://www.edgewall.com/trac/',
u'http://osl.iu.edu/~lums/swc/',
u'http://www.python.org/community/merchandise/',
u"http://www.python.org'/psf/",
u'http://wiki.python.org/moin/WxPython',
u'http://docs.python.org/3.2/',
u'http://www.python.org#content-body',
u'http://www.python.org/getit/',
u'http://www.python.org/news/',
u'http://www.python.org/search',
u'http://www.python.org/community/sigs/current/edu-sig',
u'http://www.python.org/about/legal',
u'http://www.timparkin.co.uk/',
u'http://www.python.org/about/apps',
u'http://www.turbogears.org/',
u'http://www.egenix.com/files/python/mxODBC.html',
u'http://docs.python.org/devguide/',
u'http://docs.python.org/howto/sockets.html',
u'http://www.djangoproject.com/',
u'http://buildbot.net/trac',
u'http://www.python.org/psf/',
u'http://www.python.org/doc/',
u'http://wiki.python.org/moin/Languages',
u'http://www.xs4all.com/',
u'http://www.python.org/',
u'http://wiki.python.org/moin/NumericAndScientific',
u'http://www.python.org/channews.rdf',
u'http://www.alobbs.com/pykyra',
u'http://wiki.python.org/moin/PythonXml',
u'http://wiki.python.org/moin/PyGtk',
u'http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi',
u'http://www.python.org/download/releases/3.2.3/',
u'http://www.python.org/3kpoll'])
Hope that helps.

Categories

Resources