I am in my infancy of python coding. What I am trying to do is build a web scraper which gets all the links from a website and then returns the elements from each site. The code I started with is from https://www.thepythoncode.com/article/extract-all-website-links-python
this works really nicely to get all the links from a website.
As I am only interested in the internal links, I have added some extra code to try and extract the elements (title, h1, and some other bits which I haven't added yet). The issue I am running into is that I think the href sometimes returns an email address; the code then tries to extract the elements from that address, so it obviously crashes. I have tried to avoid it picking up the email (which I also thought the `is_valid` function would handle), but I am obviously missing something. Any help would be really appreciated.
import re
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW
internal_urls = set()
external_urls = set()
title_urls = set()
def is_valid(url):
    """Return True when `url` parses to both a scheme and a network location."""
    pieces = urlparse(url)
    return all((pieces.scheme, pieces.netloc))
def get_all_website_links(url):
    """
    Return the set of internal URLs found on `url`.

    Side effects: prints each new internal link, records internal links in
    the global `internal_urls` set and external links in `external_urls`,
    and runs `get_title`/`get_heading_tags` on pages that have a <title>.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # Only scrape title/headings when the page actually has a <title>.
    # (The old `title_check != " " or title_check != None` was always True,
    # because at least one side of the `or` holds for any value.)
    if soup.find("title") is not None:
        get_title(url)
        get_heading_tags(url)
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # empty or missing href attribute
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # BUG FIX: skip mailto:, tel:, javascript: and other non-HTTP
        # schemes -- mailto: links are what made the element scraping
        # crash on email addresses.
        if parsed_href.scheme not in ("http", "https"):
            continue
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        # NOTE: the old `re.search('#', href) == True` never fired, since
        # re.search returns a match object or None -- never True.  The
        # fragment was already stripped above, so no check is needed here.
        urls.add(href)
        internal_urls.add(href)
    return urls
# number of urls visited so far will be stored here
total_urls_visited = 0
def get_title(url):
    """Print and record the text of every <title> tag on `url`.

    Each non-empty title is added to the global `title_urls` set, then the
    title and its length are printed.
    """
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for title in soup.find_all('title'):
        title_text = title.get_text()
        # Skip empty/whitespace-only titles.  (The old check
        # `title == "" and title == None` could never be True: nothing is
        # equal to both, and `title` is a Tag object anyway.)
        if not title_text.strip():
            continue
        title_urls.add(title_text)
        print(title_text)
        print(len(title_text))
def get_heading_tags(url):
    """Print every h1/h2/h3 on `url` with a running index and its non-space length."""
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    heading_tags = ['h1', 'h2', 'h3']
    i = 0
    for tags in soup.find_all(heading_tags):
        tags_text = tags.get_text()
        # Skip headings with no visible text.  (The old check
        # `tags == " " or tags == None` compared the Tag object itself
        # and was never True.)
        if not tags_text.strip():
            continue
        letters_in_tags = len(tags_text) - tags_text.count(" ")
        i += 1
        print(f'{tags.name} {i} -> {tags_text} -> Length ->{letters_in_tags} ')
def crawl(url, max_urls=80):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
    max_urls (int): number of max urls to crawl, default is 80.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET}")
    links = get_all_website_links(url)
    for link in links:
        # Skip mailto: and fragment-only links.  (The old test
        # `re.search('#', link) != True` was always True because
        # re.search returns a match object or None -- never True --
        # so nothing was ever filtered out.)
        if link.startswith("mailto:") or "#" in link:
            continue
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
if __name__ == "__main__":
    # Entry point: seed the crawler with the site to scan, then report totals.
    crawl("https://website.com/") #put website here.
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
for link in links:
if re.search('#',link) != True:
if total_urls_visited > max_urls:
break
crawl(link, max_urls=max_urls)
You are only checking whether `#` is present in the link to decide if it's an email — and even that check is incorrect, because `re.search` returns a match object or `None`, never `True`. Also note that ordinary links can legitimately contain `#`.
Basically, emails inside an `<a>` tag will be of the form `<a href="mailto:someone@example.com">` — the href starts with the `mailto:` scheme.
So to differentiate emails from links, you can use the below check.
for link in links:
if not link.startswith('mailto:'):
if total_urls_visited > max_urls:
break
crawl(link, max_urls=max_urls)
This will ignore all the emails and only scrape links.
Related
I have some website links as samples for extracting any email available in their internal sites.
However, even I am trying to render any JS driven website via r.html.render() within scrape_email(url) method, some of the websites like arken.trygge.dk, gronnebakken.dk, dagtilbud.ballerup.dk/boernehuset-bispevangen etc. does not return any email which might be due to rendering issue.
I have attached the sample file for convenience of running
I dont want to use selenium as there can be thousands or millions of webpage I want to extract emails from.
So far this is my code:
import os
import time
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
import pandas as pd
from gtts import gTTS
import winsound
# For convenience of seeing console output in the script
pd.options.display.max_colwidth = 180
#Get the start time of script execution
startTime = time.time()
#Paste file name inside ''
input_file_name = 'sample'
input_df = pd.read_excel(input_file_name+'.xlsx', engine='openpyxl')
input_df = input_df.dropna(how='all')
internal_urls = set()
emails = set()
total_urls_visited = 0
def is_valid(url):
    """A URL counts as valid when it has a non-empty scheme and host."""
    result = urlparse(url)
    if not result.netloc:
        return False
    return bool(result.scheme)
def get_internal_links(url):
    """
    Return the set of same-domain URLs found on `url`.

    Newly seen links are also added to the global `internal_urls` set.
    BUG FIX: always returns a set (empty on failure) -- the original
    returned None after an exception, which would crash any caller that
    iterates the result.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    print("Domain name -- ", domain_name)
    try:
        soup = BeautifulSoup(requests.get(url, timeout=5).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if not href:
                # href empty tag
                continue
            # join the URL if it's relative (not absolute link)
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            if parsed_href.netloc != domain_name:
                # if the link is not of same domain pass
                continue
            if parsed_href.path.endswith((".csv",".xlsx",".txt", ".pdf", ".mp3", ".png", ".jpg", ".jpeg", ".svg", ".mov", ".js",".gif",".mp4",".avi",".flv",".wav")):
                # Overlook site images,pdf and other file rather than webpages
                continue
            print(f"Internal link: {href}")
            urls.add(href)
            internal_urls.add(href)
    except requests.exceptions.Timeout:
        print("The website is not loading within 5 seconds... Continuing crawling the next one")
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work.
        print("The website is unavailable. Continuing crawling the next one")
    return urls
def crawl(url, max_urls=30):
    """Fetch `url`, record its internal links, and bump the visit counter.

    Links are accumulated in the global `internal_urls` set by
    get_internal_links.  `max_urls` caps recursive crawling, which is
    currently disabled below.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"Crawling: {url}")
    links = get_internal_links(url)
    # Recursive crawling intentionally left disabled:
    # for link in links:
    #     if total_urls_visited > max_urls:
    #         break
    #     crawl(link, max_urls=max_urls)
def scrape_email(url):
    """
    Render `url` (including JavaScript) and return the set of lower-cased
    e-mail addresses found in the page source; empty set on failure.
    """
    # BUG FIX: the original pattern had '#' where an e-mail address has
    # '@', and `[A-Z|a-z]` matched a literal '|' -- it could never match
    # a real address.
    EMAIL_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    single_url_email = set()
    try:
        # initiate an HTTP session
        session = HTMLSession()
        # get the HTTP Response
        r = session.get(url, timeout=10)
        # for JAVA-Script driven websites
        r.html.render()
        for re_match in re.finditer(EMAIL_REGEX, r.html.raw_html.decode()):
            single_url_email.add(re_match.group().lower())
        r.session.close()
    except Exception:
        # BUG FIX: the original bare `except: pass` returned None, which
        # breaks callers that iterate the result; report and fall through
        # to return the (possibly empty) set instead.
        print(f"Could not scrape {url}")
    return single_url_email
def crawl_website_scrape_email(url, max_internal_url_no=20):
    """
    Crawl `url` (collecting up to `max_internal_url_no` internal pages),
    scrape e-mails from every collected page, and return
    {'main_url': url, 'emails': [set-per-page, ...]}.

    Resets the global `internal_urls`/`emails` state so the next website
    starts fresh.
    """
    crawl(url, max_urls=max_internal_url_no)
    each_url_emails = []
    global internal_urls
    global emails
    for each_url in internal_urls:
        found = scrape_email(each_url)
        # scrape_email may return None on failure; store an empty set instead
        each_url_emails.append(found if found is not None else set())
    URL_WITH_EMAILS = {'main_url': url, 'emails': each_url_emails}
    # BUG FIX: `emails` is declared as a set at module level; the original
    # reset it to a dict ({}) here.
    emails = set()
    internal_urls = set()
    return URL_WITH_EMAILS
def list_check(emails_list, email_match):
    """Return every element of `emails_list` that contains `email_match`."""
    return [entry for entry in emails_list if email_match in entry]
URL_WITH_EMAILS_LIST = [crawl_website_scrape_email(x) for x in input_df['Website'].values]
URL_WITH_EMAILS_DF = pd.DataFrame(data = URL_WITH_EMAILS_LIST)
URL_WITH_EMAILS_DF.to_excel(f"{input_file_name}_email-output.xlsx", index=False)
How can I solve the issue of not being able to scrape email from some of those above-mentioned and similar type of websites?
Is there also any way to detect and print strings if my get request is refused by bot detector or related protocols?
Also how can I make this code more robust?
Thank you in advance
I am building a scraper for Ebay. I am trying to figure out a way to manipulate the page number portion of the Ebay url to go to the next page until there are no more pages (If you were on page 2 the page number portion would look like "_pgn=2"). I noticed that if you put any number greater than the max number of pages a listing has, the page will reload to the last page, not give like a page doesn't exist error. (If a listing has 5 pages, then the last listing' page number url portion of _pgn=5 would rout to the same page if the page number url portion was _pgn=100). How can I implement a way to start at page one, get the html soup of the page, get the all relevant data I want from the soup, then load up the next page with the new page number and start the process again until there are not any new pages to scrape? I tried to get the number of results a listing has by using selenium xpath and math.ceil the quotient of number of results and 50 (default number of max listings per page) and use that quotient as my max_page, but I get errors saying the element doesn't exist even though it does. self.driver.findxpath('xpath').text. That 243 is what I am trying to get with the xpath.
class EbayScraper(object):
    """Scrapes eBay search results for a given item and buying type."""

    def __init__(self, item, buying_type):
        # Base search endpoint; the item keywords are appended after _nkw=.
        self.base_url = "https://www.ebay.com/sch/i.html?_nkw="
        # NOTE(review): needs chromedriver.exe beside the script and the
        # selenium `webdriver` import (not shown in this snippet).
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        self.item = item
        # eBay filter flags take the form "<name>=1" (e.g. "Auction=1").
        self.buying_type = buying_type + "=1"
        self.url_seperator = "&_sop=12&rt=nc&LH_"
        self.url_seperator2 = "&_pgn="
        # Page number is kept as a string so it can be concatenated below.
        self.page_num = "1"

    def getPageUrl(self):
        """Build the search URL for the current page and return its parsed soup."""
        # "Buy It Now" maps to eBay's LH_BIN=1 filter.
        if self.buying_type == "Buy It Now=1":
            self.buying_type = "BIN=1"
        self.item = self.item.replace(" ", "+")
        url = self.base_url + self.item + self.url_seperator + self.buying_type + self.url_seperator2 + self.page_num
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

    def getInfo(self, soup):
        """Print title/condition/price (and time left for auctions) for each listing."""
        for listing in soup.find_all("li", {"class": "s-item"}):
            raw = listing.find_all("a", {"class": "s-item__link"})
            # Only rows with a listing link are real items.
            if raw:
                raw_price = listing.find_all("span", {"class": "s-item__price"})[0]
                raw_title = listing.find_all("h3", {"class": "s-item__title"})[0]
                raw_link = listing.find_all("a", {"class": "s-item__link"})[0]
                raw_condition = listing.find_all("span", {"class": "SECONDARY_INFO"})[0]
                condition = raw_condition.text
                # NOTE(review): strips only the leading currency symbol;
                # prices like "$1,234.56" or ranges ("$10.00 to $20.00")
                # will make this float() raise -- confirm expected formats.
                price = float(raw_price.text[1:])
                title = raw_title.text
                link = raw_link['href']
                print(title)
                print(condition)
                print(price)
                # Auctions additionally show the time remaining.
                if self.buying_type != "BIN=1":
                    raw_time_left = listing.find_all("span", {"class": "s-item__time-left"})[0]
                    time_left = raw_time_left.text[:-4]
                    print(time_left)
                print(link)
                print('\n')
if __name__ == '__main__':
item = input("Item: ")
buying_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
instance = EbayScraper(item, buying_type)
page = instance.getPageUrl()
instance.getInfo(page)
if you want to iterate all pages and gather all results then your script needs to check if there is a next page after you visit the page
import requests
from bs4 import BeautifulSoup
class EbayScraper(object):
    """Scrapes eBay search results, paging through until no next page exists."""

    def __init__(self, item, buying_type):
        ...
        # Track which results page we are on.
        self.currentPage = 1

    def get_url(self, page=1):
        """Build the search URL for `page` (200 items per page)."""
        if self.buying_type == "Buy It Now=1":
            self.buying_type = "BIN=1"
        self.item = self.item.replace(" ", "+")
        # _ipg=200 means that expect a 200 items per page
        return '{}{}{}{}{}{}&_ipg=200'.format(
            self.base_url, self.item, self.url_seperator, self.buying_type,
            self.url_seperator2, page
        )

    def page_has_next(self, soup):
        """True when the pagination <ol> has a page after the selected one."""
        container = soup.find('ol', 'x-pagination__ol')
        currentPage = container.find('li', 'x-pagination__li--selected')
        next_sibling = currentPage.next_sibling
        if next_sibling is None:
            print(container)
        return next_sibling is not None

    def iterate_page(self):
        # Loop until there is no next page.
        while True:
            # BUG FIX: the original referenced the module-level `instance`
            # here instead of `self`, coupling the class to the script's
            # global and breaking any other instance.
            page = self.getPageUrl(self.currentPage)
            self.getInfo(page)
            if self.page_has_next(page) is False:
                break
            else:
                self.currentPage += 1

    def getPageUrl(self, pageNum):
        """Fetch the results page `pageNum` and return its parsed soup."""
        url = self.get_url(pageNum)
        print('page: ', url)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

    def getInfo(self, soup):
        ...
if __name__ == '__main__':
item = input("Item: ")
buying_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
instance = EbayScraper(item, buying_type)
instance.iterate_page()
the important functions here are page_has_next and iterate_page
page_has_next - a function that check if the pagination of the page has another li element next to the selected page. e.g < 1 2 3 > if we are on page 1 then it checks if there is 2 next -> something like this
iterate_page - a function that loop until there is no page_next
also note that you don't need selenium for this unless you need to mimic user clicks or need a browser to navigate.
I'm trying to write a script to find out the non-responsive links of a web-page in python. While trying, i find out that python doesn't support multi child nodes. Is it true? or we can access the multi child nodes.
Below is my code snippet:
import httplib2
import requests
from bs4 import BeautifulSoup, SoupStrainer
status = {}
response = {}
output = {}
def get_url_status(url, count):
    """Recursively fetch `url` to depth `count`, recording pages that answer
    HTTP 200 in the global `output` dict.

    NOTE(review): the recursion `return`s from inside the `for urllink`
    loop below, so only the first link of each page is followed -- the
    crawl walks a single left-most path instead of the whole link tree.
    """
    global links
    links = []
    print(url)
    print(count)
    if count == 0:
        return output
    else:
        # if url not in output.keys():
        headers = requests.utils.default_headers()
        # NOTE(review): `headers` is passed positionally, so requests.get
        # receives it as `params`, not as request headers.
        req = requests.get(url, headers)
        # NOTE(review): '200' in str(req) tests the Response repr
        # ("<Response [200]>"), not req.status_code == 200.
        if('200' in str(req)):
            # if url not in output.keys():
            output[url] = '200';
            for link in BeautifulSoup(req.content, parse_only=SoupStrainer('a')):
                if 'href' in str(link):
                    links.append(link.get('href'))
            # removing other non-mandotary links
            for link in links[:]:
                if "mi" not in link:
                    links.remove(link)
            # removing same url
            for link in links[:]:
                if link.rstrip('/') == url:
                    links.remove(link)
            # removing duplicate links
            links = list(dict.fromkeys(links))
            if len(links) > 0:
                for urllink in links:
                    # NOTE(review): `return` exits after the FIRST link; if a
                    # page has no links before count hits 0, this returns None.
                    return get_url_status(urllink, count-1)
result = get_url_status('https://www.mi.com/in', 5)
print(result)
In this code it's only traversing to only the left nodes of the child and skipping rest. something like this.
And the output is not satisfactory and very very less compared to real.
{'https://www.mi.com/in': '200', 'https://in.c.mi.com/': '200', 'https://in.c.mi.com/index.php': '200', 'https://in.c.mi.com/global/': '200', 'https://c.mi.com/index.php': '200'}
I know, i'm lacking at multiple locations but i've never done something of this scale and this is my first time. So please excuse if this is a novice question.
Note: I've used mi.com just for the reference.
At a glance, there's one obvious problem.
if len(links) > 0:
for urllink in links:
return get_url_status(urllink, count-1)
This snippet does not iterate over links. It has return in its iterative body which means it will only run for the first item in links, and immediately return it. There is another bug. The function returns just None instead of output if it encounters a page with no links before count reaches 0. Do the following instead.
if len(links):
for urllink in links:
get_url_status(urllink, count-1)
return output
And if('200' in str(req)) is not the right way to check the status code. It will check for a substring '200' in the body, instead of only checking the status code. It should be if req.status_code == 200.
Another thing is that the function only adds responsive links to output. If you want to check for non-responsive links, don't you have to add links that do not return the 200 status code?
import requests
from bs4 import BeautifulSoup, SoupStrainer
status = {}
response = {}
output = {}
def get_url_status(url, count):
    """Recursively fetch `url` and its links to depth `count`, recording
    pages that answer HTTP 200 in the global `output` dict, which is
    returned."""
    global links
    links = []
    headers = requests.utils.default_headers()
    # BUG FIX: requests.get(url, headers) passed the headers dict as the
    # positional `params` argument; it must be the `headers` keyword.
    req = requests.get(url, headers=headers)
    if req.status_code == 200:
        output[url] = '200'
        if count == 0:
            return output
        # BUG FIX: the parser is the second positional argument of
        # BeautifulSoup; `parser="html.parser"` was an unrecognized kwarg.
        for link in BeautifulSoup(req.content, "html.parser", parse_only=SoupStrainer('a')):
            if 'href' in str(link):
                links.append(link.get('href'))
        # removing other non-mandatory links.
        # BUG FIX: iterate over a copy (links[:]) -- removing items from
        # the list being iterated silently skips elements.
        for link in links[:]:
            if "mi" not in link:
                links.remove(link)
        # removing same url
        for link in links[:]:
            if link.rstrip('/') == url:
                links.remove(link)
        # removing duplicate links (dict preserves insertion order)
        links = list(dict.fromkeys(links))
        print(links)
        if len(links):
            for urllink in links:
                get_url_status(urllink, count-1)
    return output
result = get_url_status('https://www.mi.com/in', 1)
print(result)
I'm having issues with a simple web crawler, when I run the following script, it is not iterating through the sites and it does not give me any results.
This is what I get:
1 Visiting: https://www.mongodb.com/
Word never found
Process finished with exit code 0
Any tips as why this is not working correctly? I'm using the following example (http://www.netinstructions.com/how-to-make-a-web-crawler-in-under-50-lines-of-python-code/)
Here is the code:
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
class LinkParser(HTMLParser):
    """HTMLParser subclass that collects absolute link targets from a page.

    Call getLinks(url); it returns (html_text, links) where every href
    found in an <a> tag has been resolved against the page URL.
    """

    # This is a function that HTMLParser normally has
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        """Record the href of every <a> tag, resolved to an absolute URL."""
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # Combine a relative URL (e.g. somepage.html) with the
                    # base URL to create an absolute URL.
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        """Fetch `url`; return (html, links) for HTML pages, ("", []) otherwise."""
        self.links = []
        # Remember the base URL, needed to absolutize relative links.
        self.baseUrl = url
        response = urlopen(url)
        content_type = response.getheader('Content-Type') or ""
        # BUG FIX: the original required Content-Type == 'text/html', but
        # servers normally send 'text/html; charset=utf-8', so the equality
        # test always failed and no page was ever parsed ("Word never
        # found").  Match the media type prefix instead.
        if content_type.startswith('text/html'):
            htmlBytes = response.read()
            # feed() handles str, not bytes (a change from Python 2.x to 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        return "", []
# And finally here is our spider. It takes in an URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):
    """Breadth-first crawl starting at `url`, stopping as soon as `word`
    is found in a page's HTML or `maxPages` pages have been visited."""
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string.
    # NOTE(review): the `not foundWord` condition stops the crawl at the
    # first page containing the word; drop it to visit every page.
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited +1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            # getLinks returns the page HTML (for the word search) and the
            # set of links found on it (for where to go next).
            data, links = parser.getLinks(url)
            if data.find(word)>-1:
                foundWord = True
            # Add the pages that we visited to the end of our collection
            # of pages to visit:
            pagesToVisit = pagesToVisit + links
            print(" **Success!**")
        except:
            # NOTE(review): bare except hides every failure (bad URLs,
            # decode errors, even typos inside the try) behind one message.
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")
if __name__ == "__main__":
spider("https://www.mongodb.com/", "MongoDB" ,400)
First, edit the content-type checker line to:
if response.getheader('Content-Type') == 'text/html; charset=utf-8':
as suggested by #glibdud.
If you would like your program to check all links until maxPages is reached or pagesTovisit = [], simply remove the and condition for found word on the line:
while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
to:
while numberVisited < maxPages and pagesToVisit != []:
I am new to python, and i am developing a web crawler below is the program which get the links from given url, but the problem is i dont want it to visit the same url which is already visited. please help me.
import re
import urllib.request
import sqlite3
db = sqlite3.connect('test2.db')
db.row_factory = sqlite3.Row
db.execute('drop table if exists test')
db.execute('create table test(id INTEGER PRIMARY KEY,url text)')
#linksList = []
#module to vsit the given url and get the all links in that page
#module to vsit the given url and get the all links in that page
def get_links(urlparse):
    """Fetch the page at `urlparse` (a URL string, despite the name) and
    insert every link found on it into the `test` table."""
    try:
        if urlparse.find('.msi') ==-1: #check whether the url contains .msi extensions
            htmlSource = urllib.request.urlopen(urlparse).read().decode("iso-8859-1")
            #parsing htmlSource and finding all anchor tags
            linksList = re.findall('<a href=(.*?)>.*?</a>',htmlSource) #returns href and other attributes of a tag
            for link in linksList:
                start_quote = link.find('"') # setting start point in the link
                end_quote = link.find('"', start_quote + 1) #setting end point in the link
                url = link[start_quote + 1:end_quote] # get the string between start_quote and end_quote
                # NOTE(review): this nested function is re-defined on every
                # loop iteration; move it to module level.
                def concate(url): #since few href may return only /contact or /about so concatenating its baseurl
                    # NOTE(review): str.find returns -1 when absent and 0
                    # when the URL starts with 'http://', so this truthiness
                    # test also prefixes 'https://...' links with the base.
                    if url.find('http://'):
                        url = (urlparse) + url
                        return url
                    else:
                        return url
                url_after_concate = concate(url)
                # linksList.append(url_after_concate)
                try:
                    if url_after_concate.find('.tar.bz') == -1: # skipping links which containts link to some softwares or downloads page
                        db.execute('insert or ignore into test(url) values (?)', [url_after_concate])
                except:
                    print("insertion failed")
        else:
            return True
    except:
        print("failed")
cursor = db.execute('select * from test')
for row in cursor: # retrieve the links stored in database
print (row['id'],row['url'])
urlparse = row['url']
# print(linksList)
# if urlparse in linksList == -1:
try:
get_links(urlparse) # again parse the link from database
except:
print ("url error")
Please suggest me the way how to solve the problem.
You should have a list of 'visited' pages. When you come to request the next URL you can check whether the list already contains it and, if so, skip it. I'm not a Python programmer, so here's some pseudo-code:
Create listOfVisitedUrls
...
Start Loop
Get nextUrl
If nextUrl IsNotIn listOfVisitedUrls Then
Request nextUrl
Add nextUrl to listOfVisitedUrls
End If
Loop
You can use the following code:
import re
from urllib import urlopen
# Since few href may return only /contact or /about, concatenate to baseurl.
def concat(url, baseurl):
    """Return an absolute URL: relative hrefs (e.g. '/contact') are joined
    onto `baseurl`; already-absolute hrefs are returned unchanged."""
    # BUG FIX: the original tested `url.find('http://')`, which is truthy
    # for any URL that does not *start* with 'http://' -- so absolute
    # 'https://...' links were wrongly prefixed with baseurl.
    if url.startswith(('http://', 'https://')):
        return url
    return baseurl + url
def get_links(baseurl):
    """Collect the unique absolute URLs linked from `baseurl`.

    Returns a set() of URLs, or True when `baseurl` looks like an .msi
    download (nothing to parse).  NOTE(review): this is Python 2 code --
    `urlopen` is imported from `urllib` above and the sample output shows
    u'...' strings; under Python 3 the import must change.
    """
    resulting_urls = set()
    try:
        # Check whether the url contains .msi extensions.
        if baseurl.find('.msi') == -1:
            # Parse htmlSource and find all anchor tags.
            htmlSource = urlopen(baseurl).read()
            htmlSource = htmlSource.decode("iso-8859-1")
            # Returns href and other attributes of a tag.
            linksList = re.findall('<a href=(.*?)>.*?</a>',htmlSource)
            for link in linksList:
                # Setting start and end points in the link.
                start_quote = link.find('"')
                end_quote = link.find('"', start_quote + 1)
                # Get the string between start_quote and end_quote.
                url = link[start_quote + 1:end_quote]
                url_after_concat = concat(url, baseurl)
                # The set de-duplicates repeated links automatically.
                resulting_urls.add(url_after_concat)
        else:
            return True
    except:
        # NOTE(review): bare except hides every failure behind one message.
        print("failed")
    return resulting_urls
get_links('http://www.python.org')
It will return a set() containing unique URLs for your baseurl; for `http://www.python.org`, you should get:
set([u'http://www.python.org/download/',
u'http://docs.python.org/',
u'http://www.python.org#left-hand-navigation',
u'http://wiki.python.org/moin/PyQt',
u'http://wiki.python.org/moin/DatabaseProgramming/',
u'http://roundup.sourceforge.net/',
u'http://www.python.org/ftp/python/3.2.3/Python-3.2.3.tar.bz2',
u'http://www.python.org/about/website',
u'http://www.python.org/about/quotes',
u'http://www.python.org/community/jobs/',
u'http://www.python.org/psf/donations/',
u'http://www.python.org/about/help/',
u'http://wiki.python.org/moin/CgiScripts',
u'http://www.zope.org/',
u'http://www.pygame.org/news.html',
u'http://pypi.python.org/pypi',
u'http://wiki.python.org/moin/Python2orPython3',
u'http://www.python.org/download/releases/2.7.3/',
u'http://www.python.org/ftp/python/3.2.3/python-3.2.3.msi',
u'http://www.python.org/community/',
u'http://www.python.org/ftp/python/2.7.3/Python-2.7.3.tar.bz2',
u'http://wiki.python.org/moin/WebProgramming',
u'http://www.openbookproject.net/pybiblio/',
u'http://twistedmatrix.com/trac/',
u'http://wiki.python.org/moin/IntegratedDevelopmentEnvironments',
u'http://www.pentangle.net/python/handbook/',
u'http://wiki.python.org/moin/TkInter',
u'http://www.vrplumber.com/py3d.py',
u'http://sourceforge.net/projects/mysql-python',
u'http://wiki.python.org/moin/GuiProgramming',
u'http://www.python.org/about/',
u'http://www.edgewall.com/trac/',
u'http://osl.iu.edu/~lums/swc/',
u'http://www.python.org/community/merchandise/',
u"http://www.python.org'/psf/",
u'http://wiki.python.org/moin/WxPython',
u'http://docs.python.org/3.2/',
u'http://www.python.org#content-body',
u'http://www.python.org/getit/',
u'http://www.python.org/news/',
u'http://www.python.org/search',
u'http://www.python.org/community/sigs/current/edu-sig',
u'http://www.python.org/about/legal',
u'http://www.timparkin.co.uk/',
u'http://www.python.org/about/apps',
u'http://www.turbogears.org/',
u'http://www.egenix.com/files/python/mxODBC.html',
u'http://docs.python.org/devguide/',
u'http://docs.python.org/howto/sockets.html',
u'http://www.djangoproject.com/',
u'http://buildbot.net/trac',
u'http://www.python.org/psf/',
u'http://www.python.org/doc/',
u'http://wiki.python.org/moin/Languages',
u'http://www.xs4all.com/',
u'http://www.python.org/',
u'http://wiki.python.org/moin/NumericAndScientific',
u'http://www.python.org/channews.rdf',
u'http://www.alobbs.com/pykyra',
u'http://wiki.python.org/moin/PythonXml',
u'http://wiki.python.org/moin/PyGtk',
u'http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi',
u'http://www.python.org/download/releases/3.2.3/',
u'http://www.python.org/3kpoll'])
Hope that helps.