I have code which scrapes everything from a specific web page. I now want to build code which can give me specific details: for example, if I enter a style ID, it should give me the details related to it, or if I enter a category, it should give me all the items in that category with their details. My code is:
import requests, re
from bs4 import BeautifulSoup
url="http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r=requests.get(url)
soup=BeautifulSoup(r.content)
links=soup.find_all("a")
img=soup.find(itemprop="image")
g_d4=soup.find_all("ol", {"class":"breadcrumb"})
for item in g_d4:
links_2=soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
pattern_2=re.compile("clothing/(\w+)")
for link in links_2:
match_1=pattern_2.search(link["href"])
if match_1:
print ("Category:- " + match_1.group(1))
break
g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
try:
print ("\n\nBRAND:-" + item.contents[1].text)
except:
pass
try:
a_1=item.find("ol", {"class":"breadcrumb"})
a_2=a_1.text
print (a_2)
except:
pass
try:
print ("TYPE:-" + item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text+';')
except:
pass
try:
d2=item.find("div",{"class":"panel-body standard-p"})
d3=d2.text
p_id=re.findall(r'[0-9]{9}',d3)
id_2=p_id[0]
url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p='+str(id_2)+'&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
r_1= requests.get(url_1)
pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
product_ids = pattern.findall(str(r_1.content))
print ("DETAILS:- " + d3+';')
print ("\nStyle ID:- " + id_2+';')
print ("\nRecommended Product ID's:- ")
print (','.join(i for i in product_ids))
except:
pass
try:
print ("\nURL:-" + img["src"]+';')
except:
pass
try:
print ("\nFull Price:-" + item.find("span",{"class":"price-standard"}).text+';')
except:
pass
try:
print ("\nDiscounted Price:-" + item.find("span",{"class":"price-sales"}).text+';')
except:
pass
g_d2=soup.find_all("div", {"class":"color-scroll"})
pattern_1=re.compile("pid=(\w+)")
for item in g_d2:
links_1=soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
for link in links_1[1:]:
match=pattern_1.search(link["href"])
if match:
print ("\nProduct ID of other color:-")
print (match.group(1))
I added a dictionary called d:
import requests, re
from bs4 import BeautifulSoup
d={}
url="http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r=requests.get(url)
soup=BeautifulSoup(r.content)
links = soup.find_all("a")
d["links"] = []
d["links"].append(("href", [link.get("href") for link in links]))
d["links"].append(("class", [link.get("class") for link in links]))
img=soup.find(itemprop="image")
d["img"] = []
d["img"].append([("alt", img.get("alt")), ("src", img.get("src")), ("itemprop", img.get("itemprop")), ("class", img.get("class")[0])]) #You will have to put d["img"]["0"] instead of d["img"]["alt"]
g_d4=soup.find_all("ol", {"class":"breadcrumb"})
for item in g_d4:
links_2=soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
pattern_2=re.compile("clothing/(\w+)")
for link in links_2:
match_1=pattern_2.search(link["href"])
if match_1:
print ("Category:- " + match_1.group(1))
break
g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
try:
d["Brand"] = item.contents[1].text
print ("\n\nBRAND:-" + item.contents[1].text)
except:
pass
try:
a_1=item.find("ol", {"class":"breadcrumb"})
a_2=a_1.text
d["a_2"] = a_2
print (a_2)
except:
pass
try:
print ("TYPE:-" + item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text+';')
d["Type"] = item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text
except:
pass
try:
d2=item.find("div",{"class":"panel-body standard-p"})
d3=d2.text
p_id=re.findall(r'[0-9]{9}',d3)
id_2=p_id[0]
url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p='+str(id_2)+'&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
r_1= requests.get(url_1)
pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
product_ids = pattern.findall(str(r_1.content))
print ("DETAILS:- " + d3+';')
d["Details"] = d3.split(",")
print ("\nStyle ID:- " + id_2+';')
d["Style"] = ("ID", id_2)
print ("\nRecommended Product ID's:- ")
print (','.join(i for i in product_ids))
d["RecommendedProductIDs"] = [i for i in product_ids]
except:
pass
try:
print ("\nURL:-" + img["src"]+';')
except:
pass
try:
print ("\nFull Price:-" + item.find("span",{"class":"price-standard"}).text+';')
except:
pass
try:
print ("\nDiscounted Price:-" + item.find("span",{"class":"price-sales"}).text+';')
except:
pass
g_d2=soup.find_all("div", {"class":"color-scroll"})
pattern_1=re.compile("pid=(\w+)")
for item in g_d2:
links_1=soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
for link in links_1[1:]:
match=pattern_1.search(link["href"])
if match:
print ("\nProduct ID of other color:-")
print (match.group(1))
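To get to the original goal of looking things up by style ID or category, a small function over the dictionary d is enough once the scraping has run. The sketch below is only illustrative (Python 3): lookup and products are hypothetical names, and it assumes the category found earlier is also stored in the dictionary, e.g. d["Category"] = match_1.group(1), with one dictionary per scraped product page.
# Rough sketch of an interactive lookup over the dictionary d built above.
# Assumes one dictionary per scraped product, collected in a list `products`,
# and that the category is stored too, e.g. d["Category"] = match_1.group(1).
def lookup(products, query):
    query = query.strip().lower()
    results = []
    for d in products:
        style_id = d.get("Style", ("ID", ""))[1]
        category = d.get("Category", "")
        if query == style_id or query == category.lower():
            results.append(d)
    return results

products = [d]  # extend this list as you scrape more product pages
while True:
    query = input("Enter a style ID or category (blank to quit): ")
    if not query.strip():
        break
    matches = lookup(products, query)
    if not matches:
        print("No match found.")
    for match in matches:
        for key, value in match.items():
            print(key, ":", value)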
I need the code below to ask for user input again after executing and showing the results. I guess a while loop would be best, but I'm not sure how to do it since the BeautifulSoup and requests libraries are in use.
Any help would be greatly appreciated.
import requests
from bs4 import BeautifulSoup
user_input = input("Enter article:")
response = requests.get("https://en.wikipedia.org/wiki/" + user_input)
soup = BeautifulSoup(response.text, "html.parser")
list = []
count = 0
IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
"User talk:", "Module:", "Help:", "File:", "Portal:", "#", "About this", ".ogg", "disambiguation", "Edit section"]
for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
if count <= 10:
title = tag.get("title", "")
if not any(x in title for x in IGNORE) and title != "":
count = count + 1
print(title)
list.append(title)
else:
break
Use a function with a return statement.
Example:
import requests
from bs4 import BeautifulSoup
IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
"User talk:", "Module:", "Help:", "File:", "Portal:", "#", "About this", ".ogg", "disambiguation",
"Edit section"]
def get_user_input():
user_input = input("Enter article:")
if len(str(user_input)) > 0:
return get_response(user_input)
else:
return get_user_input()
def get_response(user_input):
response = requests.get("https://en.wikipedia.org/wiki/" + user_input)
soup = BeautifulSoup(response.text, "html.parser")
title_list = []
count = 0
for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
if count <= 10:
title = tag.get("title", "")
if not any(x in title for x in IGNORE) and title != "":
count = count + 1
print(title)
title_list.append(title)
print(title_list)
else:
return get_user_input()
if __name__ == '__main__':
get_user_input()
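If you would rather avoid the mutual recursion between get_user_input and get_response (each round trip leaves stack frames behind, so a very long session could eventually hit Python's recursion limit), the same behaviour also fits in the plain while loop the question suggested. This is just an alternative sketch reusing the IGNORE list, URL and selector from above; scrape is a hypothetical helper name:
import requests
from bs4 import BeautifulSoup

IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
          "User talk:", "Module:", "Help:", "File:", "Portal:", "#",
          "About this", ".ogg", "disambiguation", "Edit section"]

def scrape(article):
    # Fetch the article and collect up to 10 link titles, skipping IGNORE entries.
    response = requests.get("https://en.wikipedia.org/wiki/" + article)
    soup = BeautifulSoup(response.text, "html.parser")
    titles = []
    for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
        title = tag.get("title", "")
        if title and not any(x in title for x in IGNORE):
            titles.append(title)
            if len(titles) >= 10:
                break
    return titles

while True:
    user_input = input("Enter article (blank to quit):")
    if not user_input.strip():
        break
    for title in scrape(user_input):
        print(title)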
I've got a script set up to loop through a group of URLs. The script runs fine, but I can't figure out how to tweak things to produce a cleaner CSV output.
I'll take any suggestions I can to minimize the time needed to clean up the formatting, delete Excel cells, and the like.
Note: The way I'm scraping the volume text has been the only way I've figured out how to get what I need. Hopefully, we can find a good solution for improving the final output without compromising this part of the script.
Here's my script:
group_url = [
'https://www.example.com',
'https://www.example2.com',
'https://www.example3.com',
'https://www.example4.com',
]
data = []
for group in group_url:
driver.get(group)
wait = WebDriverWait(driver, 90)
element = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')))
time.sleep(3)
kws = driver.find_elements_by_css_selector(".css-hijzdp-base")
counter = 1
for kw in kws:
if counter <= 5:
try:
data.append({
"Keyword": kw.text
})
counter = counter + 1
except NoSuchElementException:
pass
urls = driver.find_elements_by_css_selector(".css-a5m6co-text.css-p8ym46-fontFamily.css-11397xj-fontSize.css-18j1nfb-display")
count = 1
for url in urls:
if count <= 5:
try:
data.append({
"URL": url.text
})
count = count + 1
except NoSuchElementException:
pass
try:
vol1 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')
except NoSuchElementException:
pass
else:
data.append({
"Volume1": vol1.text
})
try:
vol2 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[2]/td[6]/div')
except NoSuchElementException:
pass
else:
data.append({
"Volume2": vol2.text
})
try:
vol3 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[3]/td[6]/div')
except NoSuchElementException:
pass
else:
data.append({
"Volume3": vol3.text
})
try:
vol4 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[4]/td[6]/div')
except NoSuchElementException:
pass
else:
data.append({
"Volume4": vol4.text
})
try:
vol5 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[5]/td[6]/div')
except NoSuchElementException:
pass
else:
data.append({
"Volume5": vol5.text
})
driver.close()
print(data)
#print to csv
df = pd.DataFrame(data)
df.to_csv('testOutput 11_11_21.csv')
Here is a screenshot of the final output:
You're appending every row item independently to data. Collect them first in a dictionary within the for loop, then append the dictionary to the list data:
group_url = [
'https://www.example.com',
'https://www.example2.com',
'https://www.example3.com',
'https://www.example4.com',
]
data = []
for group in group_url:
tmp_dict = {}
driver.get(group)
wait = WebDriverWait(driver, 90)
element = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')))
time.sleep(3)
kws = driver.find_elements_by_css_selector(".css-hijzdp-base")
counter = 1
keywords = []
for kw in kws:
if counter <= 5:
try:
keywords.append(kw.text)
counter = counter + 1
except NoSuchElementException:
pass
tmp_dict["Keyword"] = keywords
urls = driver.find_elements_by_css_selector(".css-a5m6co-text.css-p8ym46-fontFamily.css-11397xj-fontSize.css-18j1nfb-display")
count = 1
urls_results = []
for url in urls:
if count <= 5:
try:
urls_results.append(url.text)
count = count + 1
except NoSuchElementException:
pass
tmp_dict["URL"] = urls_results
try:
vol1 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')
except NoSuchElementException:
pass
else:
tmp_dict["Volume1"]= vol1.text
try:
vol2 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[2]/td[6]/div')
except NoSuchElementException:
pass
else:
tmp_dict["Volume2"]= vol2.text
try:
vol3 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[3]/td[6]/div')
except NoSuchElementException:
pass
else:
tmp_dict["Volume3"]= vol3.text
try:
vol4 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[4]/td[6]/div')
except NoSuchElementException:
pass
else:
tmp_dict["Volume4"]= vol4.text
try:
vol5 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[5]/td[6]/div')
except NoSuchElementException:
pass
else:
tmp_dict["Volume5"]= vol5.text
data.append(tmp_dict)
driver.close()
print(data)
#print to csv
df = pd.DataFrame(data)
df.to_csv('testOutput 11_11_21.csv')
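One more thing to watch, depending on the CSV you want: tmp_dict["Keyword"] and tmp_dict["URL"] are lists, so pandas writes each of them into a single cell as a stringified Python list. If you would rather have a comma-separated string or one column per value, flatten them before building the DataFrame. A small sketch of both options, reusing the data list built above:
import pandas as pd

# Option 1: one row per page, list values joined into comma-separated strings.
rows = []
for entry in data:
    row = dict(entry)
    row["Keyword"] = ", ".join(entry.get("Keyword", []))
    row["URL"] = ", ".join(entry.get("URL", []))
    rows.append(row)
pd.DataFrame(rows).to_csv('testOutput 11_11_21.csv', index=False)

# Option 2: one column per value (Keyword1..Keyword5, URL1..URL5).
rows = []
for entry in data:
    row = {"Keyword%d" % (i + 1): kw for i, kw in enumerate(entry.get("Keyword", []))}
    row.update({"URL%d" % (i + 1): u for i, u in enumerate(entry.get("URL", []))})
    row.update({k: v for k, v in entry.items() if k.startswith("Volume")})
    rows.append(row)
pd.DataFrame(rows).to_csv('testOutput 11_11_21.csv', index=False)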
Python code gets stuck in the try block
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
#import urllib2
def url1_to_string(url1):
html=""
proxyDict = {
'http': 'http://username:pwd@proxyurl:8080',
'https': 'https://username:pwd@proxyurl:8080'
}
try:
print('Before res in try')
res = requests.get(url1,proxies=proxyDict)
print('After res in try')
except:
pass
html = res.text
soup = BeautifulSoup(html, 'html5lib')
for script in soup(["script", "style", 'aside']):
script.extract()
return " ".join(re.split(r'[\n\t]+', soup.get_text()))
df=pd.read_csv(r'C:\filepath\abc.csv',encoding='latin-1')
anchor_count = []
account_count = []
aggregate_page_count=[]
agg_url_count=[]
for index, row in df.iterrows():
agg_url_list = []
ini_url="http://www.google.com/search?q="+row['ANCHOR_NAME']+" AND "+row['ACCOUNT_NAME']
r = requests.get(ini_url,proxies={"http":"http://one.proxy.att.com:8080"})
ny_bb1 = url1_to_string(ini_url)
anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))
account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
print(anchor_count)
soup = BeautifulSoup(r.text,"html.parser")
get_details1 = soup.find_all("div", attrs={"class": "g"})
sublist1 = []
for details1 in get_details1:
link1 = details1.find_all("h3")
for mdetails1 in link1[:]:
links1 = mdetails1.find_all("a")
lmk1 = ""
for lnk1 in links1[:]:
lmk1 = lnk1.get("href")[7:].split("&")
sublist1.append(lmk1[0])
aggregate_count1=0
for x1 in sublist1[:3]:
anchorcount1=0
accountcount1=0
print("aagg url",x1)
try:
print('In try block')
ny_bb1 = url1_to_string(x1)
except KeyboardInterrupt: print('You cancelled the operation.')
finally:
pass
ny_bb1=ny_bb1.upper()
print(ny_bb1)
row['ANCHOR_NAME']=row['ANCHOR_NAME'].upper()
row['ACCOUNT_NAME']=row['ACCOUNT_NAME'].upper()
anchor_name=re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
account_name=re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
if(anchor_name==account_name):
if(row['ANCHOR_NAME'] in ny_bb1.upper()):
anchorcount1 = anchorcount1 + 1
if(row['ACCOUNT_NAME'] in ny_bb1.upper()):
accountcount1 = accountcount1 + 1
else:
if (anchor_name in ny_bb1.upper()):
anchorcount1 = anchorcount1 + 1
if(account_name in ny_bb1.upper()):
accountcount1 = accountcount1 + 1
if(anchorcount1 > 0 and accountcount1 > 0):
aggregate_count1=aggregate_count1+1
agg_url_list.append(x1[:])
print("existance of both",aggregate_count1)
aggregate_page_count.append(aggregate_count1)
agg_url_count.append(agg_url_list)
df['anc_cnt']=pd.Series(anchor_count)
df['acc_cnt']=pd.Series(account_count)
df['agg_cnt']=pd.Series(aggregate_page_count)
df['agg_url_list']=pd.Series(agg_url_count)
The contents of the abc.csv file are as follows:
ANCHOR_NAME,ACCOUNT_NAME
ABC,ABC
XYZ,ZYZ
and so on
For particular URLs the code gets stuck in the try block and control never reaches the except block, where I want to ignore the exception and continue with the normal program flow, i.e. move on to the next URL, and so on.
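For what it's worth, one common cause of this symptom is that requests.get is called without a timeout, so a stalled proxy connection simply blocks inside the try forever rather than raising anything for except to catch. Below is a minimal sketch of url1_to_string with a timeout and explicit error handling; the 30-second value is arbitrary, the proxies/timeout parameters replace the hard-coded proxyDict purely for illustration, and returning an empty string on failure is an assumption about what the caller should see:
import re
import requests
from bs4 import BeautifulSoup

def url1_to_string(url1, proxies=None, timeout=30):
    # A stalled connection now raises requests.exceptions.Timeout instead of
    # blocking indefinitely inside the try block.
    try:
        res = requests.get(url1, proxies=proxies, timeout=timeout)
        res.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print('Skipping', url1, 'because of', exc)
        return ""  # assumption: the caller treats an empty page as "no text"
    soup = BeautifulSoup(res.text, 'html5lib')
    for script in soup(["script", "style", "aside"]):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
Failing fast here also avoids the UnboundLocalError the original version hits at html = res.text when the request raised and res was never assigned.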
I'm trying to understand BeautifulSoup and want to find all the links within facebook.com and iterate over each and every link within it...
Here is my code... it works fine, but once it finds Linkedin.com and iterates over it, it gets stuck at a point after this URL - http://www.linkedin.com/redir/redirect?url=http%3A%2F%2Fbusiness%2Elinkedin%2Ecom%2Ftalent-solutions%3Fsrc%3Dli-footer&urlhash=f9Nj
When I run Linkedin.com separately, I don't have any problem...
Could this be a limitation of my operating system? I'm using Ubuntu Linux...
import urllib2
import BeautifulSoup
import re
def main_process(response):
print "Main process started"
soup = BeautifulSoup.BeautifulSoup(response)
limit = '5'
count = 0
main_link = valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$","http://www.facebook.com")
if main_link:
main_link = main_link.group(1)
print 'main_link = ', main_link
result = {}
result[main_link] = {'incoming':[],'outgoing':[]}
print 'result = ', result
for link in soup.findAll('a',href=True):
if count < 10:
valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$",link.get('href'))
if valid_link:
#print 'Main link = ', link.get('href')
print 'Links object = ', valid_link.group(1)
connecting_link = valid_link.group(1)
connecting_link = connecting_link.encode('ascii')
if main_link <> connecting_link:
print 'outgoing link = ', connecting_link
result = add_new_link(connecting_link, result)
#Check if the outgoing is already added, if its then don't add it
populate_result(result,main_link,connecting_link)
print 'result = ', result
print 'connecting'
request = urllib2.Request(connecting_link)
response = urllib2.urlopen(request)
soup = BeautifulSoup.BeautifulSoup(response)
for sublink in soup.findAll('a',href=True):
print 'sublink = ', sublink.get('href')
valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$",sublink.get('href'))
if valid_link:
print 'valid_link = ', valid_link.group(1)
valid_link = valid_link.group(1)
if valid_link <> connecting_link:
populate_result(result,connecting_link,valid_link)
count += 1
print 'final result = ', result
# print 'found a url with national-park in the link'
def add_new_link(connecting_link, result):
result[connecting_link] = {'incoming':[],'outgoing':[]}
return result
def populate_result(result,link,dest_link):
if len(result[link]['outgoing']) == 0:
result[link]['outgoing'].append(dest_link)
else:
found_in_list = 'Y'
try:
result[link]['outgoing'].index(dest_link)
found_in_list = 'Y'
except ValueError:
found_in_list = 'N'
if found_in_list == 'N':
result[link]['outgoing'].append(dest_link)
return result
if __name__ == "__main__":
request = urllib2.Request("http://facebook.com")
print 'process start'
try:
response = urllib2.urlopen(request)
main_process(response)
except urllib2.URLError, e:
print "URLERROR"
print "program ended"
The problem is re.search() hanging on certain URLs, at this line:
valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", sublink.get('href'))
For example, it hangs on the following URL: https://www.facebook.com/campaign/landing.php?placement=pflo&campaign_id=402047449186&extra_1=auto
>>> import re
>>> s = "https://www.facebook.com/campaign/landing.php?placement=pflo&campaign_id=402047449186&extra_1=auto"
>>> re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", s)
hanging "forever"...
It looks like this introduces a catastrophic backtracking case that causes the regex search to hang.
One solution would be to use a different regex for validating the URL; there are plenty of options here:
How do you validate a URL with a regular expression in Python?
Hope that helps.
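As a concrete illustration (one possible rewrite, not the canonical answer from the linked question): replacing the nested quantifier (?:\w+.)+ with a single character class removes the exponential backtracking, so the same problem URL is handled immediately:
import re

# [^\s/]+ cannot be split into sub-matches the way (?:\w+.)+ can, so the
# engine has no combinatorial explosion of alternatives to explore.
URL_RE = re.compile(r"^(https?://[^\s/]+\.com)(?:/.*)?$")

s = ("https://www.facebook.com/campaign/landing.php"
     "?placement=pflo&campaign_id=402047449186&extra_1=auto")
match = URL_RE.search(s)
print(match.group(1) if match else "no match")  # https://www.facebook.com
The trade-off is that the host part is now limited to characters other than whitespace and slashes, which is usually what you want for a URL anyway.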
Here I have written code using Python and Beautiful Soup to parse all the links on a page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links from this new content into the repository, and continues this process for all links in the repository until stopped or until a given number of links have been fetched.
But this code is very slow. How can I improve it using asynchronous programming with gevent in Python?
Code
class Crawler(object):
def __init__(self):
self.soup = None # Beautiful Soup object
self.current_page = "http://www.python.org/" # Current page's address
self.links = set() # Queue with every links fetched
self.visited_links = set()
self.counter = 0 # Simple counter for debug purpose
def open(self):
# Open url
print self.counter , ":", self.current_page
res = urllib2.urlopen(self.current_page)
html_code = res.read()
self.visited_links.add(self.current_page)
# Fetch every links
self.soup = BeautifulSoup.BeautifulSoup(html_code)
page_links = []
try :
page_links = itertools.ifilter( # Only deal with absolute links
lambda href: 'http://' in href,
( a.get('href') for a in self.soup.findAll('a') ) )
except Exception as e: # Magnificent exception handling
print 'Error: ',e
pass
# Update links
self.links = self.links.union( set(page_links) )
# Choose a random url from non-visited set
self.current_page = random.sample( self.links.difference(self.visited_links),1)[0]
self.counter+=1
def run(self):
# Crawl 3 webpages (or stop if all url has been fetched)
while len(self.visited_links) < 3 or (self.visited_links == self.links):
self.open()
for link in self.links:
print link
if __name__ == '__main__':
C = Crawler()
C.run()
Update 1
import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys
import gevent.monkey; gevent.monkey.patch_all(thread=False)
class Crawler(object):
def __init__(self):
self.soup = None # Beautiful Soup object
self.current_page = "http://www.python.org/" # Current page's address
self.links = set() # Queue with every links fetched
self.visited_links = set()
self.counter = 0 # Simple counter for debug purpose
def open(self):
# Open url
print self.counter , ":", self.current_page
res = urllib2.urlopen(self.current_page)
html_code = res.read()
self.visited_links.add(self.current_page)
# Fetch every links
self.soup = BeautifulSoup(html_code)
page_links = []
try :
for link in [h.get('href') for h in self.soup.find_all('a')]:
print "Found link: '" + link + "'"
if link.startswith('http'):
print 'entered in if link: ',link
page_links.append(link)
print "Adding link" + link + "\n"
elif link.startswith('/'):
print 'entered in elif link: ',link
parts = urlparse.urlparse(self.current_page)
page_links.append(parts.scheme + '://' + parts.netloc + link)
print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
else:
print 'entered in else link: ',link
page_links.append(self.current_page+link)
print "Adding link " + self.current_page+link + "\n"
except Exception, ex: # Magnificent exception handling
print ex
# Update links
self.links = self.links.union( set(page_links) )
# Choose a random url from non-visited set
self.current_page = random.sample( self.links.difference(self.visited_links),1)[0]
self.counter+=1
def run(self):
# Crawl 3 webpages (or stop if all url has been fetched)
crawling_greenlets = []
for i in range(3):
crawling_greenlets.append(gevent.spawn(self.open))
gevent.joinall(crawling_greenlets)
#while len(self.visited_links) < 4 or (self.visited_links == self.links):
# self.open()
for link in self.links:
print link
if __name__ == '__main__':
C = Crawler()
C.run()
import gevent and make sure monkey-patching is done to make standard library calls non-blocking and aware of gevent:
import gevent
from gevent import monkey; monkey.patch_all()
(you can selectively decide what has to be monkey-patched, but let's say it is not your problem at the moment)
In your run, make your open function be called inside a greenlet. run can return the greenlet object, so you can wait for it whenever you need to get the results, using gevent.joinall for example. Something like this:
def run(self):
return gevent.spawn(self.open)
c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1,c2,c3)]
gevent.joinall(crawling_tasks)
print [c.links for c in (c1, c2, c3)]
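If each crawl later needs to cover many more pages than the three in this example, a gevent Pool is a convenient way to cap how many downloads run at once. This is an optional extension rather than part of the answer above; fetch is a hypothetical helper and the two URLs are placeholders:
from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool
import urllib2  # cooperative after monkey.patch_all()

def fetch(url):
    # Runs in its own greenlet; the pool keeps at most 5 downloads in flight.
    return url, urllib2.urlopen(url).read()

pool = Pool(5)
urls = ["http://www.python.org/", "http://www.python.org/about/"]  # placeholders
for url, body in pool.map(fetch, urls):
    print("%s: %d bytes" % (url, len(body)))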