This is my code, it contains no recursion, but it hits maximum recursion depth on first pickle...
Code:
#!/usr/bin/env python
from bs4 import BeautifulSoup
from urllib2 import urlopen
import pickle
# open page and return soup list
def get_page_startups(page_url):
html = urlopen(page_url).read()
soup = BeautifulSoup(html, "lxml")
return soup.find_all("div","startup item")
#
# Get certain text from startup soup
#
def get_name(startup):
return startup.find("a", "profile").string
def get_website(startup):
return startup.find("a", "visit")["href"]
def get_status(startup):
return startup.find("p","status").strong.string[8:]
def get_twitter(startup):
return startup.find("a", "comment").string
def get_high_concept_pitch(startup):
return startup.find("div","headline").find_all("em")[1].string
def get_elevator_pitch(startup):
startup_soup = BeautifulSoup(urlopen("http://startupli.st" + startup.find("a","profile")["href"]).read(),"lxml")
return startup_soup.find("p", "desc").string.rstrip().lstrip()
def get_tags(startup):
return startup.find("p","tags").string
def get_blog(startup):
try:
return startup.find("a","visit blog")["href"]
except TypeError:
return None
def get_facebook(startup):
try:
return startup.find("a","visit facebook")["href"]
except TypeError:
return None
def get_angellist(startup):
try:
return startup.find("a","visit angellist")["href"]
except TypeError:
return None
def get_linkedin(startup):
try:
return startup.find("a","visit linkedin")["href"]
except TypeError:
return None
def get_crunchbase(startup):
try:
return startup.find("a","visit crunchbase")["href"]
except TypeError:
return None
# site to scrape
BASE_URL = "http://startupli.st/startups/latest/"
# scrape all pages
for page_no in xrange(1,142):
startups = get_page_startups(BASE_URL + str(page_no))
# search soup and pickle data
for i, startup in enumerate(startups):
s = {}
s['name'] = get_name(startup)
s['website'] = get_website(startup)
s['status'] = get_status(startup)
s['high_concept_pitch'] = get_high_concept_pitch(startup)
s['elevator_pitch'] = get_elevator_pitch(startup)
s['tags'] = get_tags(startup)
s['twitter'] = get_twitter(startup)
s['facebook'] = get_facebook(startup)
s['blog'] = get_blog(startup)
s['angellist'] = get_angellist(startup)
s['linkedin'] = get_linkedin(startup)
s['crunchbase'] = get_crunchbase(startup)
f = open(str(i)+".pkl", "wb")
pickle.dump(s,f)
f.close()
print "Done " + str(page_no)
This is the content of 0.pkl after the exception is raised:
http://pastebin.com/DVS1GKzz Thousand lines long!
There's some HTML from the BASE_URL in the pickle... but I didn't pickle any html strings...
BeautifulSoup .string attributes aren't actually strings:
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup('<div>Foo</div>')
>>> soup.find('div').string
u'Foo'
>>> type(soup.find('div').string)
bs4.element.NavigableString
Try using str(soup.find('div').string) instead and see if it helps. Also, I don't think Pickle is really the best format here. JSON is much easier in this case.
Most likely pickle is doing recursion internally, and the file you are trying parse is to big. You could try to increase the limit of the number of recursions allowed.
import sys
sys.setrecursionlimit(10000)
This is however not recommended for any type of production ready application, as it may mask actual issue, but could help highlight issue(s) during debugging.
Pickle cannot handle BeautifulSoup nodes. Similar questions with some workarounds:
RuntimeError: maximum recursion depth exceeded with Python 3.2 pickle.dump
pickle.dump meet RuntimeError: maximum recursion depth exceeded in cmp
Related
I am new in asks and trio in python, I got a sample code. let me explain
I have a list of URL every one is news URLs, each one has sub urls.
the first url requests and get all other hrefs and add in a list.
then get the article of all hrefs in that list.
The issue is certain times the article is getting other times empty.
tried the sample code for single urls that time its working
import asks
import trio
from goose3 import Goose
import logging as log
from goose3.configuration import ArticleContextPattern
from pprint import pprint
import json
import time
asks.init('trio')
async def extractor(path, htmls, paths, session):
try:
r = await session.get(path, timeout=2)
out = r.content
htmls.append(out)
paths.append(path)
except Exception as e:
out = str(e)
htmls.append(out)
paths.append(path)
async def main(path_list, session):
htmls = []
paths = []
async with trio.open_nursery() as n:
for path in path_list:
n.start_soon(extractor, path, htmls, paths, session)
return htmls, paths
async def run(urls, conns=50):
s = asks.Session(connections=conns)
g = Goose()
htmls, paths = await main(urls, s)
print(htmls," ",paths)
cleaned = []
for html, path in zip(htmls, paths):
dic = {}
dic['url'] = path
if html is not None:
try:
#g.config.known_context_pattern = ArticleContextPattern(attr='class', value='the-post')
article = g.extract(raw_html=html)
author=article.authors
dic['goose_text'] = article.cleaned_text
#print(article.cleaned_text)
#dic['goose_date'] = article.publish_datetime
dic['goose_title'] = article.title
if author:
dic['authors']=author[0]
else:
dic['authors'] =''
except Exception as e:
raise
print(e)
log.info('goose found no text using html')
dic['goose_html'] = html
dic['goose_text'] = ''
dic['goose_date'] = None
dic['goose_title'] = None
dic['authors'] =''
cleaned.append(dic)
return cleaned
async def real_main():
sss= '[{"crawl_delay_sec": 0, "name": "mining","goose_text":"","article_date":"","title":"", "story_url": "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project","url": "http://www.mining.com/tag/latin-america/page/1/"},{"crawl_delay_sec": 0, "name": "mining", "story_url": "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries", "url": "http://www.mining.com/tag/latin-america/page/1/"}]'
obj = json.loads(sss)
pprint(obj)
articles=[]
for l in obj:
articles.append(await run([l['story_url']]))
#await trio.sleep(3)
pprint(articles)
if __name__ == "__main__":
trio.run(real_main)
get the article data without missing
I lack some further information to answer your question in-depth, but most likely it has to do with the way goose search for text within the html. See this answer for more details: https://stackoverflow.com/a/30408761/8867146
"asks" does not always raise an exception when the status code is != 200. You need to examine the response's status code before using its content. You might also want to increase the timeout, 2 seconds is not enough, particularly when you're firing off up to 50 connections in parallel.
In any case, here's a simplified program – all that Goose stuff is completely unnecessary for showing the actual error, two result arrays are not a good idea, and adding error messages to the result array looks broken.
Also you should investigate running the URL fetching and the processing in parallel. trio.open_memory_channel is your friend here.
import asks
asks.init('trio')
import trio
from pprint import pprint
async def extractor(path, session, results):
try:
r = await session.get(path, timeout=2)
if r.status_code != 200:
raise asks.errors.BadStatus("Not OK",r.status_code)
out = r.content
except Exception as e:
# do some reasonable error handling
print(path, repr(e))
else:
results.append((out, path))
async def main(path_list, session):
results = []
async with trio.open_nursery() as n:
for path in path_list:
n.start_soon(extractor, path, session, results)
return results
async def run(conns=50):
s = asks.Session(connections=conns)
urls = [
"http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries",
"http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project",
"https://www.google.com", # just for testing more parallel connections
"https://www.debian.org",
]
results = await main(urls, s)
for content, path in results:
pass # analyze this result
print("OK")
if __name__ == "__main__":
trio.run(run)
there is about 70% chance shows error:
res=pool.map(feng,urls)
File "c:\Python27\lib\multiprocessing\pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "c:\Python27\lib\multiprocessing\pool.py", line 567, in get
raise self._value
IndexError: list index out of range
don't know why,if data less then 100,only 5%chance show that message.any one have idea how to improve?
#coding:utf-8
import multiprocessing
import requests
import bs4
import re
import string
root_url = 'http://www.haoshiwen.org'
#index_url = root_url+'/type.php?c=1'
def xianqin_url():
f = 0
h = 0
x = 0
y = 0
b = []
l=[]
for i in range(1,64):#页数
index_url=root_url+'/type.php?c=1'+'&page='+"%s" % i
response = requests.get(index_url)
soup = bs4.BeautifulSoup(response.text,"html.parser")
x = [a.attrs.get('href') for a in soup.select('div.sons a[href^=/]')]#取出每一页的div是sons的链接
c=len(x)#一共c个链接
j=0
for j in range(c):
url = root_url+x[j]
us = str(url)
print "收集到%s" % us
l.append(url) #pool = multiprocessing.Pool(8)
return l
def feng (url) :
response = requests.get(url)
response.encoding='utf-8'
#print response.text
soup = bs4.BeautifulSoup(response.text, "html.parser")
#content = soup.select('div.shileft')
qq=str(soup)
soupout = re.findall(r"原文(.+?)</div>",qq,re.S)#以“原文”开头<div>结尾的字段
#print soupout[1]
content=str(soupout[1])
b="风"
cc=content.count(b,0,len(content))
return cc
def start_process():
print 'Starting',multiprocessing.current_process().name
def feng (url) :
response = requests.get(url)
response.encoding='utf-8'
#print response.text
soup = bs4.BeautifulSoup(response.text, "html.parser")
#content = soup.select('div.shileft')
qq=str(soup)
soupout = re.findall(r"原文(.+?)</div>",qq,re.S)#以“原文”开头<div>结尾的字段
#print soupout[1]
content=str(soupout[1])
b="风"
c="花"
d="雪"
e="月"
f=content.count(b,0,len(content))
h=content.count(c,0,len(content))
x=content.count(d,0,len(content))
y=content.count(e,0,len(content))
return f,h,x,y
def find(urls):
r= [0,0,0,0]
pool=multiprocessing.Pool()
res=pool.map4(feng,urls)
for i in range(len(res)):
r=map(lambda (a,b):a+b, zip(r,res[i]))
return r
if __name__=="__main__":
print "开始收集网址"
qurls=xianqin_url()
print "收集到%s个链接" % len(qurls)
print "开始匹配先秦诗文"
find(qurls)
print '''
%s篇先秦文章中:
---------------------------
风有:%s
花有:%s
雪有:%s
月有:%s
数据来源:%s
''' % (len(qurls),find(qurls)[0],find(qurls)[1],find(qurls)[2],find(qurls)[3],root_url)
stackoverflow :Body cannot contain "`pool ma p".
changed it as res=pool.map4(feng,urls)
i'm trying to get some sub string from this website,with multiprocessing.
Indeed, multiprocessing makes it a bit hard to debug as you don't see where the index out of bound error occurred (the error message makes it appear as if it happened internally in the multiprocessing module).
In some cases this line:
content=str(soupout[1])
raises an index out of bound, because soupout is an empty list. If you change it to
if len(soupout) == 0:
return None
and then remove the None that were returned by changing
res=pool.map(feng,urls)
into
res = pool.map(feng,urls)
res = [r for r in res if r is not None]
then you can avoid the error. That said. You probably want to find out the root cause why re.findall returned an empty list. It is certainly a better idea to select the node with beatifulsoup than with regex, as generally matching with bs4 is more stable, especially if the website slightly changes their markup (e.g. whitespaces, etc.)
Update:
why is soupout is an empty list? When I didn't use pool.map never I have this error message shown
This is probably because you hammer the web server too fast. In a comment you mention that you sometimes get 504 in response.status_code. 504 means Gateway Time-out: The server was acting as a gateway or proxy and did not receive a timely response from the upstream server
This is because haoshiwen.org seems to be powered by kangle which is a reverse proxy. Now the reverse proxy handles back all the requests you send him to the web server behind, and if you now start too many processes at once the poor web server cannot handle the flood. Kangle has a default timeout of 60s so as soon as he doesn't get an answer back from the web server within 60s he shows the error you posted.
How do you fix that?
you could limit the number of processes: pool=multiprocessing.Pool(2), you'd need to play around with a good number of processes
at the top of feng(url) you could add a time.sleep(5) so each process waits 5 seconds between each request. Also here you'd need to play around with the sleep time.
This is a code with Web crawler.
I'm a beginer in learning python.So i don't know how to solve.
It seems wrong with search()
# -*- coding:utf-8 -*-
import urllib,urllib2,re
class BDTB:
def __init__(self,baseUrl,seeLZ):
self.baseUrl = baseUrl
self.seeLZ = '?see_lz' + str(seeLZ)
def getPage(self,pageNum):
try:
url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
request = urllib2.Request(url)
response = urllib2.urlopen(request)
#print response.read().decode('utf-8')
return response
except urllib2.URLError,e:
if hasattr(e,'reason'):
print u'连接百度贴吧失败,错误原因',e.reason
return None
def getTitle(self):
page = self.getPage(1)
pattern = re.compile('<h3 class.*?px">(.*?)</h3>',re.S)
result = re.search(pattern,page)
if result:
print result.group(1)
return result.group(1).strip()
else:
return None
baseURL = 'http://tieba.baidu.com/p/4095047339'
bdtb = BDTB(baseURL,1)
bdtb.getTitle()
This will raise a TypeError: expected string or buffer because you are passing the object returned from urllib2.urlopen(request) to re.search() when it requires an str.
If you change the return value from:
return responce # returns the object
to one that returns the text contained in the request:
return responce.read() # returns the text contained in the responce
Your script works and after executing it returns:
广告兼职及二手物品交易集中贴
Additionally, since you're working with Python 2.x you might want to change you object from class BDTB: to class BDTB(object) in order to use new style classes.
I'm trying to grab the most recently uploaded videos. There's a standard feed for that - it's called most_recent. I don't have any problems grabbing the feed, but when I look at the entries inside, they're all half a year old, which is hardly recent.
Here's the code I'm using:
import requests
import os.path as P
import sys
from lxml import etree
import datetime
namespaces = {"a": "http://www.w3.org/2005/Atom", "yt": "http://gdata.youtube.com/schemas/2007"}
fmt = "%Y-%m-%dT%H:%M:%S.000Z"
class VideoEntry:
"""Data holder for the video."""
def __init__(self, node):
self.entry_id = node.find("./a:id", namespaces=namespaces).text
published = node.find("./a:published", namespaces=namespaces).text
self.published = datetime.datetime.strptime(published, fmt)
def __str__(self):
return "VideoEntry[id='%s']" % self.entry_id
def paginate(xml):
root = etree.fromstring(xml)
next_page = root.find("./a:link[#rel='next']", namespaces=namespaces)
if next_page == None:
next_link = None
else:
next_link = next_page.get("href")
entries = [VideoEntry(e) for e in root.xpath("/a:feed/a:entry", namespaces=namespaces)]
return entries, next_link
prefix = "https://gdata.youtube.com/feeds/api/standardfeeds/"
standard_feeds = set("top_rated top_favorites most_shared most_popular most_recent most_discussed most_responded recently_featured on_the_web most_viewed".split(" "))
feed_name = sys.argv[1]
assert feed_name in standard_feeds
feed_url = prefix + feed_name
all_video_ids = []
while feed_url is not None:
r = requests.get(feed_url)
if r.status_code != 200:
break
text = r.text.encode("utf-8")
video_ids, feed_url = paginate(text)
all_video_ids += video_ids
all_upload_times = [e.published for e in all_video_ids]
print min(all_upload_times), max(all_upload_times)
As you can see, it prints the min and max timestamps for the entire feed.
misha#misha-antec$ python get_standard_feed.py most_recent
2013-02-02 14:40:02 2013-02-02 14:54:00
misha#misha-antec$ python get_standard_feed.py top_rated
2006-04-06 21:30:53 2013-07-28 22:22:38
I've glanced through the downloaded XML and it appears to match the output. Am I doing something wrong?
Also, on an unrelated note, the feeds I'm getting are all about 100 entries (I'm paginating through them 25 at a time). Is this normal? I expected the feeds to be a bit bigger.
Regarding the "Most-Recent-Feed"-Topic: There is a ticket for this one here. Unfortunately, the YouTube-API-Teams doesn't respond or solved the problem so far.
Regarding the number of entries: That depends on the type of standardfeed, but for the most-recent-Feed it´s usually around 100.
Note: You could try using the "orderby=published" parameter to get recents videos, although I don´t know how "recent" they are.
https://gdata.youtube.com/feeds/api/videos?orderby=published&prettyprint=True
You can combine this query with the "category"-parameter or other ones (region-specific queries - like for the standard feeds - are not possible, afaik).
In my project, I use the multiprocessing class in order to run tasks parallely. I want to use threading instead, as it has better performance (my tasks are TCP/IP bound, not CPU or I/O bound).
multiprocessing has wonderful functions, as Pool.imap_unordered and Pool.map_async, that does not exist in the threading class.
What is the right way to convert my code to use threading instead? The documentation introduces the multiprocessing.dummy class, that is a wrapper for the threading class. However that raises lots of errors (at least on python 2.7.3):
pool = multiprocessing.Pool(processes)
File "C:\python27\lib\multiprocessing\dummy\__init__.py", line 150, in Pool
return ThreadPool(processes, initializer, initargs)
File "C:\python27\lib\multiprocessing\pool.py", line 685, in __init__
Pool.__init__(self, processes, initializer, initargs)
File "C:\python27\lib\multiprocessing\pool.py", line 136, in __init__
self._repopulate_pool()
File "C:\python27\lib\multiprocessing\pool.py", line 199, in _repopulate_pool
w.start()
File "C:\python27\lib\multiprocessing\dummy\__init__.py", line 73, in start
self._parent._children[self] = None
AttributeError: '_DummyThread' object has no attribute '_children'
Edit: What actually happens is that I have a GUI that runs a different thread (to prevent the GUI from gettint stuck). That thread runs the specific search function that has the ThreadPool that fails.
Edit 2: The bugfix was fixed and will be included in future releases.
Great to see a crasher fixed!
import urllib2, htmllib, formatter
import multiprocessing.dummy as multiprocessing
import xml.dom.minidom
import os
import string, random
from urlparse import parse_qs, urlparse
from useful_util import retry
import config
from logger import log
class LinksExtractor(htmllib.HTMLParser):
def __init__(self, formatter):
htmllib.HTMLParser.__init__(self, formatter)
self.links = []
self.ignoredSites = config.WebParser_ignoredSites
def start_a(self, attrs):
for attr in attrs:
if attr[0] == "href" and attr[1].endswith(".mp3"):
if not filter(lambda x: (x in attr[1]), self.ignoredSites):
self.links.append(attr[1])
def get_links(self):
return self.links
def GetLinks(url, returnMetaUrlObj=False):
'''
Function gather links from a url.
#param url: Url Address.
#param returnMetaUrlObj: If true, returns a MetaUrl Object list.
Else, returns a string list. Default is False.
#return links: Look up.
'''
htmlparser = LinksExtractor(formatter.NullFormatter())
try:
data = urllib2.urlopen(url)
except (urllib2.HTTPError, urllib2.URLError) as e:
log.error(e)
return []
htmlparser.feed(data.read())
htmlparser.close()
links = list(set(htmlparser.get_links()))
if returnMetaUrlObj:
links = map(MetaUrl, links)
return links
def isAscii(s):
"Function checks is the string is ascii."
try:
s.decode('ascii')
except (UnicodeEncodeError, UnicodeDecodeError):
return False
return True
#retry(Exception, logger=log)
def parse(song, source):
'''
Function parses the source search page and returns the .mp3 links in it.
#param song: Search string.
#param source: Search website source. Value can be dilandau, mp3skull, youtube, seekasong.
#return links: .mp3 url links.
'''
source = source.lower()
if source == "dilandau":
return parse_dilandau(song)
elif source == "mp3skull":
return parse_Mp3skull(song)
elif source == "SeekASong":
return parse_SeekASong(song)
elif source == "youtube":
return parse_Youtube(song)
log.error('no source "%s". (from parse function in WebParser)')
return []
def parse_dilandau(song, pages=1):
"Function connects to Dilandau.eu and returns the .mp3 links in it"
if not isAscii(song): # Dilandau doesn't like unicode.
log.warning("Song is not ASCII. Skipping on dilandau")
return []
links = []
song = urllib2.quote(song.encode("utf8"))
for i in range(pages):
url = 'http://en.dilandau.eu/download_music/%s-%d.html' % (song.replace('-','').replace(' ','-').replace('--','-').lower(),i+1)
log.debug("[Dilandau] Parsing %s... " % url)
links.extend(GetLinks(url, returnMetaUrlObj=True))
log.debug("[Dilandau] found %d links" % len(links))
for metaUrl in links:
metaUrl.source = "Dilandau"
return links
def parse_Mp3skull(song, pages=1):
"Function connects to mp3skull.com and returns the .mp3 links in it"
links = []
song = urllib2.quote(song.encode("utf8"))
for i in range(pages):
# http://mp3skull.com/mp3/how_i_met_your_mother.html
url = 'http://mp3skull.com/mp3/%s.html' % (song.replace('-','').replace(' ','_').replace('__','_').lower())
log.debug("[Mp3skull] Parsing %s... " % url)
links.extend(GetLinks(url, returnMetaUrlObj=True))
log.debug("[Mp3skull] found %d links" % len(links))
for metaUrl in links:
metaUrl.source = "Mp3skull"
return links
def parse_SeekASong(song):
"Function connects to seekasong.com and returns the .mp3 links in it"
song = urllib2.quote(song.encode("utf8"))
url = 'http://www.seekasong.com/mp3/%s.html' % (song.replace('-','').replace(' ','_').replace('__','_').lower())
log.debug("[SeekASong] Parsing %s... " % url)
links = GetLinks(url, returnMetaUrlObj=True)
for metaUrl in links:
metaUrl.source = "SeekASong"
log.debug("[SeekASong] found %d links" % len(links))
return links
def parse_Youtube(song, amount=10):
'''
Function searches a song in youtube.com and returns the clips in it using Youtube API.
#param song: The search string.
#param amount: Amount of clips to obtain.
#return links: List of links.
'''
"Function connects to youtube.com and returns the .mp3 links in it"
song = urllib2.quote(song.encode("utf8"))
url = r"http://gdata.youtube.com/feeds/api/videos?q=%s&max-results=%d&v=2" % (song.replace(' ', '+'), amount)
urlObj = urllib2.urlopen(url, timeout=4)
data = urlObj.read()
videos = xml.dom.minidom.parseString(data).getElementsByTagName('feed')[0].getElementsByTagName('entry')
links = []
for video in videos:
youtube_watchurl = video.getElementsByTagName('link')[0].attributes.item(0).value
links.append(get_youtube_hightest_quality_link(youtube_watchurl))
return links
def get_youtube_hightest_quality_link(youtube_watchurl, priority=config.youtube_quality_priority):
'''
Function returns the highest quality link for a specific youtube clip.
#param youtube_watchurl: The Youtube Watch Url.
#param priority: A list represents the qualities priority.
#return MetaUrlObj: MetaUrl Object.
'''
video_id = parse_qs(urlparse(youtube_watchurl).query)['v'][0]
youtube_embedded_watchurl = "http://www.youtube.com/embed/%s?autoplay=1" % video_id
d = get_youtube_dl_links(video_id)
for x in priority:
if x in d.keys():
return MetaUrl(d[x][0], 'youtube', d['VideoName'], x, youtube_embedded_watchurl)
log.error("No Youtube link has been found in get_youtube_hightest_quality_link.")
return ""
#retry(Exception, logger=log)
def get_youtube_dl_links(video_id):
'''
Function gets the download links for a youtube clip.
This function parses the get_video_info format of youtube.
#param video_id: Youtube Video ID.
#return d: A dictonary of qualities as keys and urls as values.
'''
d = {}
url = r"http://www.youtube.com/get_video_info?video_id=%s&el=vevo" % video_id
urlObj = urllib2.urlopen(url, timeout=12)
data = urlObj.read()
data = urllib2.unquote(urllib2.unquote(urllib2.unquote(data)))
data = data.replace(',url', '\nurl')
data = data.split('\n')
for line in data:
if 'timedtext' in line or 'status=fail' in line or '<AdBreaks>' in line:
continue
try:
url = line.split('&quality=')[0].split('url=')[1]
quality = line.split('&quality=')[1].split('&')[0]
except:
continue
if quality in d:
d[quality].append(url)
else:
d[quality] = [url]
try:
videoName = "|".join(data).split('&title=')[1].split('&')[0]
except Exception, e:
log.error("Could not parse VideoName out of get_video_info (%s)" % str(e))
videoName = ""
videoName = unicode(videoName, 'utf-8')
d['VideoName'] = videoName.replace('+',' ').replace('--','-')
return d
class NextList(object):
"A list with a 'next' method."
def __init__(self, l):
self.l = l
self.next_index = 0
def next(self):
if self.next_index < len(self.l):
value = self.l[self.next_index]
self.next_index += 1
return value
else:
return None
def isEOF(self):
" Checks if the list has reached the end "
return (self.next_index >= len(self.l))
class MetaUrl(object):
"a url strecture data with many metadata"
def __init__(self, url, source="", videoName="", quality="", youtube_watchurl=""):
self.url = str(url)
self.source = source
self.videoName = videoName # Youtube Links Only
self.quality = quality # Youtube Links Onlys
self.youtube_watchurl = youtube_watchurl # Youtube Links Onlys
def __repr__(self):
return "<MetaUrl '%s' | %s>" % (self.url, self.source)
def search(song, n, processes=config.search_processes):
'''
Function searches song and returns n valid .mp3 links.
#param song: Search string.
#param n: Number of songs.
#param processes: Number of processes to launch in the subprocessing pool.
'''
linksFromSources = []
pool = multiprocessing.Pool(processes)
args = [(song, source) for source in config.search_sources]
imapObj = pool.imap_unordered(_parse_star, args)
for i in range(len(args)):
linksFromSources.append(NextList(imapObj.next(15)))
pool.terminate()
links = []
next_source = 0
while len(links) < n and not all(map(lambda x: x.isEOF(), linksFromSources)):
nextItem = linksFromSources[next_source].next()
if nextItem:
log.debug("added song %.80s from source ID %d (%s)" % (nextItem.url.split('/')[-1], next_source, nextItem.source))
links.append(nextItem)
if len(linksFromSources) == next_source+1:
next_source = 0
else:
next_source += 1
return links
def _parse_star(args):
return parse(*args)
I can't reproduce your problem on my machine. What's in your processes variable? Is it an int?
Python 2.7.3 (default, Apr 10 2012, 23:31:26) [MSC v.1500 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import multiprocessing.dummy as multiprocessing
>>> pool = multiprocessing.Pool(5)
>>> pool
<multiprocessing.pool.ThreadPool object at 0x00C7DF90>
>>>
----Edit----
You probably also want to double check if you had messed up your standard library, try an clean install of python 2.7.3 in a different folder.
----Edit 2----
You can quickly patch it like this:
import multiprocessing.dummy
import weakref
import threading
class Worker(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
poll = multiprocessing.dummy.Pool(5)
print str(poll)
w = Worker()
w._children = weakref.WeakKeyDictionary()
w.start()