Moving from multiprocessing to threading

Moving from multiprocessing to threading - python

In my project, I use the multiprocessing module in order to run tasks in parallel. I want to use threading instead, as it should perform better for my workload (my tasks are network (TCP/IP) bound, not CPU or disk I/O bound).
multiprocessing has wonderful functions, such as Pool.imap_unordered and Pool.map_async, that do not exist in the threading module.
What is the right way to convert my code to use threading instead? The documentation introduces the multiprocessing.dummy class, that is a wrapper for the threading class. However that raises lots of errors (at least on python 2.7.3):
pool = multiprocessing.Pool(processes)
File "C:\python27\lib\multiprocessing\dummy\__init__.py", line 150, in Pool
return ThreadPool(processes, initializer, initargs)
File "C:\python27\lib\multiprocessing\pool.py", line 685, in __init__
Pool.__init__(self, processes, initializer, initargs)
File "C:\python27\lib\multiprocessing\pool.py", line 136, in __init__
self._repopulate_pool()
File "C:\python27\lib\multiprocessing\pool.py", line 199, in _repopulate_pool
w.start()
File "C:\python27\lib\multiprocessing\dummy\__init__.py", line 73, in start
self._parent._children[self] = None
AttributeError: '_DummyThread' object has no attribute '_children'
Edit: What actually happens is that I have a GUI that runs a different thread (to prevent the GUI from getting stuck). That thread runs the specific search function that has the ThreadPool that fails.
Edit 2: The bug has been fixed and the fix will be included in future releases.
Great to see a crasher fixed!
import urllib2, htmllib, formatter
import multiprocessing.dummy as multiprocessing
import xml.dom.minidom
import os
import string, random
from urlparse import parse_qs, urlparse
from useful_util import retry
import config
from logger import log
class LinksExtractor(htmllib.HTMLParser):
    """HTML parser that collects the ``.mp3`` hrefs of anchor tags.

    A link is dropped when it contains any entry of
    ``config.WebParser_ignoredSites``.
    """

    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        self.links = []
        self.ignoredSites = config.WebParser_ignoredSites

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag: remember its href if it is a wanted .mp3."""
        for name, value in attrs:
            if name != "href" or not value.endswith(".mp3"):
                continue
            # Skip links that mention one of the ignored sites.
            if any(site in value for site in self.ignoredSites):
                continue
            self.links.append(value)

    def get_links(self):
        """Return the list of links collected so far."""
        return self.links
def GetLinks(url, returnMetaUrlObj=False):
    '''
    Gather the distinct .mp3 links found on a web page.

    :param url: Url address of the page to scan.
    :param returnMetaUrlObj: If true, returns a list of MetaUrl objects.
        Else, returns a list of strings. Default is False.
    :return: The links found (in no particular order); an empty list when
        the page cannot be opened.
    '''
    try:
        response = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError) as e:
        log.error(e)
        return []
    # Always release the connection, even if reading the body fails.
    try:
        html = response.read()
    finally:
        response.close()
    htmlparser = LinksExtractor(formatter.NullFormatter())
    htmlparser.feed(html)
    htmlparser.close()
    # set() removes duplicates.
    links = list(set(htmlparser.get_links()))
    if returnMetaUrlObj:
        links = [MetaUrl(link) for link in links]
    return links
def isAscii(s):
    "Return True when the string can be decoded as ASCII, False otherwise."
    try:
        s.decode('ascii')
    except (UnicodeEncodeError, UnicodeDecodeError):
        is_ascii = False
    else:
        is_ascii = True
    return is_ascii
#retry(Exception, logger=log)
def parse(song, source):
    '''
    Parse the search page of a source and return the .mp3 links in it.

    :param song: Search string.
    :param source: Search website source (case-insensitive). Value can be
        dilandau, mp3skull, youtube, seekasong.
    :return: The .mp3 url links; an empty list for an unknown source.
    '''
    # Keys are lower-case because the requested source is lower-cased before
    # the lookup (the old "SeekASong" comparison could never match).
    parsers = {
        "dilandau": parse_dilandau,
        "mp3skull": parse_Mp3skull,
        "seekasong": parse_SeekASong,
        "youtube": parse_Youtube,
    }
    parser = parsers.get(source.lower())
    if parser is None:
        log.error('no source "%s". (from parse function in WebParser)' % source)
        return []
    return parser(song)
def parse_dilandau(song, pages=1):
    "Fetch the .mp3 links that Dilandau.eu lists for a song."
    if not isAscii(song):  # Dilandau doesn't like unicode.
        log.warning("Song is not ASCII. Skipping on dilandau")
        return []
    quoted = urllib2.quote(song.encode("utf8"))
    # The search term as it appears in the page address.
    slug = quoted.replace('-', '').replace(' ', '-').replace('--', '-').lower()
    links = []
    for page in range(1, pages + 1):
        url = 'http://en.dilandau.eu/download_music/%s-%d.html' % (slug, page)
        log.debug("[Dilandau] Parsing %s... " % url)
        links.extend(GetLinks(url, returnMetaUrlObj=True))
    log.debug("[Dilandau] found %d links" % len(links))
    for metaUrl in links:
        metaUrl.source = "Dilandau"
    return links
def parse_Mp3skull(song, pages=1):
    "Fetch the .mp3 links that mp3skull.com lists for a song."
    quoted = urllib2.quote(song.encode("utf8"))
    # e.g. http://mp3skull.com/mp3/how_i_met_your_mother.html
    url = 'http://mp3skull.com/mp3/%s.html' % (quoted.replace('-', '').replace(' ', '_').replace('__', '_').lower())
    links = []
    # NOTE(review): the address does not depend on the page number, so the
    # same page is requested `pages` times -- confirm the intended URL scheme.
    for _ in range(pages):
        log.debug("[Mp3skull] Parsing %s... " % url)
        links.extend(GetLinks(url, returnMetaUrlObj=True))
    log.debug("[Mp3skull] found %d links" % len(links))
    for metaUrl in links:
        metaUrl.source = "Mp3skull"
    return links
def parse_SeekASong(song):
    "Fetch the .mp3 links that seekasong.com lists for a song."
    quoted = urllib2.quote(song.encode("utf8"))
    slug = quoted.replace('-', '').replace(' ', '_').replace('__', '_').lower()
    url = 'http://www.seekasong.com/mp3/%s.html' % (slug)
    log.debug("[SeekASong] Parsing %s... " % url)
    found = GetLinks(url, returnMetaUrlObj=True)
    for metaUrl in found:
        metaUrl.source = "SeekASong"
    log.debug("[SeekASong] found %d links" % len(found))
    return found
def parse_Youtube(song, amount=10):
    '''
    Search a song on youtube.com using the Youtube API and return its clips.

    :param song: The search string.
    :param amount: Amount of clips to obtain.
    :return: List with one entry per clip found, each being whatever
        get_youtube_hightest_quality_link returns for that clip.
    '''
    song = urllib2.quote(song.encode("utf8"))
    url = r"http://gdata.youtube.com/feeds/api/videos?q=%s&max-results=%d&v=2" % (song.replace(' ', '+'), amount)
    urlObj = urllib2.urlopen(url, timeout=4)
    # Release the connection even when reading the feed fails.
    try:
        data = urlObj.read()
    finally:
        urlObj.close()
    feed = xml.dom.minidom.parseString(data).getElementsByTagName('feed')[0]
    links = []
    for video in feed.getElementsByTagName('entry'):
        # The watch url is the value of the first attribute of the entry's first <link>.
        youtube_watchurl = video.getElementsByTagName('link')[0].attributes.item(0).value
        links.append(get_youtube_hightest_quality_link(youtube_watchurl))
    return links
def get_youtube_hightest_quality_link(youtube_watchurl, priority=config.youtube_quality_priority):
    '''
    Pick the best available download link of a youtube clip.

    :param youtube_watchurl: The Youtube Watch Url.
    :param priority: A list of qualities, most preferred first.
    :return: A MetaUrl object for the first quality of ``priority`` that is
        available, or an empty string when none is.
    '''
    query = urlparse(youtube_watchurl).query
    video_id = parse_qs(query)['v'][0]
    youtube_embedded_watchurl = "http://www.youtube.com/embed/%s?autoplay=1" % video_id
    links_by_quality = get_youtube_dl_links(video_id)
    for quality in priority:
        if quality not in links_by_quality:
            continue
        first_url = links_by_quality[quality][0]
        return MetaUrl(first_url, 'youtube', links_by_quality['VideoName'], quality, youtube_embedded_watchurl)
    log.error("No Youtube link has been found in get_youtube_hightest_quality_link.")
    return ""
#retry(Exception, logger=log)
def get_youtube_dl_links(video_id):
    '''
    Get the download links for a youtube clip.

    This function parses the get_video_info format of youtube.

    :param video_id: Youtube Video ID.
    :return: A dictionary mapping each quality to the list of urls found for
        it, plus the key 'VideoName' holding the clip title (empty when the
        title cannot be parsed).
    '''
    d = {}
    info_url = r"http://www.youtube.com/get_video_info?video_id=%s&el=vevo" % video_id
    urlObj = urllib2.urlopen(info_url, timeout=12)
    # Release the connection even when reading the response fails.
    try:
        data = urlObj.read()
    finally:
        urlObj.close()
    # The payload is unquoted three times before it is split.
    data = urllib2.unquote(urllib2.unquote(urllib2.unquote(data)))
    # Put every ",url" entry on a line of its own.
    data = data.replace(',url', '\nurl').split('\n')
    for line in data:
        if 'timedtext' in line or 'status=fail' in line or '<AdBreaks>' in line:
            continue
        parts = line.split('&quality=')
        try:
            link = parts[0].split('url=')[1]
            quality = parts[1].split('&')[0]
        except IndexError:
            # The line holds no "url=...&quality=..." pair.
            continue
        d.setdefault(quality, []).append(link)
    try:
        videoName = "|".join(data).split('&title=')[1].split('&')[0]
    except Exception as e:
        log.error("Could not parse VideoName out of get_video_info (%s)" % str(e))
        videoName = ""
    videoName = unicode(videoName, 'utf-8')
    d['VideoName'] = videoName.replace('+', ' ').replace('--', '-')
    return d
class NextList(object):
    "List wrapper that hands out its items one at a time through next()."

    def __init__(self, l):
        self.l = l
        self.next_index = 0  # position of the item the next call returns

    def next(self):
        "Return the next unread item, or None once every item was returned."
        position = self.next_index
        if position >= len(self.l):
            return None
        self.next_index = position + 1
        return self.l[position]

    def isEOF(self):
        " Checks if the list has reached the end "
        return not (self.next_index < len(self.l))
class MetaUrl(object):
    "A url together with metadata describing where it comes from."

    def __init__(self, url, source="", videoName="", quality="", youtube_watchurl=""):
        # The url is always stored as a str.
        self.url = str(url)
        self.source = source
        # The three attributes below are used for Youtube links only.
        self.videoName = videoName
        self.quality = quality
        self.youtube_watchurl = youtube_watchurl

    def __repr__(self):
        details = (self.url, self.source)
        return "<MetaUrl '%s' | %s>" % details
def search(song, n, processes=config.search_processes):
    '''
    Search a song on every configured source and return up to n links.

    The links are taken from the sources in turn (round-robin), so the
    result mixes links from all of them.

    :param song: Search string.
    :param n: Number of songs.
    :param processes: Number of workers to launch in the pool.
    :return: List of at most n link objects.
    '''
    args = [(song, source) for source in config.search_sources]
    pool = multiprocessing.Pool(processes)
    # Terminate the pool even when a source exceeds the 15 second wait and
    # next() raises, so the workers are never left behind.
    try:
        imapObj = pool.imap_unordered(_parse_star, args)
        linksFromSources = [NextList(imapObj.next(15)) for _ in args]
    finally:
        pool.terminate()
    links = []
    next_source = 0
    while len(links) < n and not all(src.isEOF() for src in linksFromSources):
        nextItem = linksFromSources[next_source].next()
        if nextItem:
            log.debug("added song %.80s from source ID %d (%s)" % (nextItem.url.split('/')[-1], next_source, nextItem.source))
            links.append(nextItem)
        # Move on to the following source, wrapping around after the last one.
        next_source = (next_source + 1) % len(linksFromSources)
    return links


def _parse_star(args):
    "Call parse() with a (song, source) tuple; pool functions pass one argument."
    return parse(*args)

I can't reproduce your problem on my machine. What's in your processes variable? Is it an int?
Python 2.7.3 (default, Apr 10 2012, 23:31:26) [MSC v.1500 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import multiprocessing.dummy as multiprocessing
>>> pool = multiprocessing.Pool(5)
>>> pool
<multiprocessing.pool.ThreadPool object at 0x00C7DF90>
>>>
----Edit----
You probably also want to double-check whether you have messed up your standard library; try a clean install of Python 2.7.3 in a different folder.
----Edit 2----
You can quickly patch it like this:
import multiprocessing.dummy
import weakref
import threading
class Worker(threading.Thread):
    "Thread whose run() creates a thread-based pool and prints it."

    def run(self):
        pool = multiprocessing.dummy.Pool(5)
        print(str(pool))


w = Worker()
# The traceback shows the pool doing `self._parent._children[self] = None`;
# give the thread that creates the pool the attribute it is missing.
w._children = weakref.WeakKeyDictionary()
w.start()

Related

How should I write a function in Python that check whether a user has fetched some url or not?

I'm studying Python and I'd like to have some hints on how to write it as I read about the implementation of a fetch(url) function as follows:
# Maps a user name to a dict of {url: downloaded content}.
_cache = {}


def fetch(url):
    """Return the content of ``url``, downloading it at most once per user.

    The user is read from the ``USER`` environment variable.
    """
    user = os.environ['USER']
    per_user = _cache.setdefault(user, {})
    if url not in per_user:
        per_user[url] = requests.get(url).content
    return per_user[url]
and I'm trying to figure out how to modify this kind of function so that it checks whether a user has fetched some url or not, assuming that fetching a resource from the web might take 0.1 seconds, while fetching it from the cache provides an instant result.
It should be something like
import sys # ignore
sys.path.insert(0,'.') # ignore
from Root.fetch import fetch
def did_fetch(user, url):
    """Placeholder: should return whether url has been fetched by user."""
with the function to be implemented inside

You can use the code below to find all process running, and then loop through each process and see if say Facebook is in there, it won't show the URL of the page but it will show what you see on the tab, you can use that
import ctypes # process find
# Windows user32 entry points used to enumerate the top-level windows.
EnumWindows = ctypes.windll.user32.EnumWindows
GetWindowText = ctypes.windll.user32.GetWindowTextW
GetWindowTextLength = ctypes.windll.user32.GetWindowTextLengthW
IsWindowVisible = ctypes.windll.user32.IsWindowVisible
# Signature of the callback that EnumWindows invokes once per window.
EnumWindowsProc = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_int),
                                     ctypes.POINTER(ctypes.c_int))

####### Modules to gather data
titles = []  # Empty list for titles (As String Objects)


def foreach_window(hwnd, lParam):
    """Append the title of a visible window to ``titles``.

    Always returns True so that the enumeration continues.
    """
    if not IsWindowVisible(hwnd):
        return True
    length = GetWindowTextLength(hwnd)
    buff = ctypes.create_unicode_buffer(length + 1)
    GetWindowText(hwnd, buff, length + 1)
    titles.append(buff.value)
    return True


EnumWindows(EnumWindowsProc(foreach_window), 0)
if 'YOUR TITLE OF WEBPAGE NOT URL' in titles:
    print('DO SOMETHING HERE')
print(titles)
Edit: To answer your comment, I've adjusted it like this, not sure how you want the user bit to work, so I left that out, if you give further explanation then I might be able to figure something out, also, this only works if the tab is currently open, so yeah
import ctypes # process find
def did_fetch(url):
    """Return True when ``url`` occurs in the title of a visible window.

    Despite the parameter name, the text is matched against window titles
    (what is shown on the tab), not against page addresses. Windows only:
    the titles are collected through user32 via ctypes.

    :param url: Text to look for in the window titles.
    :return: True if some title contains the text, otherwise False.
    """
    user32 = ctypes.windll.user32
    EnumWindowsProc = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_int),
                                         ctypes.POINTER(ctypes.c_int))
    titles = []  # titles of the visible windows

    def foreach_window(hwnd, lParam):
        if user32.IsWindowVisible(hwnd):
            length = user32.GetWindowTextLengthW(hwnd)
            buff = ctypes.create_unicode_buffer(length + 1)
            user32.GetWindowTextW(hwnd, buff, length + 1)
            titles.append(buff.value)
        return True  # keep enumerating

    user32.EnumWindows(EnumWindowsProc(foreach_window), 0)
    # Return an explicit boolean instead of falling through with None.
    return any(url in title for title in titles)


did_fetch('Facebook')

Yes exactly, that's the code I wrote down, based on your advice:
import sys # ignore
sys.path.insert(0,'.') # ignore
from Root.fetch import fetch
import ctypes # process find
def did_fetch(user, url):
    """Return True when ``url`` occurs in the title of a visible window.

    The text is matched against window titles, not page addresses, and only
    windows that are currently open are seen. Windows only: the titles are
    collected through user32 via ctypes.

    :param user: Not used by this implementation.
    :param url: Text to look for in the window titles.
    :return: True if some title contains the text, otherwise False.
    """
    user32 = ctypes.windll.user32
    EnumWindowsProc = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_int),
                                         ctypes.POINTER(ctypes.c_int))
    titles = []  # titles of the visible windows

    def foreach_window(hwnd, lParam):
        if user32.IsWindowVisible(hwnd):
            length = user32.GetWindowTextLengthW(hwnd)
            buff = ctypes.create_unicode_buffer(length + 1)
            user32.GetWindowTextW(hwnd, buff, length + 1)
            titles.append(buff.value)
        return True  # keep enumerating

    user32.EnumWindows(EnumWindowsProc(foreach_window), 0)
    # Return an explicit boolean instead of falling through with None.
    return any(url in title for title in titles)
did_fetch(user, url)
I used the template that the teacher gave me, and I implemented it with your advice... what's wrong in it?

import sys # ignore
import os
sys.path.insert(0,'.') # ignore
from Root.fetch import fetch
import time
def did_fetch(user, url, threshold=0.1):
    """Guess from the response time whether ``user`` already fetched ``url``.

    fetch() is called on behalf of ``user`` and timed; a call that takes no
    longer than ``threshold`` seconds is taken to have been answered without
    a download.

    NOTE(review): the probe itself calls fetch(), so if fetch() caches its
    results the url counts as fetched for this user afterwards -- confirm
    this is acceptable.

    :param user: User name; placed in the USER environment variable while
        fetch() runs.
    :param url: Address to probe.
    :param threshold: Longest duration, in seconds, still counted as already
        fetched. Default is 0.1.
    :return: True if the call took at most ``threshold`` seconds, else False.
    """
    previous_user = os.environ.get('USER')
    os.environ['USER'] = user
    try:
        start = time.time()
        fetch(url)
        elapsed = time.time() - start
    finally:
        # Do not leave the process-wide USER variable modified.
        if previous_user is None:
            del os.environ['USER']
        else:
            os.environ['USER'] = previous_user
    return elapsed <= threshold

Iteratively process large wikipedia dump

I want to parse a large wikipedia dump iteratively. I found a tutorial for this here: https://towardsdatascience.com/wikipedia-data-science-working-with-the-worlds-largest-encyclopedia-c08efbac5f5c
However, when I want to read in the data like this:
data_path = 'C:\\Users\\Me\\datasets\\dewiki-latest-pages-articles1.xml-p1p262468.bz2'
import xml.sax
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX.

    Collects a ``(title, text)`` tuple in ``_pages`` for every ``page``
    element that ends.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # text chunks of the element being captured
        self._values = {}          # last captured text per tag name
        self._current_tag = None   # 'title' or 'text' while one is open
        self._pages = []           # collected (title, text) tuples

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
            # Stop capturing, otherwise the text of every following element
            # would keep piling up in the buffer until the next title/text.
            self._current_tag = None
            self._buffer = None
        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))
import bz2

# Object for handling xml
handler = WikiXmlHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
# Iteratively process file: decompress it with the standard library instead
# of piping it through an external `bzcat` program.
with bz2.BZ2File(data_path) as dump:
    for line in dump:
        parser.feed(line)
        # Stop once more than 3 articles have been found
        if len(handler._pages) > 3:
            break
it seems like nothing happens. The handler._pages list is empty. This is where the parsed articles should be stored. I also added shell=True because otherwise I get the error message FileNotFoundError: [WinError 2].
I never worked with subprocesses in python so I don't know what the problem might be.
I also tried to specify the data_path differently (with / and //).
Thank you in advance.

Why does this pickle reach maximum recursion depth without recursion?

This is my code, it contains no recursion, but it hits maximum recursion depth on first pickle...
Code:
#!/usr/bin/env python
from bs4 import BeautifulSoup
from urllib2 import urlopen
import pickle
# open page and return soup list
def get_page_startups(page_url):
    """Download a listing page and return its startup entries.

    :param page_url: Address of the listing page.
    :return: The ``div`` elements of class "startup item" found on the page.
    """
    response = urlopen(page_url)
    # Release the connection even when reading fails.
    try:
        html = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(html, "lxml")
    return soup.find_all("div", "startup item")
#
# Get certain text from startup soup
#
_TEXT_TYPE = type(u"")  # plain text type: unicode on Python 2, str on Python 3


def _plain_text(node_string):
    """Turn a BeautifulSoup string into a plain text string (None stays None).

    The ``.string`` of a tag is a NavigableString that still references the
    whole parse tree; pickling it drags the tree along and is what exhausts
    the recursion limit.
    """
    if node_string is None:
        return None
    return _TEXT_TYPE(node_string)


def _visit_link(startup, css_class):
    """Return the href of the ``<a>`` with the given class, or None if absent."""
    try:
        return startup.find("a", css_class)["href"]
    except TypeError:
        # find() returned None: the startup has no such link.
        return None


def get_name(startup):
    """Return the startup's name as plain text."""
    return _plain_text(startup.find("a", "profile").string)


def get_website(startup):
    """Return the address of the startup's website."""
    return startup.find("a", "visit")["href"]


def get_status(startup):
    """Return the status text with its first 8 characters removed."""
    return startup.find("p", "status").strong.string[8:]


def get_twitter(startup):
    """Return the text of the ``comment`` link as plain text."""
    return _plain_text(startup.find("a", "comment").string)


def get_high_concept_pitch(startup):
    """Return the second ``<em>`` of the headline as plain text."""
    return _plain_text(startup.find("div", "headline").find_all("em")[1].string)


def get_elevator_pitch(startup):
    """Download the startup's profile page and return its description."""
    response = urlopen("http://startupli.st" + startup.find("a", "profile")["href"])
    # Release the connection even when reading fails.
    try:
        html = response.read()
    finally:
        response.close()
    startup_soup = BeautifulSoup(html, "lxml")
    return startup_soup.find("p", "desc").string.strip()


def get_tags(startup):
    """Return the tags paragraph as plain text."""
    return _plain_text(startup.find("p", "tags").string)


def get_blog(startup):
    """Return the blog address, or None when there is none."""
    return _visit_link(startup, "visit blog")


def get_facebook(startup):
    """Return the facebook address, or None when there is none."""
    return _visit_link(startup, "visit facebook")


def get_angellist(startup):
    """Return the angellist address, or None when there is none."""
    return _visit_link(startup, "visit angellist")


def get_linkedin(startup):
    """Return the linkedin address, or None when there is none."""
    return _visit_link(startup, "visit linkedin")


def get_crunchbase(startup):
    """Return the crunchbase address, or None when there is none."""
    return _visit_link(startup, "visit crunchbase")
# site to scrape
BASE_URL = "http://startupli.st/startups/latest/"
# scrape all pages
for page_no in xrange(1, 142):
    startups = get_page_startups(BASE_URL + str(page_no))
    # search soup and pickle data
    for i, startup in enumerate(startups):
        s = {}
        s['name'] = get_name(startup)
        s['website'] = get_website(startup)
        s['status'] = get_status(startup)
        s['high_concept_pitch'] = get_high_concept_pitch(startup)
        s['elevator_pitch'] = get_elevator_pitch(startup)
        s['tags'] = get_tags(startup)
        s['twitter'] = get_twitter(startup)
        s['facebook'] = get_facebook(startup)
        s['blog'] = get_blog(startup)
        s['angellist'] = get_angellist(startup)
        s['linkedin'] = get_linkedin(startup)
        s['crunchbase'] = get_crunchbase(startup)
        # NOTE(review): the file name depends only on the position within
        # the page, so every page appears to overwrite the files of the
        # previous one -- confirm whether page_no should be part of the name.
        # `with` closes the file even when pickle.dump raises.
        with open(str(i) + ".pkl", "wb") as f:
            pickle.dump(s, f)
    print("Done " + str(page_no))
This is the content of 0.pkl after the exception is raised:
http://pastebin.com/DVS1GKzz Thousand lines long!
There's some HTML from the BASE_URL in the pickle... but I didn't pickle any html strings...

BeautifulSoup .string attributes aren't actually strings:
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup('<div>Foo</div>')
>>> soup.find('div').string
u'Foo'
>>> type(soup.find('div').string)
bs4.element.NavigableString
Try using str(soup.find('div').string) instead and see if it helps. Also, I don't think Pickle is really the best format here. JSON is much easier in this case.

Most likely pickle is doing recursion internally, and the file you are trying to parse is too big. You could try to increase the limit on the number of recursions allowed.
import sys
# Raise the interpreter's maximum recursion depth to 10000.
sys.setrecursionlimit(10000)
This is however not recommended for any type of production ready application, as it may mask actual issue, but could help highlight issue(s) during debugging.

Pickle cannot handle BeautifulSoup nodes. Similar questions with some workarounds:
RuntimeError: maximum recursion depth exceeded with Python 3.2 pickle.dump
pickle.dump meet RuntimeError: maximum recursion depth exceeded in cmp

Automatic background changer using Python 2.7.3 not working, though it should

I'm very new to Ubuntu/Python/Bash/Gnome in general, so I still feel like there's a chance I'm doing something wrong, but it's been 3 days now without success...
Here's what the script is supposed to do:
* [✓] Download 1 random image from wallbase.cc
* [✓] Save it to the same directory that the script is running from
* [x] Set it as the wallpaper
There are two attempts made to set the wallpaper, using two different commands, and NEITHER works when run from the script. There is a print statement (2nd line from the bottom) that spits out the correct terminal command: I can copy and paste the printed result and it works fine; it just doesn't work when it's executed in the script.
#!/usr/bin/env python
import urllib2
import os
from gi.repository import Gio
import subprocess

# Open a random-wallpaper listing and locate the link to its first entry.
response = urllib2.urlopen("http://wallbase.cc/random/12/eqeq/1366x768/0.000/100/32")
page_source = response.read()
thlink_pos = page_source.find("ico-X")
address_start = (page_source.find("href=\"", thlink_pos) + 6)
address_end = page_source.find("\"", address_start + 1)
# Open the wallpaper's own page and locate the address of the image.
response = urllib2.urlopen(page_source[address_start:address_end])
page_source = response.read()
bigwall_pos = page_source.find("bigwall")
address_start = (page_source.find("src=\"", bigwall_pos) + 5)
address_end = page_source.find("\"", address_start + 1)
address = page_source[address_start:address_end]
slash_pos = address.rfind("/") + 1
pic_name = address[slash_pos:]
# The address is scraped from a web page: hand it to wget as a single
# argument instead of pasting it into a shell command line.
subprocess.call(["wget", address])
print("Does my new image exists? %s" % os.path.exists(os.getcwd() + "/" + pic_name))
#attempt 1
settings = Gio.Settings.new("org.gnome.desktop.background")
settings.set_string("picture-uri", "file://" + os.getcwd() + "/" + pic_name)
settings.apply()
#attempt 2
picture_uri = "file://" + os.getcwd() + "/" + pic_name
bashCommand = "gsettings set org.gnome.desktop.background picture-uri " + picture_uri
print(bashCommand)
# Same command as printed above, run without a shell so that the scraped
# file name cannot be interpreted as shell syntax.
subprocess.call(["gsettings", "set", "org.gnome.desktop.background",
                 "picture-uri", picture_uri])
settings.apply()

You've successfully changed your settings, but they're still left unapplied, try next:
settings.apply()
after setting "picture-uri" string.

It works for me (Ubuntu 12.04).
I've modified your script (unrelated to your error):
#!/usr/bin/python
"""Set desktop background using random images from http://wallbase.cc
It uses `gi.repository.Gio.Settings` to set the background.
"""
import functools
import itertools
import logging
import os
import posixpath
import random
import re
import sys
import time
import urllib
import urllib2
import urlparse
from collections import namedtuple
from bs4 import BeautifulSoup # $ sudo apt-get install python-bs4
from gi.repository.Gio import Settings # pylint: disable=F0401,E0611
# Directory used for the downloaded images when none is given on the command line.
DEFAULT_IMAGE_DIR = os.path.expanduser('~/Pictures/backgrounds')
# Upper bound on how much of an html page is read before parsing it.
HTMLPAGE_SIZE_MAX = 1 << 20 # bytes
# Shortest pause between iterations of the main loop.
TIMEOUT_MIN = 300 # seconds
# Largest random amount added on top of a pause.
TIMEOUT_DELTA = 30 # jitter
# "Anime/Manga", "Wallpapers/General", "High Resolution Images"
CATEGORY_W, CATEGORY_WG, CATEGORY_HR = range(1, 4)
# Purity flags; they can be combined with | and are sent as a 3-digit binary string.
PURITY_SFW, PURITY_SKETCHY, PURITY_NSFW, PURITY_DEFAULT = 4, 2, 1, 0
DAY_IN_SECONDS = 86400
# Result of download(): the local path and the response headers
# (headers are None when the file already existed).
UrlRetreiveResult = namedtuple('UrlRetreiveResult', "path headers")
def set_background(image_path, check_exist=True):
    """Make the image at `image_path` the GNOME desktop background.

    With `check_exist` true, one byte is read from the file first, so an
    unreadable path raises before any setting is touched.
    """
    if check_exist:  # make sure we can read it (at this time)
        with open(image_path, 'rb') as image_file:
            image_file.read(1)

    # build a file:// uri from the absolute path
    absolute = os.path.abspath(image_path)
    if isinstance(absolute, unicode):  # quote() doesn't like unicode
        absolute = absolute.encode('utf-8')
    picture_uri = 'file://' + urllib.quote(absolute)

    # store the uri in the desktop background settings
    background = Settings.new('org.gnome.desktop.background')
    background.set_string('picture-uri', picture_uri)
    background.apply()
def url2filename(url):
    """Return the file name that the path of `url` ends with.

    >>> url2filename('http://example.com/path/to/file?opt=1')
    'file'

    Raise ValueError when the name would contain a path separator of the
    local platform.
    """
    urlpath = urlparse.urlsplit(url).path  # pylint: disable=E1103
    decoded_path = urllib.unquote(urlpath)
    name = posixpath.basename(decoded_path)
    if os.path.basename(name) == name:
        return name
    raise ValueError  # refuse 'dir%5Cbasename.ext' on Windows
def download(url, dirpath, extensions=True, filename=None):
    """Download url to dirpath.

    Use basename of the url path as a filename unless `filename` is given.
    Create destination directory if necessary.

    Use `extensions` to require the file to have an extension (any true
    value) or to have one of a given sequence of extensions.

    Return (path, headers) on success.
    Don't retrieve url if path exists (headers are None in this case).
    Return None when the file name is rejected or the download fails.
    """
    if not os.path.isdir(dirpath):
        os.makedirs(dirpath)
        logging.info('created directory %s', dirpath)

    # get filename from the url
    filename = url2filename(url) if filename is None else filename
    if os.path.basename(filename) != filename:
        logging.critical('filename must not have path separator in it "%s"',
                         filename)
        return

    if extensions:
        # require the file to have an extension
        root, ext = os.path.splitext(filename)
        if root and len(ext) > 1:
            # require the extension to be in the list
            try:
                it = iter(extensions)
            except TypeError:
                pass  # not a sequence (e.g. True): any extension will do
            else:
                if ext not in it:
                    logging.warning(("file extension is not in the list"
                                     " url=%s"
                                     " extensions=%s"),
                                    url, extensions)
                    return
        else:
            logging.warning("file has no extension url=%s", url)
            return

    # download file
    path = os.path.join(dirpath, filename)
    logging.info("%s\n%s", url, path)
    if os.path.exists(path):  # don't retrieve if path exists
        logging.info('path exists')
        return UrlRetreiveResult(path, None)
    try:
        return UrlRetreiveResult(*urllib.urlretrieve(url, path,
                                                     _print_download_status))
    except IOError:
        logging.warning('failed to download {url} -> {path}'.format(
            url=url, path=path))
        # A failed transfer may leave a partial file behind; remove it so
        # that a later call does not mistake it for a finished download.
        if os.path.exists(path):
            try:
                os.remove(path)
            except OSError:
                logging.warning("can't remove partial file %s", path)
def _print_download_status(block_count, block_size, total_size):
logging.debug('%10s bytes of %s', block_count * block_size, total_size)
def min_time_between_calls(min_delay):
    """Enforce minimum time delay between calls.

    Return a decorator. The decorated function sleeps before running when
    less than `min_delay` seconds have passed since its previous call began.
    """
    def decorator(func):
        lastcall = [None]  # emulate nonlocal keyword

        # Keep the name and docstring of the decorated function.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if lastcall[0] is not None:
                delay = time.time() - lastcall[0]
                if delay < min_delay:
                    _sleep(min_delay - delay)
            lastcall[0] = time.time()
            return func(*args, **kwargs)
        return wrapper
    return decorator
#min_time_between_calls(5)
def _makesoup(url):
    """Download `url` (an address or a urllib2.Request) and parse it.

    At most HTMLPAGE_SIZE_MAX bytes of the page are read.
    Return the BeautifulSoup object, or None when the page can't be
    retrieved (the error is logged).
    """
    try:
        logging.info(vars(url) if isinstance(url, urllib2.Request) else url)
        page = urllib2.urlopen(url)
        # Release the connection even when reading fails.
        try:
            markup = page.read(HTMLPAGE_SIZE_MAX)
        finally:
            page.close()
        return BeautifulSoup(markup)
    except (IOError, OSError) as e:
        logging.warning('failed to return soup for %s, error: %s',
                        getattr(url, 'get_full_url', lambda: url)(), e)
class WallbaseImages:
    """Given parameters it provides image urls to download.

    Iterating over an instance requests a random listing from wallbase.cc
    and yields a (url, filename) pair for every image it can locate.
    """

    def __init__(self,
                 categories=None,  # default; sequence of CATEGORY_*
                 resolution_exactly=True,  # False means 'at least'
                 resolution=None,  # all; (width, height)
                 aspect_ratios=None,  # all; sequence eg, [(5,4),(16,9)]
                 purity=PURITY_DEFAULT,  # combine with |
                 thumbs_per_page=None,  # default; an integer
                 ):
        """See usage below."""
        self.categories = categories
        self.resolution_exactly = resolution_exactly
        self.resolution = resolution
        self.aspect_ratios = aspect_ratios
        self.purity = purity
        self.thumbs_per_page = thumbs_per_page

    def _as_request(self):
        """Create a urllib2.Request() using given parameters."""
        # make url
        if self.categories is not None:
            categories = "".join(str(n) for n in (2, 1, 3)
                                 if n in self.categories)
        else:  # default
            categories = "0"

        if self.resolution_exactly:
            at_least_or_exactly_resolution = "eqeq"
        else:
            at_least_or_exactly_resolution = "gteq"

        if self.resolution is not None:
            resolution = "{width:d}x{height:d}".format(
                width=self.resolution[0], height=self.resolution[1])
        else:
            resolution = "0x0"

        if self.aspect_ratios is not None:
            aspect_ratios = "+".join("%.2f" % (w / float(h),)
                                     for w, h in self.aspect_ratios)
        else:  # default
            aspect_ratios = "0"

        purity = "{0:03b}".format(self.purity)
        thumbs = 20 if self.thumbs_per_page is None else self.thumbs_per_page
        # Name every field explicitly rather than formatting from locals().
        url = ("http://wallbase.cc/random/"
               "{categories}/"
               "{at_least_or_exactly_resolution}/{resolution}/"
               "{aspect_ratios}/"
               "{purity}/{thumbs:d}").format(
                   categories=categories,
                   at_least_or_exactly_resolution=at_least_or_exactly_resolution,
                   resolution=resolution,
                   aspect_ratios=aspect_ratios,
                   purity=purity,
                   thumbs=thumbs)
        logging.info(url)

        # make post data
        data = urllib.urlencode(dict(query='', board=categories, nsfw=purity,
                                     res=resolution,
                                     res_opt=at_least_or_exactly_resolution,
                                     aspect=aspect_ratios,
                                     thpp=thumbs))
        req = urllib2.Request(url, data)
        return req

    def __iter__(self):
        """Yield background image urls."""
        # find links to bigwall pages
        # css-like: #thumbs div[class="thumb"] \
        #    a[class~="thlink" and href^="http://"]
        soup = _makesoup(self._as_request())
        if not soup:
            logging.warning("can't retrieve the main page")
            return
        thumbs_soup = soup.find(id="thumbs")
        if thumbs_soup is None:
            # The page was retrieved but lacks the expected container.
            logging.warning("can't find thumbs on the main page")
            return
        for thumb in thumbs_soup.find_all('div', {'class': "thumb"}):
            bigwall_a = thumb.find('a', {'class': "thlink",
                                         'href': re.compile(r"^http://")})
            if bigwall_a is None:
                logging.warning("can't find thlink link")
                continue  # try the next thumb

            # find image url on the bigwall page
            # css-like: #bigwall > img[alt and src^="http://"]
            bigwall_soup = _makesoup(bigwall_a['href'])
            if bigwall_soup is not None:
                bigwall = bigwall_soup.find(id='bigwall')
                if bigwall is not None:
                    img = bigwall.find('img',
                                       src=re.compile(r"(?i)^http://.*\.jpg$"),
                                       alt=True)
                    if img is not None:
                        url = img['src']
                        filename = url2filename(url)
                        if filename.lower().endswith('.jpg'):
                            yield url, filename  # successfully found image url
                        else:
                            logging.warning('suspicious url "%s"', url)
                        continue
            logging.warning("can't parse bigwall page")
def main():
# Entry point: configure logging, pick the image directory, then keep
# downloading images and setting them as the desktop background.
# NOTE(review): the indentation of this copy was lost, so the nesting of the
# statements below has to be confirmed against the original script.
level = logging.INFO
# A '-d' command-line flag switches logging to DEBUG; it is removed from
# sys.argv so that it is not taken for the directory argument below.
if '-d' in sys.argv:
sys.argv.remove('-d')
level = logging.DEBUG
# configure logging
logging.basicConfig(format='%(levelname)s: %(asctime)s %(message)s',
level=level, datefmt='%Y-%m-%d %H:%M:%S %Z')
# The first remaining argument, if any, is the directory for the images.
if len(sys.argv) > 1:
backgrounds_dir = sys.argv[1]
else:
backgrounds_dir = DEFAULT_IMAGE_DIR
# infinite loop: Press Ctrl+C to interrupt it
#NOTE: here's some arbitrary logic: modify for your needs e.g., break
# after the first image found
timeout = TIMEOUT_MIN # seconds
for i in itertools.cycle(xrange(timeout, DAY_IN_SECONDS)):
found = False
try:
for url, filename in WallbaseImages(
categories=[CATEGORY_WG, CATEGORY_HR, CATEGORY_W],
purity=PURITY_SFW,
thumbs_per_page=60):
res = download(url, backgrounds_dir, extensions=('.jpg',),
filename=filename)
# download() returns None on failure, hence the check before using res.path
if res and res.path:
found = True
set_background(res.path)
# don't hammer the site
timeout = max(TIMEOUT_MIN, i % DAY_IN_SECONDS)
_sleep(random.randint(timeout, timeout + TIMEOUT_DELTA))
except Exception: # pylint: disable=W0703
# log the error with its traceback, then pause
logging.exception('unexpected error')
_sleep(timeout)
else:
if not found:
logging.error('failed to retrieve any images')
_sleep(timeout)
# double the pause, wrapping around at one day
timeout = (timeout * 2) % DAY_IN_SECONDS
def _sleep(timeout):
"""Add logging to time.sleep() call."""
logging.debug('sleep for %s seconds', timeout)
time.sleep(timeout)
main()

Tried to implement a python script that used the PIL library to write text on an image then update the Gnome background "picture-uri" to point to that image using the Gio class. The python script would ping pong between two images to always modify the one not in use and then attempt to "switch" by updating the Settings. Did this to avoid any flicker as modifying the current background directly drops it out temporarily. While in the shell and calling the script directly I rarely saw any issue, but in the cronjob it simply wouldn't update on the pong. I used both sync and apply and would wait several minutes before trying to switch the images. Didn't work. Tried cron as user (su -c "cmd" user) and that didn't work either.
Finally gave up on the ping pong approach when I noticed that Gnome will detect any change in the background file and update. So dropped the ping pong method and went to a temp file that I just copy over the current background using the shutil library. Works like a charm.

split a pdf based on outline

I would like to use pyPdf to split a PDF file based on its outline, where each destination in the outline refers to a different page within the PDF.
example outline:
main --> points to page 1
sect1 --> points to page 1
sect2 --> points to page 15
sect3 --> points to page 22
it is easy within pyPdf to iterate over each page of the document or each destination in the document's outline; however, i cannot figure out how to get the page number where the destination points.
does anybody know how to find the referencing page number for each destination in the outline?

I figured it out:
class Darrell(pyPdf.PdfFileReader):
    """PdfFileReader that can report which page each outline entry points to."""

    def getDestinationPageNumbers(self):
        """Return a dict mapping each outline title to its page index.

        Page indexes are zero-based (add 1 for a human-readable page number).
        A destination whose page object is not found while walking the page
        tree maps to the string ``'???'`` instead of an int.  Destinations
        that share a title collapse into one entry, because the returned
        dict is keyed by title only.
        """
        def _setup_outline_page_ids(outline, _result=None):
            # Build {(id(destination), title): page object id}, recursing
            # into nested lists found inside the outline.
            if _result is None:
                _result = {}
            for obj in outline:
                if isinstance(obj, pyPdf.pdf.Destination):
                    _result[(id(obj), obj.title)] = obj.page.idnum
                elif isinstance(obj, list):
                    _setup_outline_page_ids(obj, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            # Build {page object id: zero-based page index} by walking the
            # page tree depth-first.  _num_pages is a list used as a shared
            # counter: one element is appended per "/Page" leaf visited.
            if _result is None:
                _result = {}
            if pages is None:
                # initial call: start at the root of the page tree
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            t = pages["/Type"]
            if t == "/Pages":
                for page in pages["/Kids"]:
                    # leaves counted so far == index of the next leaf
                    _result[page.idnum] = len(_num_pages)
                    _setup_page_id_to_num(page.getObject(), _result, _num_pages)
            elif t == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()

        result = {}
        for (_, title), page_idnum in outline_page_ids.items():
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result
# PATH-TO-PDF is a placeholder: substitute the path of the PDF to inspect.
with open(PATH-TO-PDF, 'rb') as pdf_file:
    pdf = Darrell(pdf_file)
    # read the outline while the file is still open
    destinations = pdf.getDestinationPageNumbers()

template = '%-5s %s'
print(template % ('page', 'title'))

# getDestinationPageNumbers() reports unresolved pages as the string '???';
# keep those out of the sort and the "+ 1" arithmetic, which only work on ints.
resolved = sorted((page, title) for title, page in destinations.items()
                  if isinstance(page, int))
unresolved = sorted(title for title, page in destinations.items()
                    if not isinstance(page, int))

for p, t in resolved:
    print(template % (p + 1, t))  # stored index is zero-based
for t in unresolved:
    print(template % ('???', t))

This is just what I was looking for. Darrell's additions to PdfFileReader should be part of PyPDF2.
I wrote a little recipe that uses PyPDF2 and sejda-console to split a PDF by bookmarks. In my case there are several Level 1 sections that I want to keep together. This script allows me to do that and give the resulting files meaningful names.
import operator
import os
import subprocess
import sys
import time
import PyPDF2 as pyPdf
# need to have sejda-console installed
# change this to point to your installation
sejda = 'C:\\sejda-console-1.0.0.M2\\bin\\sejda-console.bat'
class Darrell(pyPdf.PdfFileReader):
...
if __name__ == '__main__':
    # Split the PDF named on the command line at selected bookmarks using
    # sejda-console, then rename the pieces after the bookmark titles.
    t0 = time.time()

    # get the name of the file to split as a command line arg
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE.pdf' % sys.argv[0])
    pdfname = sys.argv[1]

    # open up the pdf; the outline is read while the file is still open
    with open(pdfname, 'rb') as pdf_file:
        destinations = Darrell(pdf_file).getDestinationPageNumbers()

    # build list of (pagenumbers, newFileNames)
    splitlist = [(1, 'FrontMatter')]  # Customize name of first section
    template = '%-5s %s'
    print(template % ('Page', 'Title'))
    print('-' * 72)

    # Unresolved destinations carry the string '???' instead of a page index;
    # they cannot be sorted with ints or used as split points, so drop them.
    resolved = [(t, p) for t, p in destinations.items() if isinstance(p, int)]
    for t, p in sorted(resolved, key=operator.itemgetter(1)):
        # Customize this to get it to split where you want
        if t.startswith(('Chapter', 'Preface', 'References')):
            print(template % (p + 1, t))
            # this customizes how files are renamed
            new = t.replace('Chapter ', 'Chapter')\
                   .replace(': ', '-')\
                   .replace(' ', '_')
            splitlist.append((p + 1, new))

    # call sejda tools and split document
    # An argument list (not one concatenated string) lets subprocess do the
    # quoting, so file names with spaces or quotes are passed intact.
    call = [sejda, 'splitbypages', '-f', pdfname, '-o', './', '-n']
    call += [str(p) for p, t in splitlist[1:]]
    print('\n' + ' '.join(call))
    subprocess.call(call)
    print('\nsejda-console has completed.\n\n')

    # rename the split files
    # NOTE(review): assumes sejda names each piece "<page>_<input file name>"
    # in the output directory -- confirm against the installed version.
    for p, t in splitlist:
        old = './%i_' % p + os.path.basename(pdfname)
        new = './' + t + '.pdf'
        sys.stdout.write('renaming "%s"\n to "%s"...' % (old, new))
        try:
            # best effort: clear a leftover target from a previous run
            os.remove(new)
        except OSError:
            pass
        try:
            os.rename(old, new)
            print(' succeeded.\n')
        except OSError:
            print(' failed.\n')
    print('\ndone. Spliting took %.2f seconds' % (time.time() - t0))

A small update to @darrell's class so that it can parse UTF-8 outlines; I am posting it as an answer because a comment would be hard to read.
Problem is in pyPdf.pdf.Destination.title which may be returned in two flavors:
pyPdf.generic.TextStringObject
pyPdf.generic.ByteStringObject
As a result, the output of the _setup_outline_page_ids() function also contains two different types of title object, which fails with UnicodeDecodeError if an outline title contains anything other than ASCII.
I added this code to solve the problem:
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
Or, as the whole class:
class PdfOutline(pyPdf.PdfFileReader):
    """PdfFileReader that maps outline titles (including non-ASCII) to pages."""

    def getDestinationPageNumbers(self):
        """Return a dict mapping each outline title to its page index.

        Page indexes are zero-based.  Titles that are
        ``pyPdf.generic.TextStringObject`` instances are encoded to UTF-8
        before being used as keys; other titles are used unchanged.  A
        destination whose page object is not found while walking the page
        tree maps to the string ``'???'``.  Destinations that share a title
        collapse into one entry.
        """
        def _setup_outline_page_ids(outline, _result=None):
            # Build {(id(destination), title): page object id}, recursing
            # into nested lists found inside the outline.
            if _result is None:
                _result = {}
            for obj in outline:
                if isinstance(obj, pyPdf.pdf.Destination):
                    _result[(id(obj), obj.title)] = obj.page.idnum
                elif isinstance(obj, list):
                    _setup_outline_page_ids(obj, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            # Build {page object id: zero-based page index} by walking the
            # page tree depth-first.  _num_pages is a list used as a shared
            # counter: one element is appended per "/Page" leaf visited.
            if _result is None:
                _result = {}
            if pages is None:
                # initial call: start at the root of the page tree
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            t = pages["/Type"]
            if t == "/Pages":
                for page in pages["/Kids"]:
                    # leaves counted so far == index of the next leaf
                    _result[page.idnum] = len(_num_pages)
                    _setup_page_id_to_num(page.getObject(), _result, _num_pages)
            elif t == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()

        result = {}
        for (_, title), page_idnum in outline_page_ids.items():
            # normalise text titles to UTF-8 so all keys share one encoding
            if isinstance(title, pyPdf.generic.TextStringObject):
                title = title.encode('utf-8')
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result

Darrell's class can be modified slightly to produce a multi-level table of contents for a pdf (in the manner of pdftoc in the pdftk toolkit.)
My modification adds one more parameter to _setup_page_id_to_num, an integer "level" which defaults to 1. Each invocation increments the level. Instead of storing just the page number in the result, we store the pair of page number and level. Appropriate modifications should be applied when using the returned result.
I am using this to implement the "PDF Hacks" browser-based page-at-a-time document viewer with a sidebar table of contents which reflects LaTeX section, subsection etc bookmarks. I am working on a shared system where pdftk can not be installed but where python is available.

A solution 10 years later for newer python and PyPDF:
from PyPDF2 import PdfReader, PdfWriter

# Split a PDF into one file per top-level bookmark; each output file is named
# after the bookmark title and holds the pages up to the next bookmark.
filename = "main.pdf"
with open(filename, "rb") as src:
    r = PdfReader(src)
    # Nested outline levels show up as plain lists inside r.outline and have
    # no .title attribute; only top-level entries are used as split points.
    bookmarks = [
        (item.title, r.get_destination_page_number(item))
        for item in r.outline
        if not isinstance(item, list)
    ]
    print(bookmarks)
    for i, (title, begin) in enumerate(bookmarks):
        # a section ends where the next bookmark starts (or at the last page)
        end = bookmarks[i + 1][1] if i < len(bookmarks) - 1 else len(r.pages)
        name = title + ".pdf"
        print(f"{name=}: {begin=}, {end=}")
        w = PdfWriter()
        for p in r.pages[begin:end]:
            w.add_page(p)
        # distinct handle name: the source file must stay open (and
        # unshadowed) while its pages are still being copied
        with open(name, "wb") as dst:
            w.write(dst)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Moving from multiprocessing to threading - python

Related

How should I write a function in Python that check whether a user has fetched some url or not?

Iteratively process large wikipedia dump

Why does this pickle reach maximum recursion depth without recursion?

Automatic background changer using Python 2.7.3 not working, though it should

split a pdf based on outline

Categories

Resources