The Python script below has worked for me in converting multiple PDF documents to a single PDF when running on Python 2.7. I'm now trying to use it on 3.10 and am getting errors.
I believe I've conquered most of them, especially changing print to print( and disabling imports of CoreFoundation and Quartz.CoreGraphics.
It seems that the only remaining error is:
line 85, in main
writeContext = CGPDFContextCreateWithURL(CFURLCreateFromFileSystemRepresentation(kCFAllocatorDefault,
arg, len(arg), False), None, None) NameError: name
'CGPDFContextCreateWithURL' is not defined
If I declare CGPDFContextCreateWithURL as an empty global, the error shifts to CFURLCreateFromFileSystemRepresentation. Declare that and the error is kCFAllocatorDefault.
Here's how I'm trying to handle those.
global CGPDFContextCreateWithURL CGPDFContextCreateWithURL = ""
I don't understand that entire line so any help in making it right would be appreciated.
#
# join
# Joing pages from a a collection of PDF files into a single PDF file.
#
# join [--output <file>] [--shuffle] [--verbose]"
#
# Parameter:
#
# --shuffle
# Take a page from each PDF input file in turn before taking another from each file.
# If this option is not specified then all of the pages from a PDF file are appended
# to the output PDF file before the next input PDF file is processed.
#
# --verbose
# Write information about the doings of this tool to stderr.
#
import sys
import os
import getopt
import tempfile
import shutil
# from CoreFoundation import *
# from Quartz.CoreGraphics import *
global verbose
verbose = False
def createPDFDocumentWithPath(path):
if verbose:
print("Creating PDF document from file %s" % (path))
return CGPDFDocumentCreateWithURL(CFURLCreateFromFileSystemRepresentation(kCFAllocatorDefault, path, len(path), False))
def writePageFromDoc(writeContext, doc, pageNum):
page = CGPDFDocumentGetPage(doc, pageNum)
if page:
mediaBox = CGPDFPageGetBoxRect(page, kCGPDFMediaBox)
if CGRectIsEmpty(mediaBox):
mediaBox = None
CGContextBeginPage(writeContext, mediaBox)
CGContextDrawPDFPage(writeContext, page)
CGContextEndPage(writeContext)
if verbose:
print("Copied page %d from %s" % (pageNum, doc))
def shufflePages(writeContext, docs, maxPages):
for pageNum in xrange(1, maxPages + 1):
for doc in docs:
writePageFromDoc(writeContext, doc, pageNum)
def append(writeContext, docs, maxPages):
for doc in docs:
for pageNum in xrange(1, maxPages + 1) :
writePageFromDoc(writeContext, doc, pageNum)
def main(argv):
global verbose
# The PDF context we will draw into to create a new PDF
writeContext = None
# If True then generate more verbose information
source = None
shuffle = False
# Parse the command line options
try:
options, args = getopt.getopt(argv, "o:sv", ["output=", "shuffle", "verbose"])
except getopt.GetoptError:
usage()
sys.exit(2)
for option, arg in options:
if option in ("-o", "--output") :
if verbose:
print("Setting %s as the destination." % (arg))
writeContext = CGPDFContextCreateWithURL(CFURLCreateFromFileSystemRepresentation(kCFAllocatorDefault, arg, len(arg), False), None, None)
elif option in ("-s", "--shuffle") :
if verbose :
print("Shuffle pages to the output file.")
shuffle = True
elif option in ("-v", "--verbose") :
print("Verbose mode enabled.")
verbose = True
else :
print("Unknown argument: %s" % (option))
if writeContext:
# create PDFDocuments for all of the files.
docs = map(createPDFDocumentWithPath, args)
# find the maximum number of pages.
maxPages = 0
for doc in docs:
if CGPDFDocumentGetNumberOfPages(doc) > maxPages:
maxPages = CGPDFDocumentGetNumberOfPages(doc)
if shuffle:
shufflePages(writeContext, docs, maxPages)
else:
append(writeContext, docs, maxPages)
CGPDFContextClose(writeContext)
del writeContext
#CGContextRelease(writeContext)
def usage():
print("Usage: join [--output <file>] [--shuffle] [--verbose]")
if __name__ == "__main__":
main(sys.argv[1:])
Related
I am trying to copy parameters passed into a python script to a file. Here is the parameters.
["0013","1","1","\"john.dow#gmail.com\"","1","P123-ND 10Q","10Q H??C"]
I understand that there is a buffer problem and I am getting bad data into my parameters. However, I do not have control over what is being passed in. I am trying to copy, starting at the 5th parameter, the parameters into a file.
f = open(in_file_name, 'w')
for x in range(5, len(arg_list)):
f.write(arg_list[x] + '\n')
f.close()
The result of the file is below:
P123-ND 10Q
10Q H??C
Here is what it should be:
P123-ND
10Q
How can I not include the bad data? What is happening to the spaces between the valid information and the bad information?
As requested, here is the full program:
#!/bin/python
class Argument_Indices:
PRINTER_INDEX = 0
AREA_INDEX = 1
LABEL_INDEX = 2
EMAIL_INDEX = 3
RUN_TYPE_INDEX = 4
import argparse
import json
import os
from subprocess import call
import sys
from time import strftime
def _handle_args():
''' Setup and run argpars '''
parser = argparse.ArgumentParser(description='Set environment variables for and to call Program')
parser.add_argument('time_to_run', default='NOW', choices=['NOW', 'EOP'], help='when to run the report')
parser.add_argument('arguments', nargs='+', help='the remaining command line arguments')
return parser.parse_args()
def _proces_program(arg_list):
time_stamp = strftime("%d_%b_%Y_%H_%M_%S")
printer = arg_list[Argument_Indices.PRINTER_INDEX]
area = arg_list[Argument_Indices.AREA_INDEX]
label = arg_list[Argument_Indices.LABEL_INDEX]
in_file_name = "/tmp/program{0}.inp".format(time_stamp)
os.environ['INPUT_FILE'] = in_file_name
f = open(in_file_name, 'w')
for x in range(5, len(arg_list)):
f.write(arg_list[x])
f.close()
call(['./Program.bin', printer, area, label])
os.remove(in_file_name)
def main():
''' Main Function '''
arg_list = None
args = _handle_args()
if len(args.arguments) < 1:
print('Missing name of input file')
return -1
with open(args.arguments[0]) as input_file:
arg_list = json.load(input_file)
_process_program(arg_list)
return 0
if __name__ == '__main__':
if main() != 0:
print('Program run failed')
sys.exit()
For your exact case (where you're getting duplicated parameters received with some spaces in between) this would work:
received_param_list = ["0013","1","1","\"john.dow#gmail.com\"","1","P123-ND 10Q","10Q H??C"]
arg_list = [i.split(" ")[0] for i in received_param_list]
last_param = received_param_list[-1].split()[-1]
if last_param != arg_list[-1]:
arg_list.append(last_param)
for x in range(5, len(arg_list)):
print (arg_list[x])
Although there might be another simpler way
I'm writing a program that fetches metadata from FLAC fliles and batch renames them. To do so, I'm using the py library.
Here is my code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This program takes the information from FLAC metadata to rename the files
# according to various naming paterns.
"""
rename-flac takes the information from FLAC metadata to batch rename
the files according to various naming paterns.
Usage:
rename-flac.py (-s | -v) <directory>
rename-flac.py (-h | --help)
rename-flac.py --version
Options:
-h --help Show the help screen
-- version Outputs version information and exits
-s Define album as single artist
-v Define album as various artist
"""
from docopt import docopt
import subprocess
import sys
import os
from py.path import local
# Dependency check
programlist = ["flac", "python-py", "python-docopt"]
for program in programlist:
pipe = subprocess.Popen(
["dpkg", "-l", program], stdout=subprocess.PIPE)
dependency, error = pipe.communicate()
if pipe.returncode:
print """
%s is not installed: this program won't run correctly.
To instal %s, run: aptitude install %s
""" % (program, program, program)
sys.exit()
else:
pass
# Defining the function that fetches metadata and formats it
def metadata(filename):
filename = str(filename).decode("utf-8")
pipe = subprocess.Popen(
["metaflac", "--show-tag=tracknumber", filename],
stdout=subprocess.PIPE)
tracknumber, error = pipe.communicate()
tracknumber = tracknumber.decode("utf-8")
tracknumber = tracknumber.replace("tracknumber=", "")
tracknumber = tracknumber.replace("TRACKNUMBER=", "")
tracknumber = tracknumber.rstrip() # Remove whitespaces
if int(tracknumber) < 10:
if "0" in tracknumber:
pass
else:
tracknumber = "0" + tracknumber
else:
pass
pipe = subprocess.Popen(
["metaflac", "--show-tag=title", filename],
stdout=subprocess.PIPE)
title, error = pipe.communicate()
title = title.decode("utf-8")
title = title.replace("TITLE=", "")
title = title.replace("title=", "")
title = title.rstrip()
pipe = subprocess.Popen(
["metaflac", "--show-tag=artist", filename],
stdout=subprocess.PIPE)
artist, error = pipe.communicate()
artist = artist.decode("utf-8")
artist = artist.replace("ARTIST=", "")
artist = artist.replace("artist=", "")
artist = artist.rstrip()
return tracknumber, title, artist
# Defining function that renames the files
def rename(root):
if output == str(filename.purebasename).decode("utf-8"):
print "%s is already named correctly\n" % (title)
else:
filename.rename(filename.new(purebasename=output))
# Importing command line arguments
args = docopt(__doc__, version="rename-flac 0.5")
for option, value in args.iteritems():
global root, choice
if option == "<directory>":
root = local(value)
elif option == "-s" and value == True:
choice = 1
elif option == "-v" and value == True:
choice = 2
else:
pass
# 1 - Single artist
# File naming partern: TRACKNUMBER - TITLE.flac
if choice == 1:
for filename in root.visit(fil="*.flac", rec=True):
tracknumber, title, artist = metadata(filename)
output = "%s - %s" % (tracknumber, title)
rename(root)
print "Files renamed"
else:
pass
# 2 - Various artists
# File naming pattern: TRACKNUMBER - ARTIST - TITLE.flac
if choice == 2:
for filename in root.visit(fil="*.flac", rec=True):
tracknumber, title, artist = metadata(filename)
output = "%s - %s - %s" % (tracknumber, artist, title)
rename(root)
print "Files renamed"
else:
pass
My code runs fine when filename has utf-8 characters, but when the path to filename has utf-8 characters it get this error message:
Traceback (most recent call last):
File "/media/Main/Programmes/Rename_FLAC/rename-flac.py", line 122, in <module>
rename(root)
File "/media/Main/Programmes/Rename_FLAC/rename-flac.py", line 97, in rename
filename.rename(filename.new(purebasename=output))
File "/usr/lib/python2.7/dist-packages/py/_path/local.py", line 273, in new
"%(dirname)s%(sep)s%(basename)s" % kw)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 28: ordinal not in range(128)
This may seem obvious to more experienced programmers, but I have been trying to resolve that error for a few hours now...
This was the problem:
filename.rename(filename.new(purebasename=output)) crashed because output was a unicode mess.
output was created by joining tracknumber, artist and title. In metadata(filename) I was reencoding them as unicode with string = string.decode("utf-8"), but they were already in unicode, resulting in output being mess-up and the crash.
So I deleted string = string.decode("utf-8") and it works.
yay.
I'm very new to Ubuntu/Python/Bash/Gnome in general, so I still feel like there's a chance I'm doing something wrong, but it's been 3 days now without success...
Here's what the script is supposed to do:
* [✓] Download 1 random image from wallbase.cc
* [✓] Save it to the same directory that the script is running from
* [x] Set it as the wallpaper
There are two attempts made to set the wallpaper two using different commands and NEITHER work when in the script. There is a print statement (2nd line from the bottom) that spits out the correct terminal command because I can C&P the print result and it works fine, it just doesn't work when it's executed in the script.
#!/usr/bin/env python
import urllib2
import os
from gi.repository import Gio
response = urllib2.urlopen("http://wallbase.cc/random/12/eqeq/1366x768/0.000/100/32")
page_source = response.read()
thlink_pos = page_source.find("ico-X")
address_start = (page_source.find("href=\"", thlink_pos) + 6)
address_end = page_source.find("\"", address_start + 1)
response = urllib2.urlopen(page_source[address_start:address_end])
page_source = response.read()
bigwall_pos = page_source.find("bigwall")
address_start = (page_source.find("src=\"", bigwall_pos) + 5)
address_end = page_source.find("\"", address_start + 1)
address = page_source[address_start:address_end]
slash_pos = address.rfind("/") + 1
pic_name = address[slash_pos:]
bashCommand = "wget " + page_source[address_start:address_end]
os.system(bashCommand)
print "Does my new image exists?", os.path.exists(os.getcwd() + "/" + pic_name)
#attempt 1
settings = Gio.Settings.new("org.gnome.desktop.background")
settings.set_string("picture-uri", "file://" + os.getcwd() + "/" + pic_name)
settings.apply()
#attempt 2
bashCommand = "gsettings set org.gnome.desktop.background picture-uri file://" + os.getcwd() + "/" + pic_name
print bashCommand
os.system(bashCommand)
settings.apply()
You've successfully changed your settings, but they're still left unapplied, try next:
settings.apply()
after setting "picture-uri" string.
It works for me (Ubuntu 12.04).
I've modified your script (unrelated to your error):
#!/usr/bin/python
"""Set desktop background using random images from http://wallbase.cc
It uses `gi.repository.Gio.Settings` to set the background.
"""
import functools
import itertools
import logging
import os
import posixpath
import random
import re
import sys
import time
import urllib
import urllib2
import urlparse
from collections import namedtuple
from bs4 import BeautifulSoup # $ sudo apt-get install python-bs4
from gi.repository.Gio import Settings # pylint: disable=F0401,E0611
DEFAULT_IMAGE_DIR = os.path.expanduser('~/Pictures/backgrounds')
HTMLPAGE_SIZE_MAX = 1 << 20 # bytes
TIMEOUT_MIN = 300 # seconds
TIMEOUT_DELTA = 30 # jitter
# "Anime/Manga", "Wallpapers/General", "High Resolution Images"
CATEGORY_W, CATEGORY_WG, CATEGORY_HR = range(1, 4)
PURITY_SFW, PURITY_SKETCHY, PURITY_NSFW, PURITY_DEFAULT = 4, 2, 1, 0
DAY_IN_SECONDS = 86400
UrlRetreiveResult = namedtuple('UrlRetreiveResult', "path headers")
def set_background(image_path, check_exist=True):
"""Change desktop background to image pointed by `image_path`.
"""
if check_exist: # make sure we can read it (at this time)
with open(image_path, 'rb') as f:
f.read(1)
# prepare uri
path = os.path.abspath(image_path)
if isinstance(path, unicode): # quote() doesn't like unicode
path = path.encode('utf-8')
uri = 'file://' + urllib.quote(path)
# change background
bg_setting = Settings.new('org.gnome.desktop.background')
bg_setting.set_string('picture-uri', uri)
bg_setting.apply()
def url2filename(url):
"""Return basename corresponding to url.
>>> url2filename('http://example.com/path/to/file?opt=1')
'file'
"""
urlpath = urlparse.urlsplit(url).path # pylint: disable=E1103
basename = posixpath.basename(urllib.unquote(urlpath))
if os.path.basename(basename) != basename:
raise ValueError # refuse 'dir%5Cbasename.ext' on Windows
return basename
def download(url, dirpath, extensions=True, filename=None):
"""Download url to dirpath.
Use basename of the url path as a filename.
Create destination directory if necessary.
Use `extensions` to require the file to have an extension or any
of in a given sequence of extensions.
Return (path, headers) on success.
Don't retrieve url if path exists (headers are None in this case).
"""
if not os.path.isdir(dirpath):
os.makedirs(dirpath)
logging.info('created directory %s', dirpath)
# get filename from the url
filename = url2filename(url) if filename is None else filename
if os.path.basename(filename) != filename:
logging.critical('filename must not have path separator in it "%s"',
filename)
return
if extensions:
# require the file to have an extension
root, ext = os.path.splitext(filename)
if root and len(ext) > 1:
# require the extension to be in the list
try:
it = iter(extensions)
except TypeError:
pass
else:
if ext not in it:
logging.warn(("file extension is not in the list"
" url=%s"
" extensions=%s"),
url, extensions)
return
else:
logging.warn("file has no extension url=%s", url)
return
# download file
path = os.path.join(dirpath, filename)
logging.info("%s\n%s", url, path)
if os.path.exists(path): # don't retrieve if path exists
logging.info('path exists')
return UrlRetreiveResult(path, None)
try:
return UrlRetreiveResult(*urllib.urlretrieve(url, path,
_print_download_status))
except IOError:
logging.warn('failed to download {url} -> {path}'.format(
url=url, path=path))
def _print_download_status(block_count, block_size, total_size):
logging.debug('%10s bytes of %s', block_count * block_size, total_size)
def min_time_between_calls(min_delay):
"""Enforce minimum time delay between calls."""
def decorator(func):
lastcall = [None] # emulate nonlocal keyword
#functools.wraps(func)
def wrapper(*args, **kwargs):
if lastcall[0] is not None:
delay = time.time() - lastcall[0]
if delay < min_delay:
_sleep(min_delay - delay)
lastcall[0] = time.time()
return func(*args, **kwargs)
return wrapper
return decorator
#min_time_between_calls(5)
def _makesoup(url):
try:
logging.info(vars(url) if isinstance(url, urllib2.Request) else url)
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(HTMLPAGE_SIZE_MAX))
return soup
except (IOError, OSError) as e:
logging.warn('failed to return soup for %s, error: %s',
getattr(url, 'get_full_url', lambda: url)(), e)
class WallbaseImages:
"""Given parameters it provides image urls to download."""
def __init__(self,
categories=None, # default; sequence of CATEGORY_*
resolution_exactly=True, # False means 'at least'
resolution=None, # all; (width, height)
aspect_ratios=None, # all; sequence eg, [(5,4),(16,9)]
purity=PURITY_DEFAULT, # combine with |
thumbs_per_page=None, # default; an integer
):
"""See usage below."""
self.categories = categories
self.resolution_exactly = resolution_exactly
self.resolution = resolution
self.aspect_ratios = aspect_ratios
self.purity = purity
self.thumbs_per_page = thumbs_per_page
def _as_request(self):
"""Create a urllib2.Request() using given parameters."""
# make url
if self.categories is not None:
categories = "".join(str(n) for n in (2, 1, 3)
if n in self.categories)
else: # default
categories = "0"
if self.resolution_exactly:
at_least_or_exactly_resolution = "eqeq"
else:
at_least_or_exactly_resolution = "gteq"
if self.resolution is not None:
resolution = "{width:d}x{height:d}".format(
width=self.resolution[0], height=self.resolution[1])
else:
resolution = "0x0"
if self.aspect_ratios is not None:
aspect_ratios = "+".join("%.2f" % (w / float(h),)
for w, h in self.aspect_ratios)
else: # default
aspect_ratios = "0"
purity = "{0:03b}".format(self.purity)
thumbs = 20 if self.thumbs_per_page is None else self.thumbs_per_page
url = ("http://wallbase.cc/random/"
"{categories}/"
"{at_least_or_exactly_resolution}/{resolution}/"
"{aspect_ratios}/"
"{purity}/{thumbs:d}").format(**locals())
logging.info(url)
# make post data
data = urllib.urlencode(dict(query='', board=categories, nsfw=purity,
res=resolution,
res_opt=at_least_or_exactly_resolution,
aspect=aspect_ratios,
thpp=thumbs))
req = urllib2.Request(url, data)
return req
def __iter__(self):
"""Yield background image urls."""
# find links to bigwall pages
# css-like: #thumbs div[class="thumb"] \
# a[class~="thlink" and href^="http://"]
soup = _makesoup(self._as_request())
if not soup:
logging.warn("can't retrieve the main page")
return
thumbs_soup = soup.find(id="thumbs")
for thumb in thumbs_soup.find_all('div', {'class': "thumb"}):
bigwall_a = thumb.find('a', {'class': "thlink",
'href': re.compile(r"^http://")})
if bigwall_a is None:
logging.warn("can't find thlink link")
continue # try the next thumb
# find image url on the bigwall page
# css-like: #bigwall > img[alt and src^="http://"]
bigwall_soup = _makesoup(bigwall_a['href'])
if bigwall_soup is not None:
bigwall = bigwall_soup.find(id='bigwall')
if bigwall is not None:
img = bigwall.find('img',
src=re.compile(r"(?i)^http://.*\.jpg$"),
alt=True)
if img is not None:
url = img['src']
filename = url2filename(url)
if filename.lower().endswith('.jpg'):
yield url, filename # successfully found image url
else:
logging.warn('suspicious url "%s"', url)
continue
logging.warn("can't parse bigwall page")
def main():
level = logging.INFO
if '-d' in sys.argv:
sys.argv.remove('-d')
level = logging.DEBUG
# configure logging
logging.basicConfig(format='%(levelname)s: %(asctime)s %(message)s',
level=level, datefmt='%Y-%m-%d %H:%M:%S %Z')
if len(sys.argv) > 1:
backgrounds_dir = sys.argv[1]
else:
backgrounds_dir = DEFAULT_IMAGE_DIR
# infinite loop: Press Ctrl+C to interrupt it
#NOTE: here's some arbitrary logic: modify for you needs e.g., break
# after the first image found
timeout = TIMEOUT_MIN # seconds
for i in itertools.cycle(xrange(timeout, DAY_IN_SECONDS)):
found = False
try:
for url, filename in WallbaseImages(
categories=[CATEGORY_WG, CATEGORY_HR, CATEGORY_W],
purity=PURITY_SFW,
thumbs_per_page=60):
res = download(url, backgrounds_dir, extensions=('.jpg',),
filename=filename)
if res and res.path:
found = True
set_background(res.path)
# don't hammer the site
timeout = max(TIMEOUT_MIN, i % DAY_IN_SECONDS)
_sleep(random.randint(timeout, timeout + TIMEOUT_DELTA))
except Exception: # pylint: disable=W0703
logging.exception('unexpected error')
_sleep(timeout)
else:
if not found:
logging.error('failed to retrieve any images')
_sleep(timeout)
timeout = (timeout * 2) % DAY_IN_SECONDS
def _sleep(timeout):
"""Add logging to time.sleep() call."""
logging.debug('sleep for %s seconds', timeout)
time.sleep(timeout)
main()
Tried to implement a python script that used the PIL library to write text on an image then update the Gnome background "picture-uri" to point to that image using the Gio class. The python script would ping pong between two images to always modify the one not in use and then attempt to "switch" by updating the Settings. Did this to avoid any flicker as modifying the current background directly drops it out temporarily. While in the shell and calling the script directly I rarely saw any issue, but in the cronjob it simply wouldn't update on the pong. I used both sync and apply and would wait several minutes before trying to switch the images. Didn't work. Tried cron as user (su -c "cmd" user) and that didn't work either.
Finally gave up on the ping pong approach when I noticed that Gnome will detect any change in the background file and update. So dropped the ping pong method and went to a temp file that I just copy over the current background using the shutil library. Works like a charm.
I have written a script in python using pywin32 to save pdf files to text that up until recently was working fine. I use similar methods in Excel. The code is below:
def __pdf2Txt(self, pdf, fileformat="com.adobe.acrobat.accesstext"):
outputLoc = os.path.dirname(pdf)
outputLoc = os.path.join(outputLoc, os.path.splitext(os.path.basename(pdf))[0] + '.txt')
try:
win32com.client.gencache.EnsureModule('{E64169B3-3592-47d2-816E-602C5C13F328}', 0, 1, 1)
adobe = win32com.client.DispatchEx('AcroExch.App')
pdDoc = win32com.client.DispatchEx('AcroExch.PDDoc')
pdDoc.Open(pdf)
jObject = pdDoc.GetJSObject()
jObject.SaveAs(outputLoc, "com.adobe.acrobat.accesstext")
except:
traceback.print_exc()
return False
finally:
del jObject
pdDoc.Close()
del pdDoc
adobe.Exit()
del adobe
However this code has suddenly stopped working and I get the following output:
Traceback (most recent call last):
File "C:\Documents and Settings\ablishen\workspace\HooverKeyCreator\src\HooverKeyCreator.py", line 38, in __pdf2Txt
jObject.SaveAs(outputLoc, "com.adobe.acrobat.accesstext")
File "C:\Python27\lib\site-packages\win32com\client\dynamic.py", line 505, in __getattr__
ret = self._oleobj_.Invoke(retEntry.dispid,0,invoke_type,1)
com_error: (-2147467263, 'Not implemented', None, None)
False
I have similar code written in VB that works correctly so I'm guessing that it has something to do with the COM interfaces not binding to the appropriate functions correctly? (my COM knowledge is patchy).
Blish, this thread holds the key to the solution you are looking for: https://mail.python.org/pipermail/python-win32/2002-March/000260.html
I admit that the post above is not the easiest to find (probably because Google scores it low based on the age of the content?).
Specifically, applying this piece of advice will get things running for you: https://mail.python.org/pipermail/python-win32/2002-March/000265.html
For reference, the complete piece of code that does not require you to manually patch dynamic.py (snippet should run pretty much out of the box):
# gets all files under ROOT_INPUT_PATH with FILE_EXTENSION and tries to extract text from them into ROOT_OUTPUT_PATH with same filename as the input file but with INPUT_FILE_EXTENSION replaced by OUTPUT_FILE_EXTENSION
from win32com.client import Dispatch
from win32com.client.dynamic import ERRORS_BAD_CONTEXT
import winerror
# try importing scandir and if found, use it as it's a few magnitudes of an order faster than stock os.walk
try:
from scandir import walk
except ImportError:
from os import walk
import fnmatch
import sys
import os
ROOT_INPUT_PATH = None
ROOT_OUTPUT_PATH = None
INPUT_FILE_EXTENSION = "*.pdf"
OUTPUT_FILE_EXTENSION = ".txt"
def acrobat_extract_text(f_path, f_path_out, f_basename, f_ext):
avDoc = Dispatch("AcroExch.AVDoc") # Connect to Adobe Acrobat
# Open the input file (as a pdf)
ret = avDoc.Open(f_path, f_path)
assert(ret) # FIXME: Documentation says "-1 if the file was opened successfully, 0 otherwise", but this is a bool in practise?
pdDoc = avDoc.GetPDDoc()
dst = os.path.join(f_path_out, ''.join((f_basename, f_ext)))
# Adobe documentation says "For that reason, you must rely on the documentation to know what functionality is available through the JSObject interface. For details, see the JavaScript for Acrobat API Reference"
jsObject = pdDoc.GetJSObject()
# Here you can save as many other types by using, for instance: "com.adobe.acrobat.xml"
jsObject.SaveAs(dst, "com.adobe.acrobat.accesstext")
pdDoc.Close()
avDoc.Close(True) # We want this to close Acrobat, as otherwise Acrobat is going to refuse processing any further files after a certain threshold of open files are reached (for example 50 PDFs)
del pdDoc
if __name__ == "__main__":
assert(5 == len(sys.argv)), sys.argv # <script name>, <script_file_input_path>, <script_file_input_extension>, <script_file_output_path>, <script_file_output_extension>
#$ python get.txt.from.multiple.pdf.py 'C:\input' '*.pdf' 'C:\output' '.txt'
ROOT_INPUT_PATH = sys.argv[1]
INPUT_FILE_EXTENSION = sys.argv[2]
ROOT_OUTPUT_PATH = sys.argv[3]
OUTPUT_FILE_EXTENSION = sys.argv[4]
# tuples are of schema (path_to_file, filename)
matching_files = ((os.path.join(_root, filename), os.path.splitext(filename)[0]) for _root, _dirs, _files in walk(ROOT_INPUT_PATH) for filename in fnmatch.filter(_files, INPUT_FILE_EXTENSION))
# patch ERRORS_BAD_CONTEXT as per https://mail.python.org/pipermail/python-win32/2002-March/000265.html
global ERRORS_BAD_CONTEXT
ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
for filename_with_path, filename_without_extension in matching_files:
print "Processing '{}'".format(filename_without_extension)
acrobat_extract_text(filename_with_path, ROOT_OUTPUT_PATH, filename_without_extension, OUTPUT_FILE_EXTENSION)
I have tested this on WinPython x64 2.7.6.3, Acrobat X Pro
makepy.py is a script that comes with the win32com python package.
Running it for your installation "wires" python into the COM/OLE object in Windows. The following is an excerpt of some code I used to talk to Excel and do some stuff in it. This example gets the name of sheet 1 in the current workbook. It automatically runs makepy if it has an exception:
import win32com;
import win32com.client;
from win32com.client import selecttlb;
def attachExcelCOM():
makepyExe = r'python C:\Python25\Lib\site-packages\win32com\client\makepy.py';
typeList = selecttlb.EnumTlbs();
for tl in typeList:
if (re.match('^Microsoft.*Excel.*', tl.desc, re.IGNORECASE)):
makepyCmd = "%s -d \"%s\"" % (makepyExe, tl.desc);
os.system(makepyCmd);
# end if
# end for
# end def
def getSheetName(sheetNum):
try:
xl = win32com.client.Dispatch("Excel.Application");
wb = xl.Workbooks.Item(sheetNum);
except Exception, detail:
print 'There was a problem attaching to Excel, refreshing connect config...';
print Exception, str(detail);
attachExcelCOM();
try:
xl = win32com.client.Dispatch("Excel.Application");
wb = xl.Workbooks.Item(sheetNum);
except:
print 'Could not attach to Excel...';
sys.exit(-1);
# end try/except
# end try/except
wsName = wb.Name;
if (wsName == 'PERSONAL.XLS'):
return( None );
# end if
print 'The target worksheet is:';
print ' ', wsName;
print 'Is this correct? [Y/N]',;
answer = string.strip( sys.stdin.readline() );
answer = answer.upper();
if (answer != 'Y'):
print 'Sheet not identified correctly.';
return(None);
# end if
return( (wb, wsName) );
# end def
# -- Main --
sheetInfo = getSheetName(sheetNum);
if (sheetInfo == None):
print 'Sheet not found';
sys.exit(-1);
else:
(wb, wsName) = sheetInfo;
# end if
i would like to use pyPdf to split a pdf file based on the outline where each destination in the outline refers to a different page within the pdf.
example outline:
main --> points to page 1
sect1 --> points to page 1
sect2 --> points to page 15
sect3 --> points to page 22
it is easy within pyPdf to iterate over each page of the document or each destination in the document's outline; however, i cannot figure out how to get the page number where the destination points.
does anybody know how to find the referencing page number for each destination in the outline?
I figured it out:
class Darrell(pyPdf.PdfFileReader):
def getDestinationPageNumbers(self):
def _setup_outline_page_ids(outline, _result=None):
if _result is None:
_result = {}
for obj in outline:
if isinstance(obj, pyPdf.pdf.Destination):
_result[(id(obj), obj.title)] = obj.page.idnum
elif isinstance(obj, list):
_setup_outline_page_ids(obj, _result)
return _result
def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
outline_page_ids = _setup_outline_page_ids(self.getOutlines())
page_id_to_page_numbers = _setup_page_id_to_num()
result = {}
for (_, title), page_idnum in outline_page_ids.iteritems():
result[title] = page_id_to_page_numbers.get(page_idnum, '???')
return result
pdf = Darrell(open(PATH-TO-PDF, 'rb'))
template = '%-5s %s'
print template % ('page', 'title')
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
print template % (p+1,t)
This is just what I was looking for. Darrell's additions to PdfFileReader should be part of PyPDF2.
I wrote a little recipe that uses PyPDF2 and sejda-console to split a PDF by bookmarks. In my case there are several Level 1 sections that I want to keep together. This script allows me to do that and give the resulting files meaningful names.
import operator
import os
import subprocess
import sys
import time
import PyPDF2 as pyPdf
# need to have sejda-console installed
# change this to point to your installation
sejda = 'C:\\sejda-console-1.0.0.M2\\bin\\sejda-console.bat'
class Darrell(pyPdf.PdfFileReader):
...
if __name__ == '__main__':
t0= time.time()
# get the name of the file to split as a command line arg
pdfname = sys.argv[1]
# open up the pdf
pdf = Darrell(open(pdfname, 'rb'))
# build list of (pagenumbers, newFileNames)
splitlist = [(1,'FrontMatter')] # Customize name of first section
template = '%-5s %s'
print template % ('Page', 'Title')
print '-'*72
for t,p in sorted(pdf.getDestinationPageNumbers().iteritems(),
key=operator.itemgetter(1)):
# Customize this to get it to split where you want
if t.startswith('Chapter') or \
t.startswith('Preface') or \
t.startswith('References'):
print template % (p+1, t)
# this customizes how files are renamed
new = t.replace('Chapter ', 'Chapter')\
.replace(': ', '-')\
.replace(': ', '-')\
.replace(' ', '_')
splitlist.append((p+1, new))
# call sejda tools and split document
call = sejda
call += ' splitbypages'
call += ' -f "%s"'%pdfname
call += ' -o ./'
call += ' -n '
call += ' '.join([str(p) for p,t in splitlist[1:]])
print '\n', call
subprocess.call(call)
print '\nsejda-console has completed.\n\n'
# rename the split files
for p,t in splitlist:
old ='./%i_'%p + pdfname
new = './' + t + '.pdf'
print 'renaming "%s"\n to "%s"...'%(old, new),
try:
os.remove(new)
except OSError:
pass
try:
os.rename(old, new)
print' succeeded.\n'
except:
print' failed.\n'
print '\ndone. Spliting took %.2f seconds'%(time.time() - t0)
Small update to #darrell class to be able to parse UTF-8 outlines, which I post as answer because comment would be hard to read.
Problem is in pyPdf.pdf.Destination.title which may be returned in two flavors:
pyPdf.generic.TextStringObject
pyPdf.generic.ByteStringObject
so that output from _setup_outline_page_ids() function returns also two different types for title object, which fails with UnicodeDecodeError if outline title contains anything then ASCII.
I added this code to solve the problem:
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
of whole class:
class PdfOutline(pyPdf.PdfFileReader):
def getDestinationPageNumbers(self):
def _setup_outline_page_ids(outline, _result=None):
if _result is None:
_result = {}
for obj in outline:
if isinstance(obj, pyPdf.pdf.Destination):
_result[(id(obj), obj.title)] = obj.page.idnum
elif isinstance(obj, list):
_setup_outline_page_ids(obj, _result)
return _result
def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
outline_page_ids = _setup_outline_page_ids(self.getOutlines())
page_id_to_page_numbers = _setup_page_id_to_num()
result = {}
for (_, title), page_idnum in outline_page_ids.iteritems():
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
result[title] = page_id_to_page_numbers.get(page_idnum, '???')
return result
Darrell's class can be modified slightly to produce a multi-level table of contents for a pdf (in the manner of pdftoc in the pdftk toolkit.)
My modification adds one more parameter to _setup_page_id_to_num, an integer "level" which defaults to 1. Each invocation increments the level. Instead of storing just the page number in the result, we store the pair of page number and level. Appropriate modifications should be applied when using the returned result.
I am using this to implement the "PDF Hacks" browser-based page-at-a-time document viewer with a sidebar table of contents which reflects LaTeX section, subsection etc bookmarks. I am working on a shared system where pdftk can not be installed but where python is available.
A solution 10 years later for newer python and PyPDF:
from PyPDF2 import PdfReader, PdfWriter
filename = "main.pdf"
with open(filename, "rb") as f:
r = PdfReader(f)
bookmarks = list(map(lambda x: (x.title, r.get_destination_page_number(x)), r.outline))
print(bookmarks)
for i, b in enumerate(bookmarks):
begin = b[1]
end = bookmarks[i+1][1] if i < len(bookmarks) - 1 else len(r.pages)
# print(len(r.pages[begin:end]))
name = b[0] + ".pdf"
print(f"{name=}: {begin=}, {end=}")
with open(name, "wb") as f:
w = PdfWriter(f)
for p in r.pages[begin:end]:
w.add_page(p)
w.write(f)