I would like to use a python script to find and replace some text in an InDesign file and then save it as pdf.
I managed to use python to open indesign and save it as pdf however I do not know how to search for text and replace it with a random string generated by the first part of the script.
Here is what I got so far:
import win32com.client
import random
import string
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random identifier of ``size`` characters drawn from ``chars``.

    Args:
        size: Number of characters in the result.
        chars: Alphabet to sample from (uppercase ASCII letters and digits
            by default).

    Returns:
        A string of length ``size``.

    Note:
        Built on the ``random`` module, so the result is not suitable as a
        security-sensitive token.
    """
    return ''.join(random.choice(chars) for _ in range(size))
import os  # needed for the output-folder handling below

app = win32com.client.Dispatch('InDesign.Application.CC.2018')
myFile = r'C:\Users\some_file.indd'
myDocument = app.Open(myFile)
myPDFFile = r'C:\Users\some_file.pdf'
directory = os.path.dirname(myPDFFile)
# Numeric export-format value handed to Document.Export below.
idPDFType = 1952403524
# 1=[High Quality Print], 2=[PDF/X-1a:2001] etc..
myPDFPreset = app.PDFExportPresets.Item(1)
# Make sure the target folder exists before asking InDesign to write into it.
os.makedirs(directory, exist_ok=True)
try:
    myDocument.Export(idPDFType, myPDFFile, False, myPDFPreset)
except Exception as e:
    # Top-level script boundary: report the COM failure instead of crashing.
    print('Export to PDF failed: ' + str(e))
You need to iterate over all of the TextFrames of the document and then search and replace the text with the ChangeText function.
Here is a snippet of what you can do:
voucher = id_generator()
searchText = 'test'
app = win32com.client.Dispatch('InDesign.Application.CC.2018')
# NOTE(review): presumably the "never interact" level so no dialogs block the
# script -- confirm against the InDesign enumeration reference.
app.scriptPreferences.userInteractionLevel = 1699640946
myFile = r'C:\Users\some_file.indd'
myDocument = app.Open(myFile)
myPage = myDocument.Pages.Item(1)
idNothing = 1851876449 #from enum idNothingEnum, see doc_reference
# The find/change preferences live on the application, so set them once.
app.FindTextPreferences.FindWhat = searchText
app.ChangeTextPreferences.ChangeTo = voucher
try:
    for it in myDocument.TextFrames:
        if searchText in (it.Contents):
            # Apply the pending find/change preferences to this frame.
            it.ChangeText()
finally:
    # Clear the preferences so later searches do not inherit these values.
    app.FindTextPreferences.FindWhat = idNothing
    app.ChangeTextPreferences.ChangeTo = idNothing
#and then save the changes as PDF...
I'm trying to make a program that scans PDFs downloaded from a website with selectable text and highlights specific discrepancies. I can make it work for specific "bad words" and "good words" but I am stuck on how to make it find missing check boxes. They are no longer interactive fields in PDF form:
Here is my code for everything else so far:
import os
import fitz
source_folder = r"C:\Users\Sserb\Desktop\Test Files"
output_folder=r"C:\Users\Sserb\Desktop\Test Files\output"
list_files = os.listdir(source_folder)
# Words that should be in every pdf file (not every page).
good_terms = ["trend", "decrease", "increase"]
# Words that are highlighted wherever they occur.
bad_terms = ["school", "academic", "homework"]
pdf_files = [x for x in list_files if x.endswith(".pdf")]
highlight_summary = []   # [file name, pages that received a highlight]
good_term_summary = []   # [file name, good terms never found in the file]
# Both the annotated PDFs and the text report are written here.
os.makedirs(output_folder, exist_ok=True)
for file_name in pdf_files:
    full_filename = os.path.join(source_folder, file_name)
    doc = fitz.open(full_filename)
    try:
        good_terms_not_found = good_terms.copy()
        list_hl_pages = []
        for page_num, page in enumerate(doc, 1):
            # Highlight every occurrence of a bad term on this page.
            for text in bad_terms:
                text_instances = page.search_for(text)
                for inst in text_instances:
                    page.add_highlight_annot(inst)
                    if page_num not in list_hl_pages:
                        list_hl_pages.append(page_num)
            # Search for good terms- all must be found
            words_found = []
            for good_word in good_terms_not_found:
                text_instances = page.search_for(good_word)
                if text_instances:
                    words_found.append(good_word)
            # Remove after the scan: the list must not shrink while iterated.
            for word in words_found:
                good_terms_not_found.remove(word)
        highlight_summary.append([file_name, list_hl_pages.copy()])
        if good_terms_not_found:
            good_term_summary.append([file_name, good_terms_not_found.copy()])
        if list_hl_pages:
            out_file = file_name.replace(".pdf", "-errors.pdf")
            doc.save(os.path.join(source_folder, "output", out_file), garbage=4, deflate=True, clean=True)
    finally:
        doc.close()
new = os.path.join(output_folder,'outputfile.txt')
value = str(good_term_summary) + '\n'
with open(new, 'w') as file:
    file.write(value)
Both "value" and "export value" are always treated as text, but there are at least 8 different kinds of check boxes in Word, and how they appear depends on the font used. Here, check boxes are shown as ☐ when unchecked, or ☑ or ☒ when checked, so search for ☑Client rather than ☐Client, etc.
With pyminizip i am able to zip a file with password in python :
import pyminizip
pyminizip.compress(filepath, None,"output.zip", "password", 0)
But how do I zip the whole folder 'myFolder' into a zip file with password?
I tried removing the filename from the path but it gives the error
OSError: error in opening C:\Users\xxx\Desktop\myFolder for reading
The below link has a function which will zip the directory. But It wont add a password.
If anyone can let me know if it is possible to add a password to an existing zip file, that will solve my problem. Is that possible?
I was finally able to encrypt the whole directory (including all the subfolder structure and files) using a library called 'pyzipper', suggested by Anupam Chaplot.
Here is the solution :
def zip_folderPyzipper(folder_path, output_path, password=None):
    """Zip the contents of an entire folder (with that folder included
    in the archive). Empty subfolders will be included in the archive
    as well.

    Args:
        folder_path: Folder to archive.
        output_path: Path of the zip file to create.
        password: Optional password (``bytes`` or ``str``). When given, the
            entries are AES-encrypted; when omitted, the archive is written
            without encryption.

    Errors raised while writing the archive are printed, not propagated.
    """
    parent_folder = os.path.dirname(os.path.abspath(folder_path))
    if isinstance(password, str):
        password = password.encode('utf-8')
    encryption = pyzipper.WZ_AES if password else None
    try:
        with pyzipper.AESZipFile(output_path, 'w',
                                 compression=pyzipper.ZIP_DEFLATED,
                                 encryption=encryption) as zip_file:
            if password:
                zip_file.setpassword(password)
            for root, folders, files in os.walk(folder_path):
                # Include all subfolders, including empty ones, then files.
                for name in folders + files:
                    absolute_path = os.path.join(root, name)
                    # Archive names are relative to the folder's parent so
                    # the folder itself appears at the top of the archive.
                    relative_path = os.path.relpath(absolute_path,
                                                    parent_folder)
                    print ("Adding '%s' to archive." % absolute_path)
                    zip_file.write(absolute_path, relative_path)
        print ("'%s' created successfully." % output_path)
    except (OSError, zipfile.BadZipFile) as message:
        print (message)
Since I am new in python i cant explain the code in detail. Here are the references :
To extract the Generated ZIP file in windows :
Right Click - > Unzip(Encripted)
If you directly click Extract All option, then it will give error
Try this:
First, please check the pyminizip documentation. After that, try it.
import pyminizip as pyzip

compression = 8
# compress(srcfile, prefix, zipfile, password, compress_level): the prefix
# argument is positional and required, so pass None when no prefix is wanted.
pyzip.compress("test.txt", None, "test.zip", "Pswrd", compression)
Here is how to copy all a directory with its subdirectories and its files, then compress it and encrypt a zip, with password and without needing an associated backup file, here we will see how to authorize a mac address to execute the decryption. So then it's up to you to change or improve the script.
But the essentials work very well.
After a lot of research, testing and thinking, I created this effective solution
my setup:
Python 3.8 64:bits on windows 7 64:bits
Usage terminology:
First step, we need to import the cryptography module
check for support or other is here https://cryptography.io/en/latest/installation/
pip install cryptography
Then we will use the fernet object resulting from this module
with password
and shutil:
file second.py:
import os
import re, uuid
import string
import shutil
import zlib
from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
import base64
import zipfile
class zipy:
def __init__(self, pathDir=None):
"""If pathDir optional is none, this script copy all directory in current execution."""
if pathDir != None:
if os.path.isdir(pathDir):
pathDir = pathDir.replace(os.sep, '/')
if pathDir.endswith('/'):
self.root = pathDir
self.root = pathDir + '/'
self.root = os.getcwd()+os.sep
self.root = self.root.replace(os.sep, '/')
self.root = os.getcwd()+os.sep
self.root = self.root.replace(os.sep, '/')
self.name = 'sauvegarde'
self.dirSauvegarde = self.root+self.name
self.dirSauvegarde = self.dirSauvegarde.replace(os.sep, '/')
lectureDossier = os.listdir(self.root)
self.path_system = {}
for element in lectureDossier:
if os.path.isdir(element):
if element != '__pycache__':
self.path_system[element] = self.root + element + os.sep.replace(os.sep, '/')
self.path_system[element] = self.path_system[element].replace(os.sep, '/')
elif os.path.isfile(element):
self.path_system[element] = self.root + element
self.path_system[element] = self.path_system[element].replace(os.sep, '/')
self.zipi = myZip(self.dirSauvegarde)
def save(self):
"""sauvegarde le fichier"""
chemin_src = ""
chemin_dist = ""
for element in self.path_system:
if element != self.dirSauvegarde:
chemin_src = self.root+element
chemin_dest = self.dirSauvegarde + os.sep + element
chemin_dest = chemin_dest.replace(os.sep, '/')
if os.path.isdir(chemin_src):
self.copyDir(chemin_src, chemin_dest)
self.copyFile(chemin_src, chemin_dest)
def copyDir(self, src, dest):
shutil.copytree(src, dest, dirs_exist_ok=True)
def copyFile(self, src, dest):
shutil.copyfile(src, dest)
def createDir(self, dirPath):
if os.path.isdir(dirPath):
os.makedirs(dirPath, exist_ok=True)
def delDir(self, dir):
if os.path.isdir(dir):
if len(os.listdir(dir)) > 0:
shutil.rmtree(dir, ignore_errors=True)
def decrypt(self):
class myZip:
def __init__(self, dir):
self.pathDir = dir
self.nom = os.path.basename(dir)
self.pathZip = self.pathDir + '.zip'
self.crypt = Encryptor()
def zip(self, zip_exist=False):
if zip_exist == False:
if os.path.isfile(self.pathZip):
shutil.make_archive(os.path.splitext(self.pathZip)[0], 'zip', self.pathDir)
key = self.crypt.key_create()
self.crypt.file_encrypt(key, self.pathZip, self.pathZip)
self.crypt.key_write(self.pathZip, key)
def unzip(self):
if self.crypt.checkPass(self.pathZip):
#print('ok adresse mac autoriser')
key = self.crypt.key_load(self.pathZip)
self.crypt.file_decrypt(key, self.pathZip, self.pathZip)
print('pas ok adresse mac erroner')
class Encryptor:
def __init__(self):
self.salto = None
def key_create(self):
password = self.getMac()
password = bytes(password, encoding="utf-8")
self.salto = os.urandom(16)
key = base64.urlsafe_b64encode(kdf.derive(password))
return key
def key_write(self, pathZip, key):
with zipfile.ZipFile(pathZip, 'a') as zip:
zip.comment = key + bytes(' byMe ', encoding="utf-8") + self.salto
def key_load(self, pathZip):
stri = []
with zipfile.ZipFile(pathZip, 'a') as zip:
stri = zip.comment.split(b' byMe ')
key = stri[0]
self.salto = stri[1]
return key
def checkPass(self, pathZip):
key = base64.urlsafe_b64decode(self.key_load(pathZip))
salt = self.salto
mdp = self.getMac()
mdp = bytes(mdp, encoding="utf-8")
retour = False
kdf.verify(mdp, key)
retour = True
retour = False
return retour
def file_encrypt(self, key, original_file, encrypted_file):
f = Fernet(key)
with open(original_file, 'rb') as file:
original = file.read()
encrypted = f.encrypt(original)
with open (encrypted_file, 'wb') as file:
def file_decrypt(self, key, encrypted_file, decrypted_file):
f = Fernet(key)
with open(encrypted_file, 'rb') as file:
encrypted = file.read()
decrypted = f.decrypt(encrypted)
with open(decrypted_file, 'wb') as file:
def getMac(self):
return "".join(re.findall('..', '%012x' % uuid.getnode()))
Use like this:
file : main.py
from second import zipy
#If the argument is empty, the script will make a copy of the directory being executed, otherwise the script will work and output the zip in the place indicated in argument
dd = zipy("E:/path")
#or dd = zipy("E:/path/") or dd = zipy() if you give arg, give absolute path
#Save the zip and encrypt it. Change second.py to directly give it a password as an argument
#decrypt zip
Here's a snippet with pyminizip: gets a list of files and zips the whole thing.
import pyminizip
import os
def get_paths_recursively(src_root_path):
    """Return the paths of all non-hidden regular files under a folder.

    Args:
        src_root_path: Root folder to walk, or ``None``.

    Returns:
        A list of file paths (each joined onto the walked root). Files whose
        name starts with ``.`` are skipped. Empty when ``src_root_path`` is
        ``None``.
    """
    files = []
    if src_root_path is None:
        return files
    for root, _directories, filenames in os.walk(src_root_path):
        for filename in filenames:
            if filename.startswith('.'):
                continue
            full_file_name = os.path.join(root, filename)
            if os.path.isfile(full_file_name):
                files.append(full_file_name)
    return files
def pyminizip_zipper(folder_path, output_path, password):
    """Zip every file under ``folder_path`` into a password-protected archive.

    Args:
        folder_path: Folder whose files are collected recursively.
        output_path: Path of the zip file to create.
        password: Password handed to ``pyminizip.compress_multiple``.
    """
    paths = get_paths_recursively(folder_path)
    # Prefixes are computed relative to the folder's parent so the folder
    # name is kept inside the archive. relpath is used instead of
    # str.replace because replacing an empty dirname (a bare relative
    # folder name) would insert './' between every character of the path.
    base = os.path.dirname(os.path.abspath(folder_path))
    roots = [
        os.path.relpath(os.path.dirname(path), base).replace(os.sep, '/')
        for path in paths
    ]
    pyminizip.compress_multiple(paths, roots, output_path, password, 5)
So I have my main Python script, which I run and which essentially passes three arguments (-p, -e and -d) to another Python script. I have been using subprocess in order to do this, which I understand.
What I want to achieve is rather than using subprocess I want to import the second file 'generate_json.py', and be able to pass the three arguments to its main() function. How can I pass the three arguments like I have in my subprocess call?
My code for my main script is as follows:
import generate_json as gs
def get_json_location(username=None):
    """Return the test-data folder path for ``username``.

    Args:
        username: Account name to build the path for. When omitted, the
            current login name is looked up at call time.

    Returns:
        ``/Users/<username>/Desktop/data-code/Testdata``.
    """
    if username is None:
        # Resolved lazily: a default of os.getlogin() in the signature runs
        # once at definition time and raises where no login is available.
        import os
        username = os.getlogin()
    first = "/Users/"
    last = "/Desktop/data-code/Testdata"
    return first + username + last
Assuming that the script files do not have to be used individually, i.e: generate_json.py on its own from the command line.
I think a cleaner approach would be to wrap generate_json.py functions and put it into a class.
In this case I renamed generate_json.py to ConfigurationHandling.py
import os
import json
from functions import read_config
class ConfigurationHandler(object):
    """Read an export configuration and record JSON export file metadata.

    Args:
        new_parameter_file: Base location of the JSON data.
        new_export_data_file: Configuration file passed to ``read_config``.
        new_export_date: Export date folder name (may be empty).
    """

    def __init__(self, new_parameter_file, new_export_data_file, new_export_date):
        self._parameter_file = new_parameter_file
        self._export_data_file = new_export_data_file
        self._export_date = new_export_date
        self._parsed_configuration = self._read_configuration()

    def _read_configuration(self):
        """Uses lower level function `read_config` in function.py file to read configuration file"""
        parsed_configuration = read_config(self._export_data_file)
        return parsed_configuration

    def _perform_some_action1(self):
        """Placeholder for additional processing."""
        pass

    def _perform_some_action2(self):
        """Placeholder for additional processing."""
        # Logic code for parsing goes here.
        pass

    def get_config(self):
        """Returns configuration"""
        return [self._parameter_file, self._parsed_configuration, self._export_date]

    def json_work(self):
        """Append the name and size of the configured JSON export to a log file.

        The size is recorded as ``None`` when the export file does not exist.
        """
        cfg, data, date = self.get_config()
        # cfg: json location; data: parsed configuration; date: YYYY-MM-DD
        if not date:
            date = ""
        else:
            date = date + "/"
        json_location = cfg  # json data path
        json_database = data["config"]["database"]
        json_collection = data["config"]["collection"]
        json_path = "{0}/{1}{2}/{3}/{3}.json".format(json_location, date, json_database, json_collection)
        json_base_name = json_database + "/" + json_collection + "/" + os.path.basename(json_path)
        # NOTE(review): ``date`` carries a trailing "/" at this point, so the
        # log path becomes 'dates/<date>/.json' -- confirm this is intended.
        current_day = date
        with open('dates/' + current_day + '.json', 'a') as file:
            record = {}
            if os.path.exists(json_path):
                json_file_size = str(os.path.getsize(json_path))
                print("File Name:" " " + json_base_name + " " "Exists " + "\n")
                print("File Size:" " " + json_file_size + " " "Bytes " "\n")
                print("Writing to file")
                record['File Size'] = int(json_file_size)
                record['File Name'] = json_base_name
                json.dump(record, file, sort_keys=True)
            else:
                print(json_base_name + " " "does not exist")
                print("Writing to file")
                record['File Name'] = json_base_name
                record['File Size'] = None
                json.dump(record, file, sort_keys=True)
Then in main.py
from ConfigurationHandler import ConfigurationHandler
def main():
#Drive the program from here and add the functionality together.
#Routine to do some work here and get the required variables
parameter_file = "some_parameter"
export_data_file = "some_file.yaml"
new_export_date = "iso_8601_date_etc"
conf_handl = ConfigurationHandler(parameter_file, export_data_file, new_export_date)
configuration = conf_handl.get_config()
if __name__ == '__main__':
In the project, you should aim to have only one main function and split up the functionality accordingly.
It will be much easier to change parts of the program later on when everything is split out evenly.
So far i have got the following :
from genrate_jsonv2 import ConfigurationHandler
import os
import argparse
def get_json_location(username=None):
    """Return the test-data folder path for ``username``.

    Args:
        username: Account name to build the path for. When omitted, the
            current login name is looked up at call time.

    Returns:
        ``/Users/<username>/Desktop/data-code/Testdata``.
    """
    if username is None:
        # Resolved lazily: a default of os.getlogin() in the signature runs
        # once at definition time and raises where no login is available.
        username = os.getlogin()
    first = "/Users/"
    last = "/Desktop/data-code/Testdata"
    return first + username + last
def get_config(argv=None):
    """Parse the command line and return ``[export_date]``.

    Args:
        argv: Optional argument list to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        A one-element list holding the value of ``-d/--export-date``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--export-date", action="store", required=True)
    args = parser.parse_args(argv)
    return [args.export_date]
yml_directory = os.listdir('yaml')
data = get_config()[0]
def main():
    """Build a ConfigurationHandler for every file in the ``yaml`` folder."""
    for yml in yml_directory:
        # Call the function: passing it uncalled hands over a function
        # object instead of the location string.
        parameter_file = get_json_location()
        # os.listdir returns bare names; join with the folder so the file
        # can be opened regardless of the current working directory.
        export_data_file = os.path.join('yaml', yml)
        new_export_date = data
        conf_handl = ConfigurationHandler(parameter_file, export_data_file, new_export_date)
        configuration = conf_handl.get_config()
if __name__ == '__main__':
    main()
The issue is , within export_data_file , i don't really want to be passing a file_path location , i rather have it loop through each file_name in the yml directory. When doing so i get an error saying ,'Error reading config file'
This question already has answers here:
How can I replace text in a PDF using Python?
(4 answers)
Closed 14 hours ago.
I am writing mailmerge software as part of a Python web app.
I have a template called letter.pdf which was generated from a MS Word file and includes the text {name} where the resident's name will go. I also have a list of c. 100 residents' names.
What I want to do is to read in letter.pdf do a search for "{name}" and replace it with the resident's name (for each resident) then write the result to another pdf. I then want to gather all these pdfs together into a big pdf (one page per letter) which my web app's users will print out to create their letters.
Are there any Python libraries that will do this? I've looked at pdfrw and pdfminer but I couldn't see where they would be able to do it.
(NB: I also have the MS Word file, so if there was another way of using that, and not going through a pdf, that would also do the job.)
This can be done with PyPDF2 package. The implementation may depend on the original PDF template structure. But if the template is stable enough and isn't changed very often the replacement code shouldn't be generic but rather simple.
I did a small sketch on how you could replace the text inside a PDF file. It replaces all occurrences of PDF tokens to DOC.
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject
def replace_text(content, replacements=None):
    """Apply text replacements to the show-text lines of a PDF content stream.

    Only lines between a ``BT`` line and an ``ET`` line that end in ``Tj`` or
    ``TJ`` are rewritten; every other line is passed through unchanged.

    Args:
        content: Decoded content stream, one operator per line.
        replacements: Mapping of ``old -> new`` substrings. ``None`` or empty
            means no replacement.

    Returns:
        The stream with each input line emitted once, followed by ``"\\n"``.
    """
    replacements = replacements or {}
    out = []
    in_text = False
    for line in content.splitlines():
        if line == "BT":
            in_text = True
        elif line == "ET":
            in_text = False
        elif in_text and line[-2:].lower() == 'tj':
            for k, v in replacements.items():
                line = line.replace(k, v)
        out.append(line + "\n")
    return "".join(out)
def process_data(object, replacements):
    """Rewrite the text of one PyPDF2 stream object in place.

    Args:
        object: Stream object exposing ``getData``/``setData`` (and
            ``decodedSelf`` for encoded streams).
        replacements: Mapping of ``old -> new`` substrings for ``replace_text``.
    """
    data = object.getData()
    decoded_data = data.decode('utf-8')
    replaced_data = replace_text(decoded_data, replacements)
    encoded_data = replaced_data.encode('utf-8')
    if object.decodedSelf is not None:
        # Encoded stream: the decoded twin holds the bytes that get written.
        object.decodedSelf.setData(encoded_data)
    else:
        object.setData(encoded_data)
if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--input", required=True, help="path to PDF document")
    args = vars(ap.parse_args())
    in_file = args["input"]
    # splitext keeps dots inside directory names intact.
    filename_base = os.path.splitext(in_file)[0]
    # Provide replacements list that you need here
    replacements = { 'PDF': 'DOC'}
    pdf = PdfFileReader(in_file)
    writer = PdfFileWriter()
    stream_types = (DecodedStreamObject, EncodedStreamObject)
    for page_number in range(0, pdf.getNumPages()):
        page = pdf.getPage(page_number)
        contents = page.getContents()
        if isinstance(contents, stream_types):
            process_data(contents, replacements)
        elif len(contents) > 0:
            # The page content is an array of streams: process each one.
            for obj in contents:
                if isinstance(obj, stream_types):
                    streamObj = obj.getObject()
                    process_data(streamObj, replacements)
        writer.addPage(page)
    with open(filename_base + ".result.pdf", 'wb') as out_file:
        writer.write(out_file)
The results are
UPDATE 2021-03-21:
Updated the code example to handle DecodedStreamObject and EncodedStreamObject, which actually contain the data stream with the text to update.
If #Dmytrio solution do not alter final PDF
Dymitrio's updated code example to handle DecodedStreamObject and EncodedStreamObject which actually contain data stream with text to update could run fine, but with a file different from example, was not able to alter pdf text content.
According to EDIT 3, from How to replace text in a PDF using Python?:
By inserting page[NameObject("/Contents")] = contents.decodedSelf before writer.addPage(page), we force pyPDF2 to update content of the page object.
This way I was able to overcome this problem and replace text from pdf file.
Final code should look like this:
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject, NameObject
def replace_text(content, replacements=None):
    """Apply text replacements to the show-text lines of a PDF content stream.

    Only lines between a ``BT`` line and an ``ET`` line that end in ``Tj`` or
    ``TJ`` are rewritten; every other line is passed through unchanged.

    Args:
        content: Decoded content stream, one operator per line.
        replacements: Mapping of ``old -> new`` substrings. ``None`` or empty
            means no replacement.

    Returns:
        The stream with each input line emitted once, followed by ``"\\n"``.
    """
    replacements = replacements or {}
    out = []
    in_text = False
    for line in content.splitlines():
        if line == "BT":
            in_text = True
        elif line == "ET":
            in_text = False
        elif in_text and line[-2:].lower() == 'tj':
            for k, v in replacements.items():
                line = line.replace(k, v)
        out.append(line + "\n")
    return "".join(out)
def process_data(object, replacements):
    """Rewrite the text of one PyPDF2 stream object in place.

    Args:
        object: Stream object exposing ``getData``/``setData`` (and
            ``decodedSelf`` for encoded streams).
        replacements: Mapping of ``old -> new`` substrings for ``replace_text``.
    """
    data = object.getData()
    decoded_data = data.decode('utf-8')
    replaced_data = replace_text(decoded_data, replacements)
    encoded_data = replaced_data.encode('utf-8')
    if object.decodedSelf is not None:
        # Encoded stream: the decoded twin holds the bytes that get written.
        object.decodedSelf.setData(encoded_data)
    else:
        object.setData(encoded_data)
if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--input", required=True, help="path to PDF document")
    args = vars(ap.parse_args())
    in_file = args["input"]
    # splitext keeps dots inside directory names intact.
    filename_base = os.path.splitext(in_file)[0]
    # Provide replacements list that you need here
    replacements = { 'PDF': 'DOC'}
    pdf = PdfFileReader(in_file)
    writer = PdfFileWriter()
    stream_types = (DecodedStreamObject, EncodedStreamObject)
    for page_number in range(0, pdf.getNumPages()):
        page = pdf.getPage(page_number)
        contents = page.getContents()
        if isinstance(contents, stream_types):
            process_data(contents, replacements)
        elif len(contents) > 0:
            # The page content is an array of streams: process each one.
            for obj in contents:
                if isinstance(obj, stream_types):
                    streamObj = obj.getObject()
                    process_data(streamObj, replacements)
        # Force content replacement
        # Guarded: arrays have no decodedSelf and plain decoded streams keep
        # it as None, so an unconditional assignment would fail or blank the
        # page.
        decoded = getattr(contents, "decodedSelf", None)
        if decoded is not None:
            page[NameObject("/Contents")] = decoded
        writer.addPage(page)
    with open(filename_base + ".result.pdf", 'wb') as out_file:
        writer.write(out_file)
Important: from PyPDF2.generic import NameObject
Decompress the pdf to make parsing easier (solves many of the issues in the previous answer). I use pdftk. (If this step fails, one hack to pre-process the pdf is to open the pdf in OSX Preview, print it, and then choose save as pdf from the print menu. Then retry the command below.)
pdftk original.pdf output uncompressed.pdf uncompress
Parse and replace using PyPDF2.
from PyPDF2 import PdfFileReader, PdfFileWriter

# (old, new) pairs replaced byte-for-byte in every page's content stream.
replacements = [
    ("old string", "new string")
]

# The reader pulls page data from the source on demand, so the source file
# must stay open until the writer has finished.
with open("uncompressed.pdf", "rb") as src:
    pdf = PdfFileReader(src)
    writer = PdfFileWriter()
    for page in pdf.pages:
        stream = page.getContents()
        contents = stream.getData()
        for (a,b) in replacements:
            contents = contents.replace(a.encode('utf-8'), b.encode('utf-8'))
        # Store the edited bytes back on the page before copying it over.
        stream.setData(contents)
        writer.addPage(page)
    with open("modified.pdf", "wb") as f:
        writer.write(f)
[Optional] Re-compress the pdf.
pdftk modified.pdf output recompressed.pdf compress
Here is a solution using the MS Word source file.
As trying to edit the pdf itself turned out to be too complicated for me because of the encoding errors, I went with the MS Word >> Pdf option.
Prepare MS Word template with {{input_fields}}
Fill in the template with data
Convert the filled in MS Word file to PDF
The DocxTemplate module uses jinja like syntax: {{variable_name}}
In my solution I use an intermediate temp file. I tried to get rid of this step by using BytesIO/StringIO to keep it in memory only, but I haven't made that work yet.
Here is an easy and working solution to perform the required task:
import os
import comtypes.client
from pathlib import Path
from docxtpl import DocxTemplate
import random
in_file_path = "files/template.docx"
temp_file_path = "files/"+str(random.randint(0,50))+".docx"
out_file_path = "files/output.pdf"
# Fill in text
data_to_fill = {'Field_name' : "John Tester",
                'Field_ocupation' : "Test tester",
                'Field_address' : "Test Address 123",
                }
template = DocxTemplate(Path(in_file_path))
template.render(data_to_fill)
# The filled-in document is saved to a temp file that Word opens below.
template.save(Path(temp_file_path))
# Convert to PDF
wdFormatPDF = 17
in_file = os.path.abspath(Path(temp_file_path))
out_file = os.path.abspath(Path(out_file_path))
word = comtypes.client.CreateObject('Word.Application')
try:
    doc = word.Documents.Open(in_file)
    try:
        doc.SaveAs(out_file, FileFormat=wdFormatPDF)
    finally:
        doc.Close()
finally:
    # Always release Word, otherwise a hidden instance keeps the file locked.
    word.Quit()
    # Get rid of the temp file
    os.remove(Path(temp_file_path))
If you were to save an image using its URL, how would you do it?
Also how do I give the Image a unique file name while saving it.
response = urllib.urlopen(image_url)
file_name = ''.join(random.choice(string.ascii_uppercase + string.digits) for x in range(10))
f = open('/media/images/temp/'+file_name, "wb")
It throws no error nor saves the file... I'm new to this I have no clue what is going wrong : |
import urllib.request
import string
import random
import os

filename_charset = string.ascii_letters + string.digits
filename_length = 10
file_save_dir = '/home/user/download/'
# Random file name so that repeated downloads do not overwrite each other.
filename = ''.join(random.choice(filename_charset)
                   for s in range(filename_length))
# On Python 3 urlretrieve lives in urllib.request, not in urllib itself.
urllib.request.urlretrieve("http://www.example.com/image.png",
                           os.path.join(file_save_dir, filename + '.png'))