Python script to merge PDF files with a blank page

Python script to merge PDF files with a blank page - python

I have the following script compiled from other's suggestions, but I can't seem to get it to run properly. I need to merge several 3 page bill files into a single file for printing while adding a blank page in between each bill file so that each bill prints properly (we don't want the first page of one bill printed on the back of the previous bill).
# If the file errors with "no module PyPDF2" then from command line, run pip install PyPDF2
import os
from os import listdir, mkdir, startfile
from os.path import isfile, join, exists
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
#Input file path and print the pdf files in that path
path = input("Enter the folder location: ")
pdffiles = [f for f in listdir(path) if isfile(join(path, f)) and '.pdf' in f]
print('\nList of PDF Files:\n')
for file in pdffiles:
print(file)
def add_blank_to_end(pdffiles: list) -> list:
names = []
for f in pdffiles:
pdf_in = open(f, 'rb')
pdf_file = PdfFileReader(pdf_in)
output = PdfFileWriter()
output.appendPagesFromReader(pdf_file)
output.addBlankPage()
names.append(f'b{f}')
outputStream = open(f'b{f}', 'wb')
output.write(outputStream)
return names
#Append the pdf files
def merge_pdfs(pdffiles: list):
merger = PdfFileMerger()
for f in pdffiles:
merger.append(f)
merger.write("document-output.pdf")
with_blank = add_blank_to_end(pdffiles)
merge_pdfs(with_blank)

# If the file errors with "no module PyPDF2" then from command line, run pip install PyPDF2
import os
from os import listdir, mkdir, startfile
from os.path import isfile, join, exists
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
#Input file path and print the pdf files in that path
path = input("Enter the folder location")
pdffiles = [f for f in listdir(path) if isfile(join(path, f)) and '.pdf' in f]
print('\nList of PDF Files:\n')
for file in pdffiles:
print(file)
def add_blank_to_end(pdffiles: list) -> list:
names = []
for f in pdffiles:
pdf_in = open(path+'/'+f, 'rb')
pdf_file = PdfFileReader(pdf_in)
output = PdfFileWriter()
output.appendPagesFromReader(pdf_file)
output.addBlankPage()
names.append(f'b{f}')
outputStream = open(f'b{f}', 'wb')
output.write(outputStream)
return names
def merge_pdfs(pdffiles: list):
merger = PdfFileMerger()
for f in pdffiles:
merger.append(f)
merger.write("document-output.pdf")
with_blank = add_blank_to_end(pdffiles)
merge_pdfs(with_blank)

Related

Getting this error: RuntimeError: Proxy error(FileNotFoundException): Could not find file 'C:\Users\user\stuff\tests\pythonlearn.pdf'

import os
import aspose.words as aw
rootdir = 'C:/Users/user/stuff/tests'
for subdir, dirs, files in os.walk(rootdir):
for file in files:
a = os.path.join(subdir, file)
doc = aw.Document(a)
doc.save("utput.docx")
doc = aw.Document("Output.docx")
doc.save("output.pdf")
This is my program.
I am trying to run python through a folder containing pdf files, and decrypt them one by one by converting it to word, and then to pdf. What am I doing wrong?

Don't use os.walk. use os.listdir(rootdir) instead. Please note that the saved file and the used file have the same name.
Example:
import os
import aspose.words as aw
root = "C:/Users/user/stuff/tests"
for item in os.listdir(root):
if os.path.isfile(os.path.join(root, item)):
doc = aw.Document(item)
doc.save("Output.docx")
doc = aw.Document("Output.docx")
doc.save("output.pdf")
[EDIT]
above code cant find other folders so i decide to use glob to find all folders
Here:
import os
import aspose.words as aw
import glob
# Set base directory
os.chdir(os.path.join("C:/Users/user/stuff/tests"))
# Geting all pdf files in list
pdf_files = glob.glob("*.pdf")
for files in pdf_files:
doc = aw.Document(files)
doc.save("Output.docx")
doc = aw.Document("Output.docx")
doc.save("output.pdf")
[EDIT-2]
First take all .pdf files in one list :
pdf_files = glob.glob("*.pdf")
other_pdf_files = glob.glob('*/*.pdf')
all_pdf_files=(*pdf_files,*other_pdf_files)
Secondly, you need to use PyPDF2 to get rid of password.
Get unencrypted pdfs by sending all pdf files into decrypt_pdf (don't forget to specify the password). For example: (More detail here and here)
from PyPDF2 import PdfFileReader, PdfFileWriter
def decrypt_pdf(input_path, output_path, password):
with open(input_path, 'rb') as input_file, \
open(output_path, 'wb') as output_file:
reader = PdfFileReader(input_file)
reader.decrypt(password)
writer = PdfFileWriter()
for i in range(reader.getNumPages()):
writer.addPage(reader.getPage(i))
writer.write(output_file)
You can run other parts in the same way.
for files in all_pdf_files:
doc = aw.Document(files)
...

Merge multiple pdf files into single txt file

I want to loop through a directory of .pdf files and merge all of them into a single .txt file. So far I have this code, but it won't write anything into file.txt. I get the following error:
raise PdfReadError("EOF marker not found")
PyPDF2.errors.PdfReadError: EOF marker not found
Here's the code:
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
for filename in os.listdir("path"):
path = os.path.join("path/", filename)
print(path)
with open('file.txt', 'w', encoding='utf-8') as file:
for page_num in range(PdfFileReader(path, strict=False).numPages):
print('Página: {0}'.format(page_num))
pageObj = PdfFileReader(path).getPage(page_num)
try:
txt = pageObj.extractText()
except:
pass
else:
file.write('Page{0}\n'.format(page_num+1))
file.write(txt)
file.close()

disclaimer I am the author of borb, the library used in this answer
from borb.pdf import PDF
from borb.toolkit import SimpleTextExtraction
from pathlib import Path
def get_text_from_pdf(p: Path) -> str:
l: SimpleTextExtraction = SimpleTextExtraction()
with open(p, "rb") as fh:
PDF.loads(fh, [l])
return "".join([(v+"\n") for k,v in l.get_text().items()])
Now you can simply call this for every file in the directory.

Combine PDF using PyPDF2

Anyone have ideas on how to combine two pdfs into single A4 pdf file. I can combine on landscape, but not on portrait. Below is my code using Python3 and PyPDF2.
(assuming there are 2 pdf files in "output" directory which named "left.pdf" and "right.pdf")
import sys import os import PyPDF2
from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict from PyPDF2 import PdfFileMerger, PageObject
inpfn = 'output.pdf'
outfn = 'output/right.pdf'
reader = PdfReader(inpfn)
writer = PdfWriter(outfn)
writer.addpage(adjust3(reader.pages[0]))
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write()
merger = PdfFileMerger()
path_to_files = r'output/'
for root, dirs, file_names in os.walk(path_to_files):
for file_name in file_names:
merger.append(path_to_files + file_name)
merger.write("file_gabungan.pdf")
merger.close()

How to read all files in one folder and apply a function over them in python?

I would like to run a function over all files in one folder and create new files out of them. I have put the code for one file bellow. I would appreciate it if you kindly help me.
def newfield2(infile,outfile):
output = ["%s\t%s" %(item.strip(),2) for item in infile]
outfile.write("\n".join(output))
outfile.close()
return outfile
infile = open("E:/SAGA/data/2006last/325125401.all","r")
outfile = open("E:/SAGA/data/2006last/325125401_edit.all","r")
I would like to change all the files in the 'E:/SAGA/data/2006last/' folder and create new files with edit extension.

Use os.listdir() to list all files in a directory. The function returns just the filenames, not the full path. The os.path module gives you the tools to construct filenames as needed:
import os
folder = 'E:/SAGA/data/2006last'
for filename in os.listdir(folder):
infilename = os.path.join(folder, filename)
if not os.path.isfile(infilename): continue
base, extension = os.path.splitext(filename)
infile = open(infilename, 'r')
outfile = open(os.path.join(folder, '{}_edit.{}'.format(base, extension)), 'w')
newfield2(infile, outfile)

import os
def apply_to_all_files:
for sub_path in os.listdir(path):
next_path = os.path.join(path, sub_path)
if os.path.isfile(next_path):
infile = open(next_path,"r")
outfile = open(next_path + '.out', "w")
newfield2(infile, outfile)

Removing path from a zip file using python

I have a zip file that has a path. When I unzip the file using python and put it in my target folder, it then creates all of the files in the path inside my target folder.
Target: d:\unzip_files
zip file has a path and file name of: \NIS\TEST\Files\tnt.png
What happens: d:\unzip_files\NIS\TEST\Files\tnt.png
Is there a way to hae it just unzip the tnt.png file into d:\unzip_files? Or will I have to read down the list and move the file and then delete all of the empty folders?
import os, sys, zipfile
zippath = r"D:\zip_files\test.zip"
zipdir = r"D:\unzip_files"
zfile = zipfile.ZipFile(zippath, "r")
for name in zfile.namelist():
zfile.extract(name, zipdir)
zfile.close()
So, this is what worked..
import os, sys, zipfile
zippath = r"D:\zip_files\test.zip"
zipdir = r"D:\unzip_files"
zfile = zipfile.ZipFile(zippath, "r")
for name in zfile.namelist():
fname = os.path.join(zipdir, os.path.basename(name))
fout = open(fname, "wb")
fout.write(zfile.read(name))
fout.close()
Thanks for the help.

How about reading file as binary and dump it? Need to deal cases where there is pre-existing file.
for name in zfile.namelist():
fname = os.path.join(zipdir, os.path.basename(name))
fout = open(fname, 'wb')
fout.write(zfile.read(name))

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python script to merge PDF files with a blank page - python

Related

Getting this error: RuntimeError: Proxy error(FileNotFoundException): Could not find file 'C:\Users\user\stuff\tests\pythonlearn.pdf'

Merge multiple pdf files into single txt file

Combine PDF using PyPDF2

How to read all files in one folder and apply a function over them in python?

Removing path from a zip file using python

Categories

Resources