Consider a folder structure with files in this fashion:
abc.csv
abc.json
bcd.csv
bcd.json
efg.csv
efg.json
and so on, i.e. pairs of CSV and JSON files sharing the same base name. I have to read each same-named pair, perform some operation on it, and then proceed to the next pair. How do I go about this?
Basically, what I have in mind as pseudocode is:
for files in folder_name:
    df_csv = pd.read_csv('abc.csv')
    df_json = pd.read_json('abc.json')
    # some script to execute
    # now read the next pair and repeat for all files
Did you think of something like this?
import os

# collects filenames in the folder
filelist = os.listdir()

# iterates through filenames in the folder
for file in filelist:
    # pairs the .csv files with the .json files
    if file.endswith(".csv"):
        with open(file) as csv_file:
            pre, ext = os.path.splitext(file)
            secondfile = pre + ".json"
            with open(secondfile) as json_file:
                # do something
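For the "# do something" part, you could hand the pair to pandas, for example (a sketch; it assumes pandas is installed and both files parse cleanly):
import os
import pandas as pd

for file in os.listdir():
    if file.endswith(".csv"):
        pre, _ = os.path.splitext(file)
        with open(file) as csv_file, open(pre + ".json") as json_file:
            df_csv = pd.read_csv(csv_file)
            df_json = pd.read_json(json_file)
            # process df_csv and df_json here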
You can use the glob module to extract the file names matching a pattern:
import glob
import os.path

for csvfile in glob.iglob('*.csv'):
    jsonfile = csvfile[:-3] + 'json'
    # optionally check that the partner file exists
    if not os.path.exists(jsonfile):
        # show error message
        ...
        continue
    # do something with csvfile
    # do something else with jsonfile
    # and proceed to the next pair
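A pathlib variant of the same idea (a sketch; it assumes the paired files live in the current working directory):
from pathlib import Path

for csv_path in Path('.').glob('*.csv'):
    json_path = csv_path.with_suffix('.json')
    if not json_path.exists():
        continue  # or report the missing partner file
    # read csv_path and json_path here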
If the directory structure is consistent you could do the following:
import os
import pandas as pd

dir_path = './path/to/dir'
for f_name in {x.split('.')[0] for x in os.listdir(dir_path)}:
    df_csv = pd.read_csv(os.path.join(dir_path, f"{f_name}.csv"))
    df_json = pd.read_json(os.path.join(dir_path, f"{f_name}.json"))
    # execute the rest
I'm trying to remove all the Outlook .ost and .nst files from the users' folders on a network PC, and I'm also trying to get it to write the files that were removed to a CSV file.
I'm able to get it to find all the files in the directory and write them to a CSV file, but when I try to remove the files with os.remove it doesn't seem to run, so I've commented it out for the time being.
I added the try and except to skip the files that are in use.
import os
import sys

sys.stdout = open("output_file.csv", "w")

try:
    for rootDir, subdir, files in os.walk("//network_pc_name/c$/Users"):
        for filenames in files:
            if filenames.endswith((".nst", ".ost")):
                foundfiles = os.path.join(rootDir, filenames)
                #os.remove(os.path.join(rootDir, filenames))
                print(foundfiles)
except:
    pass

sys.stdout.close()
I made some changes to the script as suggested and it appears to run a lot quicker; however, I can't seem to figure out how to ignore files which are in use.
I switched the file extensions to .xlsx and .txt to simulate the .xlsx file being open (receiving the permissions error) and to see whether the script would continue to run and remove the .txt file.
I got the following error:
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: '//DESKTOP-HRLS19N/c$/globtest\Book1.xlsx'
import glob
import os

files = [i for i in glob.glob("//DESKTOP-HRLS19N/c$/globtest/**", recursive=True) if i.endswith((".xlsx", ".txt"))]
[os.remove(f) for f in files]

with open("output_file.csv", "w") as f:
    f.writelines("\n".join(files))
In my experience glob is much easier:
print([i for i in glob.glob("//network_pc_name/c$/Users/**", recursive=True) if i.endswith((".nst", ".ost"))])
Assuming that prints out the files you're expecting:
import glob
import os

files = [i for i in glob.glob("//network_pc_name/c$/Users/**", recursive=True) if i.endswith((".nst", ".ost"))]
removed_files = []
for file in files:
    try:
        size = os.path.getsize(file)
        os.remove(file)
        removed_files.append(file + " Bytes: " + str(size))
    except Exception as e:
        print("Could not remove file: " + file)

with open("output_file.csv", "w") as f:
    f.writelines("\n".join(removed_files))
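If you specifically want to skip files that are locked by another process (the PermissionError case) rather than swallowing every error, you could narrow the except clause; a minimal sketch reusing the files list from above:
import os

removed_files = []
for file in files:  # 'files' is the glob result gathered above
    try:
        size = os.path.getsize(file)
        os.remove(file)
        removed_files.append(file + " Bytes: " + str(size))
    except PermissionError:
        # the file is open in another process; skip it and move on
        print("Could not remove file (in use): " + file)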
I'm very new to Python/programming, and am trying to automate an office task that is very time-consuming.
I have multiple folders with PDFs. For each folder I need to combine the PDFs into one PDF and save it inside the folder it was built from. I've successfully combined the contents of one folder and saved the result to my desktop using this:
import PyPDF2
import os

Path = '/Users/jlaw/Desktop/Testing/FolderName/'
filelist = os.listdir(Path)

pdfMerger = PyPDF2.PdfFileMerger(strict=False)
for file in filelist:
    if file.endswith('.pdf'):
        pdfMerger.append(Path+file)

pdfOutput = open('Tab C.pdf', 'wb')
pdfMerger.write(pdfOutput)
pdfOutput.close()
With the below code I'm trying to do the above, but for all the folders in a specific directory. When I run this, I get "Tab C.pdf" files appearing correctly, but I'm unable to open them.
import PyPDF2
import os

Path = '/Users/jlaw/Desktop/Testing/'
folders = os.listdir(Path)

def pdf_merge(filelist, foldername):
    pdfMerger = PyPDF2.PdfFileMerger()
    for file in filelist:
        if file.endswith('.pdf'):
            pdfMerger.append(Path+foldername+"/"+file)
    pdfOutput = open(Path+foldername+'/Tab C.pdf', 'wb')
    pdfMerger.write(pdfOutput)
    pdfOutput.close()

for folder in folders:
    pdf_merge(Path+'/'+folder, folder)
I'm using Python Version: 3.8
The Tab C.pdf files are only 1 KB in size. When I try to open one with Adobe Acrobat, a pop-up says, "There was an error opening this document. This file cannot be opened because it has no pages." If I try Chrome, it will open, but it's just an empty PDF, and with Edge (Chromium based) it says, "We can't open this file. Something went wrong."
Any pieces of advice or hints are much appreciated.
The below works. I'm not experienced enough yet to know why this works while the above doesn't.
import PyPDF2
import os

Path = 'C:/Users/jlaw/Desktop/Testing/'
folders = os.listdir(Path)
pdfMerger = PyPDF2.PdfFileMerger()

def pdf_merge(filelist):  # Changed to just one argument
    pdfMerger = PyPDF2.PdfFileMerger()
    for file in os.listdir(filelist):  # added os.listdir()
        if file.endswith('.pdf'):
            pdfMerger.append(filelist+'/'+file)  # replaced Path+foldername with filelist
    pdfOutput = open(Path+folder+'/Tab C.pdf', 'wb')  # Moved back one tab to prevent infinite loop
    pdfMerger.write(pdfOutput)  # Moved back one tab to prevent infinite loop
    pdfOutput.close()  # Moved back one tab to prevent infinite loop

for folder in folders:
    pdf_merge(Path+folder)  # Removed redundant + "/"
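For what it's worth, the earlier version failed because pdf_merge received the folder path string as filelist, so the for loop iterated over the characters of that string; nothing ended in '.pdf', so the merger wrote an empty (and hence unopenable) PDF. A slightly tidier sketch of the working approach (it assumes each subfolder of Path contains the PDFs to merge):
import os
import PyPDF2

Path = '/Users/jlaw/Desktop/Testing/'

def pdf_merge(folder_path):
    merger = PyPDF2.PdfFileMerger()
    for file in os.listdir(folder_path):
        if file.endswith('.pdf'):
            merger.append(os.path.join(folder_path, file))
    with open(os.path.join(folder_path, 'Tab C.pdf'), 'wb') as pdf_output:
        merger.write(pdf_output)
    merger.close()

for folder in os.listdir(Path):
    folder_path = os.path.join(Path, folder)
    if os.path.isdir(folder_path):
        pdf_merge(folder_path)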
Folder A has more than 100 files, and folder B is my destination folder. I want to copy 10 files from folder A to folder B. The 10 file names are listed in the text file C.
import os
import shutil
from glob import glob

namelist = open('/Users/C.txt').read().splitlines()

input = '/Users/A'
output = '/Users/B'
path = '/Users/A'
files = glob(path)

for path in files:
    filedir, filename = os.path.split(path)
    for filename in namelist:
        shutil.copy2(input, output)
It returns an error. Please help me do this in Python, thanks a lot!
There are a few things that can be improved in your code:
import os
import shutil

#namelist = open('/Users/C.txt').read().splitlines()
# the context manager takes care of closing the file after open;
# splitlines() also strips the trailing newline from each name
with open('/Users/C.txt') as fp:
    namelist = fp.read().splitlines()

input = '/Users/A'
output = '/Users/B'
path = '/Users/A'
files = os.listdir(path)
# don't need the glob import as you already imported os
#files = glob(path)

# loop only through the files mentioned in the text file and see if they are
# available in folder A
for file_name in namelist:
    if file_name in files:
        file_path = os.path.join(input, file_name)
        dest_path = os.path.join(output, file_name)
        shutil.copy(file_path, dest_path)

#for path in files:
#    filedir, filename = os.path.split(path)
#    for filename in namelist:
#        shutil.copy2(input, output)
I do not have sample data or an error message to check against. From what I can see in your code,
for path in files:
    filedir, filename = os.path.split(path)
    if filename in namelist:
        shutil.copy2(path, output)
Your paths start from the root folder because of the leading forward slash. If the folders and files are relative to the location of your .py file, try putting a dot in front of them, or drop the leading slash:
./Users/A or Users/A
Is it possible, using Python, to merge separate PDF files?
Assuming so, I need to extend this a little further. I am hoping to loop through folders in a directory and repeat this procedure.
And I may be pushing my luck, but is it possible to exclude a page that is contained in each of the PDFs (my report generation always creates an extra blank page)?
You can use pypdf's PdfMerger class.
File Concatenation
You can simply concatenate files by using the append method.
from pypdf import PdfMerger

pdfs = ['file1.pdf', 'file2.pdf', 'file3.pdf', 'file4.pdf']

merger = PdfMerger()
for pdf in pdfs:
    merger.append(pdf)

merger.write("result.pdf")
merger.close()
You can pass file handles instead of file paths if you want.
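For example, appending from an open file object rather than a path might look like this (a small sketch; it assumes file1.pdf exists and keeps the handle open until the output is written):
from pypdf import PdfMerger

merger = PdfMerger()
with open('file1.pdf', 'rb') as fh:   # a file object instead of a path string
    merger.append(fh)
    merger.write("result.pdf")        # write while the handle is still open
merger.close()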
File Merging
If you want more fine-grained control of merging, there is a merge method of the PdfMerger, which allows you to specify an insertion point in the output file, meaning you can insert the pages anywhere in the file. The append method can be thought of as a merge where the insertion point is the end of the file.
e.g.
merger.merge(2, pdf)
Here we insert the whole PDF into the output but at page 2.
Page Ranges
If you wish to control which pages are appended from a particular file, you can use the pages keyword argument of append and merge, passing a tuple in the form (start, stop[, step]) (like the regular range function).
e.g.
merger.append(pdf, pages=(0, 3)) # first 3 pages
merger.append(pdf, pages=(0, 6, 2)) # pages 1,3, 5
If you specify an invalid range you will get an IndexError.
Note also that, to avoid files being left open, the PdfMerger's close method should be called when the merged file has been written. This ensures all files (input and output) are closed in a timely manner. It's a shame that PdfMerger isn't implemented as a context manager, so we could use the with keyword, avoid the explicit close call, and get some easy exception safety.
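As a workaround, contextlib.closing gives you the same with-statement behaviour, since it simply calls close() on exit (a sketch; file1.pdf and file2.pdf are placeholder names):
from contextlib import closing
from pypdf import PdfMerger

with closing(PdfMerger()) as merger:
    merger.append('file1.pdf')
    merger.append('file2.pdf')
    merger.write("result.pdf")
# merger.close() has been called automatically at this point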
You might also want to look at the pdfly cat command provided by the pypdf developers. You can potentially avoid the need to write code altogether.
The pypdf documentation also includes some example code demonstrating merging.
PyMuPDF
Another library perhaps worth a look is PyMuPDF. Merging is equally simple.
From command line:
python -m fitz join -o result.pdf file1.pdf file2.pdf file3.pdf
and from code
import fitz

result = fitz.open()
for pdf in ['file1.pdf', 'file2.pdf', 'file3.pdf']:
    with fitz.open(pdf) as mfile:
        result.insert_pdf(mfile)
result.save("result.pdf")
With plenty of options, detailed in the project's wiki.
Note: in older versions of PyMuPDF, insert_pdf was called insertPDF.
Use pyPdf or its successor PyPDF2:
A Pure-Python library built as a PDF toolkit. It is capable of:
splitting documents page by page,
merging documents page by page,
(and much more)
Here's a sample program that works with both versions.
#!/usr/bin/env python
import sys
try:
    from PyPDF2 import PdfFileReader, PdfFileWriter
except ImportError:
    from pyPdf import PdfFileReader, PdfFileWriter

def pdf_cat(input_files, output_stream):
    input_streams = []
    try:
        # First open all the files, then produce the output file, and
        # finally close the input files. This is necessary because
        # the data isn't read from the input files until the write
        # operation. Thanks to
        # https://stackoverflow.com/questions/6773631/problem-with-closing-python-pypdf-writing-getting-a-valueerror-i-o-operation/6773733#6773733
        for input_file in input_files:
            input_streams.append(open(input_file, 'rb'))
        writer = PdfFileWriter()
        for reader in map(PdfFileReader, input_streams):
            for n in range(reader.getNumPages()):
                writer.addPage(reader.getPage(n))
        writer.write(output_stream)
    finally:
        for f in input_streams:
            f.close()
        output_stream.close()

if __name__ == '__main__':
    if sys.platform == "win32":
        import os, msvcrt
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    pdf_cat(sys.argv[1:], sys.stdout)
Merge all pdf files that are present in a dir
Put the pdf files in a dir. Launch the program. You get one pdf with all the pdfs merged.
import os
from PyPDF2 import PdfMerger

x = [a for a in os.listdir() if a.endswith(".pdf")]

merger = PdfMerger()
for pdf in x:
    merger.append(open(pdf, 'rb'))

with open("result.pdf", "wb") as fout:
    merger.write(fout)
Here's how I would write the same code today:
from glob import glob
from PyPDF2 import PdfMerger

def pdf_merge():
    ''' Merges all the pdf files in the current directory '''
    merger = PdfMerger()
    allpdfs = [a for a in glob("*.pdf")]
    [merger.append(pdf) for pdf in allpdfs]
    with open("Merged_pdfs.pdf", "wb") as new_file:
        merger.write(new_file)

if __name__ == "__main__":
    pdf_merge()
The pdfrw library can do this quite easily, assuming you don't need to preserve bookmarks and annotations, and your PDFs aren't encrypted. cat.py is an example concatenation script, and subset.py is an example page subsetting script.
The relevant part of the concatenation script -- assumes inputs is a list of input filenames, and outfn is an output file name:
from pdfrw import PdfReader, PdfWriter

writer = PdfWriter()
for inpfn in inputs:
    writer.addpages(PdfReader(inpfn).pages)
writer.write(outfn)
As you can see from this, it would be pretty easy to leave out the last page, e.g. something like:
writer.addpages(PdfReader(inpfn).pages[:-1])
Disclaimer: I am the primary pdfrw author.
Is it possible, using Python, to merge separate PDF files?
Yes.
The following example merges all files in one folder to a single new PDF file:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from argparse import ArgumentParser
from glob import glob
from pyPdf import PdfFileReader, PdfFileWriter
import os

def merge(path, output_filename):
    output = PdfFileWriter()

    for pdffile in glob(path + os.sep + '*.pdf'):
        if pdffile == output_filename:
            continue
        print("Parse '%s'" % pdffile)
        document = PdfFileReader(open(pdffile, 'rb'))
        for i in range(document.getNumPages()):
            output.addPage(document.getPage(i))

    print("Start writing '%s'" % output_filename)
    with open(output_filename, "wb") as f:
        output.write(f)

if __name__ == "__main__":
    parser = ArgumentParser()

    # Add more options if you like
    parser.add_argument("-o", "--output",
                        dest="output_filename",
                        default="merged.pdf",
                        help="write merged PDF to FILE",
                        metavar="FILE")
    parser.add_argument("-p", "--path",
                        dest="path",
                        default=".",
                        help="path of source PDF files")

    args = parser.parse_args()
    merge(args.path, args.output_filename)
from PyPDF2 import PdfFileMerger
import webbrowser
import os

dir_path = os.path.dirname(os.path.realpath(__file__))

def list_files(directory, extension):
    return (f for f in os.listdir(directory) if f.endswith('.' + extension))

pdfs = list_files(dir_path, "pdf")

merger = PdfFileMerger()
for pdf in pdfs:
    merger.append(open(pdf, 'rb'))

with open('result.pdf', 'wb') as fout:
    merger.write(fout)

webbrowser.open_new('file://' + dir_path + '/result.pdf')
Git Repo: https://github.com/mahaguru24/Python_Merge_PDF.git
You can use pikepdf too (source code documentation).
Example code could be (taken from the documentation):
from glob import glob
from pikepdf import Pdf

pdf = Pdf.new()
for file in glob('*.pdf'):  # you can change this to browse directories recursively
    with Pdf.open(file) as src:
        pdf.pages.extend(src.pages)
pdf.save('merged.pdf')
pdf.close()
If you want to exclude pages, you might proceed another way, for instance by copying pages to a new pdf and selecting which ones you do not copy (the pdf.pages object behaves like a list), as in the sketch below.
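For instance, dropping the last page of every input while merging could look like this (a sketch; it assumes each source PDF has at least one page and the output name is just an example):
from glob import glob
from pikepdf import Pdf

pdf = Pdf.new()
for file in glob('*.pdf'):
    with Pdf.open(file) as src:
        # copy everything except the last page of each source
        pdf.pages.extend(src.pages[:-1])
pdf.save('merged_without_last_pages.pdf')
pdf.close()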
It is still actively maintained, which, as of February 2022, does not seem to be the case for PyPDF2 or pdfrw.
I haven't benchmarked it, so I don't know if it is quicker or slower than other solutions.
One advantage over PyMuPDF, in my case, is that an official Ubuntu package is available (python3-pikepdf), which is practical for packaging my own software that depends on it.
This post, http://pieceofpy.com/2009/03/05/concatenating-pdf-with-python/, gives a solution.
Similarly:
from pyPdf import PdfFileWriter, PdfFileReader

def append_pdf(input, output):
    [output.addPage(input.getPage(page_num)) for page_num in range(input.numPages)]

output = PdfFileWriter()

append_pdf(PdfFileReader(file("C:\\sample.pdf", "rb")), output)
append_pdf(PdfFileReader(file("c:\\sample1.pdf", "rb")), output)
append_pdf(PdfFileReader(file("c:\\sample2.pdf", "rb")), output)
append_pdf(PdfFileReader(file("c:\\sample3.pdf", "rb")), output)

output.write(file("c:\\combined.pdf", "wb"))
------ Updated on 25th Nov. ------
------ The code above doesn't seem to work anymore; please use the following: ------
from PyPDF2 import PdfFileMerger, PdfFileReader
import os

merger = PdfFileMerger()
file_folder = "C:\\My Documents\\"
root, dirs, files = next(os.walk(file_folder))

for path, subdirs, files in os.walk(root):
    for f in files:
        if f.endswith(".pdf"):
            merger.append(file_folder + f)

merger.write(file_folder + "Economists-1.pdf")
Here's a time comparison for the most common answers for my specific use case: combining a list of 5 large single-page pdf files. I ran each test twice.
(Disclaimer: I ran this function within Flask, your mileage may vary)
TL;DR
pdfrw is the fastest library for combining pdfs out of the 3 I tested.
PyPDF2
start = time.time()

merger = PdfFileMerger()
for pdf in all_pdf_obj:
    merger.append(
        os.path.join(
            os.getcwd(), pdf.filename  # full path
        )
    )

formatted_name = f'Summary_Invoice_{date.today()}.pdf'
merge_file = os.path.join(os.getcwd(), formatted_name)
merger.write(merge_file)
merger.close()

end = time.time()
print(end - start)  #1 66.50084733963013 #2 68.2995400428772
PyMuPDF
start = time.time()

result = fitz.open()
for pdf in all_pdf_obj:
    with fitz.open(os.path.join(os.getcwd(), pdf.filename)) as mfile:
        result.insertPDF(mfile)

formatted_name = f'Summary_Invoice_{date.today()}.pdf'
result.save(formatted_name)

end = time.time()
print(end - start)  #1 2.7166640758514404 #2 1.694727897644043
pdfrw
start = time.time()

writer = PdfWriter()
for pdf in all_pdf_obj:
    writer.addpages(PdfReader(os.path.join(os.getcwd(), pdf.filename)).pages)

formatted_name = f'Summary_Invoice_{date.today()}.pdf'
writer.write(formatted_name)

end = time.time()
print(end - start)  #1 0.6040127277374268 #2 0.9576816558837891
A slight variation using a dictionary for greater flexibility (e.g. sort, dedup):
import os
from PyPDF2 import PdfFileMerger

# use dict to sort by filepath or filename
file_dict = {}
for subdir, dirs, files in os.walk("<dir>"):
    for file in files:
        filepath = subdir + os.sep + file
        # you can have multiple endswith
        if filepath.endswith((".pdf", ".PDF")):
            file_dict[file] = filepath

# use strict = False to ignore PdfReadError: Illegal character error
merger = PdfFileMerger(strict=False)

for k, v in file_dict.items():
    print(k, v)
    merger.append(v)

merger.write("combined_result.pdf")
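For instance, to control the merge order you could iterate the keys in sorted order (a sketch that reuses the file_dict built above; the output name is just an example):
merger = PdfFileMerger(strict=False)
for filename in sorted(file_dict):   # alphabetical by filename
    merger.append(file_dict[filename])
merger.write("combined_result_sorted.pdf")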
I used pdfunite on the Linux terminal by leveraging subprocess (this assumes one.pdf and two.pdf exist in the directory), and the aim is to merge them into three.pdf:
import subprocess
subprocess.call(['pdfunite one.pdf two.pdf three.pdf'],shell=True)
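Passing the arguments as a list avoids going through the shell; this should be equivalent:
import subprocess
subprocess.call(['pdfunite', 'one.pdf', 'two.pdf', 'three.pdf'])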
You can use PdfFileMerger from the PyPDF2 module.
For example, to merge multiple PDF files from a list of paths you can use the following function:
from PyPDF2 import PdfFileMerger

# pass the path of the output final file.pdf and the list of paths
def merge_pdf(out_path: str, extracted_files: list[str]):
    merger = PdfFileMerger()
    for pdf in extracted_files:
        merger.append(pdf)
    merger.write(out_path)
    merger.close()

merge_pdf('./final.pdf', extracted_files)
And this function to get all the files recursively from a parent folder:
import os

# pass the path of the parent_folder
def fetch_all_files(parent_folder: str):
    target_files = []
    for path, subdirs, files in os.walk(parent_folder):
        for name in files:
            target_files.append(os.path.join(path, name))
    return target_files

# get a list of all the paths of the pdf
extracted_files = fetch_all_files('./parent_folder')
Finally, you use the two functions, declaring a parent_folder_path that can contain multiple documents and an output_pdf_path for the destination of the merged PDF:
# get a list of all the paths of the pdf
parent_folder_path = './parent_folder'
output_pdf_path = './final.pdf'

extracted_files = fetch_all_files(parent_folder_path)
merge_pdf(output_pdf_path, extracted_files)
You can get the full code from here (Source): How to merge PDF documents using Python
The answer from Giovanni G. PY in an easily usable way (at least for me):
import os
from PyPDF2 import PdfFileMerger

def merge_pdfs(export_dir, input_dir, folder):
    current_dir = os.path.join(input_dir, folder)
    pdfs = os.listdir(current_dir)

    merger = PdfFileMerger()
    for pdf in pdfs:
        merger.append(open(os.path.join(current_dir, pdf), 'rb'))

    with open(os.path.join(export_dir, folder + ".pdf"), "wb") as fout:
        merger.write(fout)

export_dir = r"E:\Output"
input_dir = r"E:\Input"
folders = os.listdir(input_dir)

[merge_pdfs(export_dir, input_dir, folder) for folder in folders]
Use the right Python interpreter:
conda activate py_envs
pip install PyPDF2
Python code:
from PyPDF2 import PdfMerger

# set the working directory to the folder with the files
import os
os.chdir('/your/path/to/folder/')
cwd = os.path.abspath('')
files = os.listdir(cwd)

def merge_pdf_files():
    merger = PdfMerger()
    pdf_files = [x for x in files if x.endswith(".pdf")]
    [merger.append(pdf) for pdf in pdf_files]
    with open("merged_pdf_all.pdf", "wb") as new_file:
        merger.write(new_file)

if __name__ == "__main__":
    merge_pdf_files()
def pdf_merger(path):
    """Merge the pdfs in the given directory into one pdf"""
    import logging
    logging.basicConfig(filename='output.log', level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    try:
        import glob, os
        import PyPDF2
        os.chdir(path)

        pdfs = []
        for file in glob.glob("*.pdf"):
            pdfs.append(file)

        if len(pdfs) == 0:
            logging.info("No pdf in the given directory")
        else:
            merger = PyPDF2.PdfFileMerger()
            for pdf in pdfs:
                merger.append(pdf)
            merger.write('result.pdf')
            merger.close()
    except Exception as e:
        logging.error('An error has happened')
        logging.exception('Exception occurred: ' + str(e))
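A call might then look like this (the folder path is just a hypothetical example):
pdf_merger('/path/to/folder/with/pdfs')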