I'm trying to combine 40+ fillable PDFs into one PDF. Each PDF has one page, and they are all the same form filled with different data. I followed the script from "PyPDF2 PdfFileMerger loosing PDF module in merged file" to merge the PDFs, but the merged output repeats the first file 45 times (for 45 input files). Does anyone know what the issue could be and how to fix it? Thank you in advance!
first i define a function to fill forms:
def set_need_appearances_writer(writer):
    """Set ``/NeedAppearances`` to true in the writer's ``/AcroForm`` dictionary.

    If the writer's root object has no ``/AcroForm`` entry yet, one is
    registered first as an indirect object.

    Args:
        writer: A PyPDF2 ``PdfFileWriter``; it is modified in place.

    Returns:
        The same ``writer`` object, whether or not the update succeeded.
        Any exception is printed and swallowed (best-effort behaviour).
    """
    try:
        catalog = writer._root_object
        # get the AcroForm tree and add "/NeedAppearances attribute
        if "/AcroForm" not in catalog:
            writer._root_object.update({
                NameObject("/AcroForm"): IndirectObject(len(writer._objects), 0, writer)})
        need_appearances = NameObject("/NeedAppearances")
        writer._root_object["/AcroForm"][need_appearances] = BooleanObject(True)
    except Exception as e:
        # Deliberately non-fatal: the caller can still write the PDF.
        print('set_need_appearances_writer() catch : ', repr(e))
    return writer
def AYX_PDF_form_fill(i, template, outfile):
field_dictionary = df.to_dict('records')[i]
inputStream = open(template, "rb")
pdf_reader = PdfFileReader(inputStream, strict=False)
pdf_writer = PdfFileWriter()
pdf_writer.updatePageFormFieldValues(pdf_writer.getPage(0), field_dictionary)
#set_annotation_flag_writer(pdf_writer, field_dictionary)
outputStream = open(outfile, "wb")
#pdf_writer.encrypt(userPWD, ownerPWD, use_128bit=True)
except Exception as e:
print('AYX_PDF_form_fill() catch : ', repr(e))
Then I call the function to fill forms:
# Fill one copy of the template per DataFrame row; row i is written to
# output/pdf/<i>.pdf.
template = 'input/template.pdf'
for i in range(len(df)):
    outfile = os.path.join('output/pdf', "%i.pdf" % i)
    AYX_PDF_form_fill(i, template, outfile)
next I define a function to merge pdfs:
from pdfrw import PdfReader, PdfWriter, PdfName
def merge_pdf_files_pdfrw(pdf_files, output_filename):
    """Merge several PDFs into one file, combining their AcroForm nodes.

    The pages of every input are appended in order. Each top-level form
    field is renamed to ``FILE_<n>_FIELD_<m>_<old name>`` (``n`` = index of
    the input file, ``m`` = index of the field in that file). The first
    AcroForm found becomes the output AcroForm; later ones contribute any
    missing keys, missing ``/DR`` fonts and their fields.

    Args:
        pdf_files: Iterable of inputs accepted by ``pdfrw.PdfReader``.
        output_filename: Destination handed to ``PdfWriter.write``.
    """
    acroform_key = PdfName('AcroForm')
    fields_key = PdfName('Fields')
    dr_key = PdfName('DR')
    font_key_name = PdfName('Font')
    title_key = PdfName('T')

    output = PdfWriter()
    output_acroform = None
    for num, pdf in enumerate(pdf_files):
        reader = PdfReader(pdf, verbose=False)
        output.addpages(reader.pages)

        root = reader[PdfName('Root')]
        if acroform_key not in root.keys():  # Not all PDFs have an AcroForm node
            continue
        source_acroform = root[acroform_key]

        if fields_key in source_acroform:
            output_formfields = source_acroform[fields_key]
        else:
            output_formfields = []

        for num2, form_field in enumerate(output_formfields):
            current_name = form_field[title_key]
            if current_name is None:
                # Nothing to rename on a field without a /T entry.
                continue
            # Field names are in the "(name)" format
            old_name = current_name.replace('(', '').replace(')', '')
            form_field[title_key] = 'FILE_{n}_FIELD_{m}_{on}'.format(
                n=num, m=num2, on=old_name)

        if output_acroform is None:
            # copy the first AcroForm node; it already holds its own fields
            output_acroform = source_acroform
            if fields_key not in output_acroform:
                output_acroform[fields_key] = output_formfields
            continue

        # Add new AcroForm keys if output_acroform already exists
        for key in source_acroform.keys():
            if key not in output_acroform:
                output_acroform[key] = source_acroform[key]

        # Add missing font entries in /DR node of source file
        if (dr_key in source_acroform.keys()
                and font_key_name in source_acroform[dr_key].keys()):
            source_fonts = source_acroform[dr_key][font_key_name]
            if font_key_name not in output_acroform[dr_key].keys():
                # output_acroform has a /DR but no /Font node at all: add it
                output_acroform[dr_key][font_key_name] = source_fonts
            else:
                # else add new fonts only
                output_fonts = output_acroform[dr_key][font_key_name]
                for font_key in source_fonts.keys():
                    if font_key not in output_fonts:
                        output_fonts[font_key] = source_fonts[font_key]

        if fields_key not in output_acroform:
            output_acroform[fields_key] = output_formfields
        else:
            # Add new fields
            output_acroform[fields_key] += output_formfields

    if output_acroform is not None:
        output.trailer[PdfName('Root')][acroform_key] = output_acroform
    output.write(output_filename)
then I call the function.
# the list contains 45 files
the merged output file has 45 pages duplicating the information from the first file.
I have written the code below, which automatically fills in one of my PDF templates with data from an Excel sheet and saves a copy of it in one of my folders. The full code is attached below. I have removed the folder paths and replaced the dictionary with example text.
And yes, I am a beginner:)
import os
import pandas as pd
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.generic import (
def set_need_appearances_writer(writer: PdfFileWriter):
    """Set ``/NeedAppearances`` to true in the writer's ``/AcroForm`` dictionary.

    If the writer's root object has no ``/AcroForm`` entry yet, one is
    registered first as an indirect object.

    Args:
        writer: The ``PdfFileWriter`` to modify in place.

    Returns:
        The same ``writer`` object, whether or not the update succeeded.
        Any exception is printed and swallowed (best-effort behaviour).
    """
    try:
        catalog = writer._root_object
        if "/AcroForm" not in catalog:
            writer._root_object.update(
                {
                    NameObject("/AcroForm"): IndirectObject(
                        len(writer._objects), 0, writer
                    )
                }
            )
        need_appearances = NameObject("/NeedAppearances")
        writer._root_object["/AcroForm"][need_appearances] = BooleanObject(
            True
        )
        return writer
    except Exception as e:
        # Deliberately non-fatal: the caller can still write the PDF.
        print("set_need_appearances_writer() catch : ", repr(e))
        return writer
def ReadOnlyPDFOutput(page, fields):
    """Flag the annotations of *page* named in *fields* as read-only.

    Args:
        page: PyPDF2 page object whose ``/Annots`` entries are inspected.
        fields: Iterable of field names (for example the fill dictionary);
            an annotation is updated when its ``/T`` value equals one of them.

    Returns:
        None. Matching annotation dictionaries are modified in place.
    """
    if "/Annots" not in page:
        # A page without annotations has nothing to lock.
        return
    for annot_ref in page["/Annots"]:
        writer_annot = annot_ref.getObject()
        if not any(writer_annot.get("/T") == field for field in fields):
            continue
        # /Ff is an integer bit field whose lowest bit is ReadOnly. OR the bit
        # in so flags already on the field survive, and store it as a
        # NumberObject (the original also set a NameObject under the same key,
        # which overrode the integer value).
        current_flags = writer_annot["/Ff"] if "/Ff" in writer_annot else 0
        writer_annot.update(
            {NameObject("/Ff"): NumberObject(int(current_flags) | 1)}
        )
if __name__ == "__main__":
xl_filename = "Example_excel_list"
pdf_filename = "Example_template"
xlin = os.path.normpath(
os.path.join(os.getcwd(), r"C:\.....", xl_filename)
pdfin = os.path.normpath(
os.path.join(os.getcwd(), r"C:\.....", pdf_filename)
pdfout = os.path.normpath(os.path.join(os.getcwd(), r"C:\....."))
data = pd.read_excel(xlin)
pdf = PdfFileReader(open(pdfin, "rb"), strict=False)
if "/AcroForm" in pdf.trailer["/Root"]:
{NameObject("/NeedAppearances"): BooleanObject(True)}
pdf_fields = [
str(x) for x in pdf.getFields().keys()
] # List of all pdf field names
excel_fields = data.columns.tolist()
i = 0 # Filename numerical prefix
for j, rows in data.iterrows():
i += 1
pdf2 = PdfFileWriter()
if "/AcroForm" in pdf2._root_object:
{NameObject("/NeedAppearances"): BooleanObject(True)}
# Below you must define the field names as keys in this dictionary
# Field names found by running and printing line 15
# Key = pdf_field_name : Value = csv_field_value
field_dictionary_1 = {
"Example1": str(rows["Example1"]),
"Example2": rows["Example2"],
"Example3": rows["Example3"],
"Example4": rows["Example4"],
"Example5": rows["Example5"],
"Example6": rows["Example6"],
"Checkbox1": rows["Checkbox1"],
"Checkbox2": rows["Checkbox2"],
temp_out_dir = os.path.normpath(
os.path.join(pdfout, str(i) + "out.pdf")
pdf2.addPage(pdf.getPage(0)) # Makes a copy of pdf template page
pdf2.getPage(0), field_dictionary_1
) # Updates fields
# Makes the pdf output file READ-ONLY
ReadOnlyPDFOutput(pdf2.getPage(0), field_dictionary_1)
outputStream = open(temp_out_dir, "wb")
pdf2.write(outputStream) # Saves copy of enhanced template
print(f"Process Complete: {i} PDFs Processed!")
What I want to improve is the ability to actually "check" the checkboxes, instead of imitating a check with text boxes containing the letter "x" as I have done in this code. For example, if I write "Yes" in a specific Excel cell, the corresponding checkbox should be checked. If the cell is empty, the checkbox should stay empty.
I probably have to make a dictionary for the checkboxes itself, but I am not sure how I should proceed with the rest.
I want to use textract (via aws cli) to extract tables from a pdf file (located in an s3 location) and export it into a csv file. I have tried writing a .py script but am struggling to read from the file.
Any suggestions for writing the .py script is welcome.
This is my current script. I run into the error:
File "extract-table.py", line 63, in get_table_csv_results
bash: File: command not found
KeyError: 'Blocks'
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
def get_rows_columns_map(table_result, blocks_map):
    """Group the text of a table's CELL blocks by row and column index.

    Args:
        table_result: A TABLE block dictionary; its CHILD relationships list
            the ids of the blocks that belong to the table.
        blocks_map: Mapping of block id to block dictionary.

    Returns:
        A dict ``{row_index: {col_index: text}}`` where ``text`` is whatever
        ``get_text`` returns for the cell. Empty when the table has no
        relationships.
    """
    rows = {}
    # A table without any children carries no 'Relationships' key at all.
    for relationship in table_result.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if cell['BlockType'] != 'CELL':
                continue
            # Create the row on first use, then store the cell text.
            row = rows.setdefault(cell['RowIndex'], {})
            row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows
def get_text(result, blocks_map):
    """Concatenate the text of a block's CHILD blocks.

    WORD children contribute their ``Text`` followed by a space; a
    SELECTION_ELEMENT child whose status is ``SELECTED`` contributes ``'X '``.

    Args:
        result: Block dictionary, optionally holding a ``Relationships`` list.
        blocks_map: Mapping of block id to block dictionary.

    Returns:
        The assembled string (every piece keeps its trailing space), or
        ``''`` when the block has no relationships.
    """
    pieces = []
    for relationship in result.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            word = blocks_map[child_id]
            if word['BlockType'] == 'WORD':
                pieces.append(word['Text'] + ' ')
            if word['BlockType'] == 'SELECTION_ELEMENT':
                if word['SelectionStatus'] == 'SELECTED':
                    pieces.append('X ')
    # The original never returned the accumulated text, so callers got None.
    return ''.join(pieces)
def get_table_csv_results(file_name):
with open(file_name, 'rb') as file:
img_test = file.read()
bytes_test = bytearray(img_test)
print('Image loaded', file_name)
# process using image bytes
# get the results
client = boto3.client('textract')
response = client.start_document_text_detection(
'S3Object': {
'Bucket': s3BucketName,
'Name': documentName
# Get the text blocks
blocks_map = {}
table_blocks = []
for block in blocks:
blocks_map[block['Id']] = block
if block['BlockType'] == "TABLE":
if len(table_blocks) <= 0:
return "<b> NO Table FOUND </b>"
csv = ''
for index, table in enumerate(table_blocks):
csv += generate_table_csv(table, blocks_map, index +1)
csv += '\n\n'
return csv
def _csv_cell(text):
    """Format one cell, quoting it when it contains a CSV metacharacter."""
    cell = '{}'.format(text)
    if any(ch in cell for ch in ',"\n\r'):
        cell = '"' + cell.replace('"', '""') + '"'
    return cell


def generate_table_csv(table_result, blocks_map, table_index):
    """Render one table block as CSV-style text.

    The output starts with a ``Table: Table_<index>`` heading and a blank
    line, followed by one line per row in which every cell is followed by a
    comma, and ends with three extra newlines.

    Args:
        table_result: TABLE block dictionary.
        blocks_map: Mapping of block id to block dictionary.
        table_index: Number used in the heading.

    Returns:
        The rendered text.
    """
    rows = get_rows_columns_map(table_result, blocks_map)
    table_id = 'Table_' + str(table_index)

    # Collect pieces and join once instead of growing a string in the loop.
    pieces = ['Table: {0}\n\n'.format(table_id)]
    # Sort by index so the output does not depend on block arrival order.
    for _row_index, cols in sorted(rows.items()):
        for _col_index, text in sorted(cols.items()):
            pieces.append(_csv_cell(text) + ",")
        pieces.append('\n')
    pieces.append('\n\n\n')
    return ''.join(pieces)
def main(file_name):
    """Extract the tables of *file_name* and save them to ``output.csv``.

    Args:
        file_name: Path handed to ``get_table_csv_results``.
    """
    table_csv = get_table_csv_results(file_name)
    output_file = 'output.csv'
    # replace content
    with open(output_file, "wt") as fout:
        fout.write(table_csv)
    # show the results
    print('CSV OUTPUT FILE: ', output_file)
# Document
s3BucketName = "chrisyou.sagemi.com"
documentName = "DETAIL.pdf"
if __name__ == "__main__":
file_name = sys.argv[1]
There is a much simpler way using the Amazon Textractor Textractor library. pip install amazon-textract-textractor
This will create a csv per table in your pdf document. e.g output_p0_t0.csv
from textractor import Textractor
def extract_tables(s3_file_path, output_directory, s3_output_path):
    """Run Textract table analysis on a document and save each table as CSV.

    One file named ``output_p<page>_t<table>.csv`` is written to
    *output_directory* for every table of every page.

    Args:
        s3_file_path: ``s3://`` URI of the document to analyse.
        output_directory: Existing local directory for the CSV files.
        s3_output_path: ``s3://`` location passed through to Textract for
            its output.

    Returns:
        The document object returned by ``start_document_analysis``.
    """
    # Only the Textractor class is imported at file level, so the bare module
    # name `textractor` is unbound; import the feature enum explicitly.
    from textractor.data.constants import TextractFeatures

    extractor = Textractor(profile_name="default")
    document = extractor.start_document_analysis(
        s3_file_path, TextractFeatures.TABLES, s3_output_path)
    for j, page in enumerate(document.pages):
        # Use the tables of this page; iterating document.tables here would
        # write every table of the document once per page.
        for i, table in enumerate(page.tables):
            with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
                csv_file.write(table.to_csv())
    return document
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
I had to make a slight change to Thomas's answer: I create the extractor with `extractor = Textractor(profile_name="default")` right after importing Textractor, as shown below, to avoid getting this error -> NameError: name 'textractor' is not defined.
from textractor import Textractor
extractor = Textractor(profile_name="default")
def extract_tables(s3_file_path, output_directory, s3_output_path):
    """Run Textract table analysis on a document and save each table as CSV.

    Uses the module-level ``extractor``. One file named
    ``output_p<page>_t<table>.csv`` is written to *output_directory* for
    every table of every page.

    Args:
        s3_file_path: ``s3://`` URI of the document to analyse.
        output_directory: Existing local directory for the CSV files.
        s3_output_path: ``s3://`` location passed through to Textract for
            its output.

    Returns:
        The document object returned by ``start_document_analysis``.
    """
    # Only the Textractor class is imported at file level, so the bare module
    # name `textractor` is unbound; import the feature enum explicitly.
    from textractor.data.constants import TextractFeatures

    document = extractor.start_document_analysis(
        s3_file_path, TextractFeatures.TABLES, s3_output_path)
    for j, page in enumerate(document.pages):
        # Use the tables of this page; iterating document.tables here would
        # write every table of the document once per page.
        for i, table in enumerate(page.tables):
            with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
                csv_file.write(table.to_csv())
    return document
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
Hope it helps someone out there.
I wrote code that works fine for a single file, but I have to change the file names by hand for each file. It reads a pickle file, writes it to a text file, processes the contents of the text file to produce a list of numbers, and finally stores the list in a DataFrame and writes that DataFrame to a CSV file.
def get_value_of_list(bit_list):
    """Return how many elements of *bit_list* are equal to ``1``.

    Args:
        bit_list: Iterable of values; only elements comparing equal to the
            integer 1 are counted (so the string ``'1'`` is not).

    Returns:
        The count as an int; 0 for an empty iterable.
    """
    return sum(1 for bit in bit_list if bit == 1)
def cross_entropy(p, q):
    """Return the cross-entropy ``-sum(p[i] * log2(q[i]))`` in bits.

    Terms with ``p[i] == 0`` contribute nothing (the usual ``0 * log 0 = 0``
    convention), so zero probabilities no longer raise a math domain error.
    If some ``q[i]`` is 0 while ``p[i]`` is not, the result is ``inf``.

    Args:
        p: Sequence of probabilities of the reference distribution.
        q: Sequence of probabilities of the compared distribution; it must
            have at least ``len(p)`` elements.

    Returns:
        The cross-entropy as a float (``0.0`` for empty input).

    Raises:
        IndexError: If *q* is shorter than *p*.
    """
    total = 0.0
    for i, p_i in enumerate(p):
        q_i = q[i]  # looked up first so a short q still raises IndexError
        if p_i == 0:
            continue
        if q_i == 0:
            return float('inf')
        total -= p_i * log2(q_i)
    return total
if __name__ == "__main__":
file_name = 'pickleData_AIMchat2.txt'
pickle_file = 'AIMchat2.pickle'
pk = PickleToFile(file_name, pickle_file)
h = HexToBinary(file_name)
hex_list = h.read_file()
num_of_bits = 8
scale = 16
bin_data = []
for i in hex_list:
bin_data.append(bin(int(i, scale))[2:].zfill(num_of_bits))
my_bit_list = []
for byte in bin_data:
bit_list = []
for bit in byte:
num_of_one_divided_by_eight = get_value_of_list(bit_list) / 8
cross_entropy_list = []
i = 0
while i < len(my_bit_list):
cross = cross_entropy([my_bit_list[i]], [my_bit_list[i + 1]])
i = i + 2
df = pd.DataFrame(cross_entropy_list)
df.to_csv(r'AIMchat2.csv', index=False, index_label=False, chunksize=1000000, header=False)
I have changed create_pickle_file() to the code below to read files in the directory:
class PickleToFile:
    """Copy the lines of a pickle file into a text file."""

    def __init__(self, name, pickle_file):
        # Path of the text file written by create_pickle_file().
        self.name = name
        # Path of the input file read by create_pickle_file().
        self.pickle_file = pickle_file

    def create_pickle_file(self):
        """Write every line of ``self.pickle_file`` to ``self.name``.

        The copy runs once for each regular file found in the ``pickle/``
        directory (relative to the current working directory). Each line is
        written as ``'%s\\n' % line`` where ``line`` is a ``bytes`` object.

        Raises:
            FileNotFoundError: If ``pickle/`` or ``self.pickle_file`` is
                missing.
        """
        # Path.iterdir() takes no argument; the directory goes on the Path.
        for item in Path('pickle/').iterdir():
            if not item.is_file():
                continue
            # NOTE(review): each iteration still reads self.pickle_file and
            # rewrites self.name instead of using `item` — presumably the
            # directory entry was meant to be the input; confirm the intent.
            # NOTE(review): the file is read as raw lines, not unpickled.
            with open(self.pickle_file, "rb") as checkThePickle, \
                    open(self.name, 'w') as filehandler:
                for listItem in checkThePickle:
                    filehandler.write('%s\n' % listItem)
But since, after reading each file, it writes the contents to a text file and then to a CSV file, I don't know how to do that for every file in the directory. I'd appreciate any suggestions.
If you are looking to get a list of files in directory and process them, this should get you what you want:
How do I list all files of a directory?
Once you have this list of files, do a loop:
for each in list_of_files:
Then, you are on your way, where 'process_function' is the function, and the argument is the filename.
I'm trying to automate merging several PDF files and have two requirements: a) existing bookmarks AND b) pagelabels (custom page numbering) need to be retained.
Retaining bookmarks when merging happens by default with PyPDF2 and pdftk, but not with pdfrw.
Pagelabels are consistently not retained in PyPDF2, pdftk or pdfrw.
I am guessing, after having searched a lot, that there is no straightforward approach to doing what I want. If I'm wrong then I hope someone can point to this easy solution. But, if there is no easy solution, any tips on how to get this going in python will be much appreciated!
Some example code:
1) With PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileMerger, PdfFileReader
tmp1 = PdfFileReader('file1.pdf', 'rb')
tmp2 = PdfFileReader('file2.pdf', 'rb')
#extracting pagelabels is easy
pl1 = tmp1.trailer['/Root']['/PageLabels']
pl2 = tmp2.trailer['/Root']['/PageLabels']
#but PdfFileWriter or PdfFileMerger does not support writing from what I understand
So I dont know how to proceed from here
2) With pdfrw (has more promise)
from pdfrw import PdfReader, PdfWriter
writer = PdfWriter()
#read 1st file
tmp1 = PdfReader('file1')
#add the pages
#copy bookmarks to writer
writer.trailer.Root.Outlines = tmp1.Root.Outlines
#copy pagelabels to writer
writer.trailer.Root.PageLabels = tmp1.Root.PageLabels
#read second file
tmp2 = PdfReader('file2')
#append pages
# so far so good
The page numbers of the bookmarks from the 2nd file need to be offset before adding them, but when reading the outlines I almost always get (IndirectObject, XXX) instead of page numbers. It's unclear how to get the page numbers for each label and bookmark using pdfrw. So, I'm stuck again.
As mentioned in my comment, I'm posting a generic solution for merging several PDFs that works in PyPDF2. I don't know what makes this work in PyPDF2, other than initializing pls as ArrayObject().
from PyPDF2 import PdfFileWriter, PdfFileMerger, PdfFileReader
import PyPDF2.pdf as PDF
# pls holds all the pagelabels as we iterate through multiple pdfs
pls = PDF.ArrayObject()
# used to offset bookmarks
pageCount = 0
cpdf = PdfFileMerger()
# pdffiles is a list of all files to be merged
for i in range(len(pdffiles)):
tmppdf = PdfFileReader(pdffiles[i], 'rb')
# copy all the pagelabels which I assume is present in all files
# you could use 'try' in case no pagelabels are present
plstmp = tmppdf.trailer['/Root']['/PageLabels']['/Nums']
# sometimes keys are indirect objects
# so, iterate through each pagelabel and...
for j in range(len(plstmp)):
# ... get the actual values
plstmp[j] = plstmp[j].getObject()
# offset pagenumbers by current count of pages
if isinstance(plstmp[j], int):
plstmp[j] = PDF.NumberObject(plstmp[j] + pageCount)
# once all the pagelabels are processed I append to pls
pls += plstmp
#increment pageCount
pageCount += tmppdf.getNumPages()
# rest follows KevinM's answer
pagenums = PDF.DictionaryObject()
pagenums.update({PDF.NameObject('/Nums') : pls})
pagelabels = PDF.DictionaryObject()
pagelabels.update({PDF.NameObject('/PageLabels') : pagenums})
You need to iterate through the existing PageLabels and add them to the merged output, taking care to add an offset to the page index entry, based on the number of pages already added.
This solution also requires PyPDF4, since PyPDF2 produces a weird error (see bottom).
from PyPDF4 import PdfFileWriter, PdfFileMerger, PdfFileReader
# To manipulate the PDF dictionary
import PyPDF4.pdf as PDF
import logging
def add_nums(num_entry, page_offset, nums_array):
    """Append the entries of a ``/PageLabels`` number tree to *nums_array*.

    ``num_entry['/Nums']`` alternates page indices and label dictionaries.
    Page indices are shifted by *page_offset*; label dictionaries are copied
    (``/S`` and, when present, ``/St``).

    Args:
        num_entry: ``/PageLabels`` dictionary of one input document.
        page_offset: Number of pages already placed before this document.
        nums_array: ``ArrayObject`` collecting the merged entries; modified
            in place.

    Returns:
        The same *nums_array*.
    """
    for num in num_entry['/Nums']:
        # Entries are sometimes indirect references; resolve them first.
        resolve = getattr(num, 'getObject', None)
        if resolve is not None:
            num = resolve()

        if isinstance(num, int):
            logging.debug("Found page number %s, offset %s: ", num, page_offset)
            # Add the physical page information
            nums_array.append(PDF.NumberObject(num + page_offset))
            continue

        # {'/S': '/r'}, or {'/S': '/D', '/St': 489}
        keys = num.keys()
        logging.debug("Found page label, keys: %s", keys)
        number_type = PDF.DictionaryObject()
        if '/S' in keys:
            # Copy the /S entry; skipping a missing one keeps the array's
            # index/label pairing intact instead of failing halfway through.
            s_entry = num['/S']
            number_type.update({PDF.NameObject("/S"): PDF.NameObject(s_entry)})
            logging.debug("Adding /S entry: %s", s_entry)
        if '/St' in keys:
            # If there is an /St entry, fetch it and carry it over
            pdf_label_offset = num['/St']
            logging.debug("Found /St %s", pdf_label_offset)
            number_type.update({PDF.NameObject("/St"): PDF.NumberObject(pdf_label_offset)})
        # Add the label information
        nums_array.append(number_type)
    return nums_array
def write_merged(pdf_readers):
# Output
merger = PdfFileMerger()
# For PageLabels information
page_labels = []
page_offset = 0
nums_array = PDF.ArrayObject()
# Iterate through all the inputs
for pdf_reader in pdf_readers:
# Merge the content
# Handle the PageLabels
# Fetch page information
old_page_labels = pdf_reader.trailer['/Root']['/PageLabels']
page_count = pdf_reader.getNumPages()
# Add PageLabel information
add_nums(old_page_labels, page_offset, nums_array)
page_offset = page_offset + page_count
except Exception as err:
print("ERROR: %s" % err)
# Add PageLabels
page_numbers = PDF.DictionaryObject()
page_numbers.update({PDF.NameObject("/Nums"): nums_array})
page_labels = PDF.DictionaryObject()
page_labels.update({PDF.NameObject("/PageLabels"): page_numbers})
root_obj = merger.output._root_object
# Write output
pdf_readers = []
tmp1 = PdfFileReader('file1.pdf', 'rb')
tmp2 = PdfFileReader('file2.pdf', 'rb')
Note: PyPDF2 produces this weird error:
File "/usr/lib/python3/dist-packages/PyPDF2/pdf.py", line 552, in _sweepIndirectReferences
data[key] = value
File "/usr/lib/python3/dist-packages/PyPDF2/generic.py", line 507, in __setitem__
raise ValueError("key must be PdfObject")
ValueError: key must be PdfObject
I am parsing through many xml files and putting certain information into a csv file. Because my xml files are named: "1.xml", "2.xml", etc... I am using a for loop to cycle through my different Xml file titles. However, based on the range that I use on my for loop, my csv file contains different data. For example, when my for loop range is 1:200 my csv file includes info from my xml files 1 to 199. However, when I change my range to 1:300, my csv file only contains info for my xml files 217 to 249. The info actually stored on my csv file changes based on what I put in as my range for my for loop. Has anyone else had this error and do you have any solutions?
My code is below:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path
# open a file for writing
data_labels = open('DataLabels.csv', 'w', newline='')
missing_files = open('MissingFiles.csv', 'w', newline = '')
# create the csv writer object
csvwriter = csv.writer(data_labels)
csvwriter2 = csv.writer(missing_files)
data_head = []
data = []
missingfiles = 0
missfiles = []
MediaId = "Media Id"
#data_head.append (MediaId)
Family = "Family"
#data_head.append (Family)
Species = "Species"
#data_head.append (Species)
Genus = "Genus"
Content = "Content"
ClassId = "ClassId"
#data_head.append (Genus)
# Family = member.find('Family').tag
# Species = member.find('Species').tag
# Genus = member.find('Genus').tag
for i in range (1, 190):
#print (i)
data = []
inputfilename = str(i)+ ".xml"
my_file = Path(inputfilename)
if my_file.is_file():
data_labels = open('DataLabels.csv', 'w', newline='')
tree = ET.parse(inputfilename)
root = tree.getroot()
MediaId = root [2].text
Content = root[4].text
ClassId = root[5].text
Family = root[6].text
Species = root[7].text
Genus = root[8].text
#print (vote)
#count = 0
#for Image in root.find('MediaId'):
#print (child.tag, child.attrib)
#name = child.find('MediaId').text
# print (Image.find ('MediaId').text)
##csvwriter.writerow (data_head)
#data = []
#if count == 0:
# print ("count is zero i'm in loop")
# MediaId = member.find('MediaId').tag
# count = count + 1
#MediaId = root.findall('MediaId').text
data.append (Content)
data.append (ClassId)
#Family = member.find('Family').text
#Species = member.find('Species').text
#Genus = member.find('Genus').text
#print (data)
missingfiles = missingfiles +1
missfiles = []
print ("missing", missingfiles, "files")
print ("done")
Open the CSV file in append mode; otherwise you are just overwriting the same file on every iteration.
I think you need to divide your script in small readable functions.
First, you can create a function to parse a XML file:
import xml.etree.ElementTree as ET
def parse_xml_file(xml_path):
""" Parse an XML file and return the data. """
# type: (str) -> list
tree = ET.parse(xml_path)
root = tree.getroot()
return [
This function parses an XML file and returns one record containing a list of values.
Then, you can create a function to iterate over a list of (existing) XML files and populate the CSV file:
import csv
import io
import os
def populate_data_labels(xml_path_list, work_dir="."):
header = ["Media Id", "Family", "Species", "Genus", "Content", "ClassId"]
with io.open(os.path.join(work_dir, 'DataLabels.csv'), 'w') as fd:
writer = csv.writer(fd)
for xml_path in xml_path_list:
This function uses parse_xml_file() to extract each record.
You can create a function to log the missing files. You can use the CSV format (or a simple text file):
def populate_missing_files(missing_files, work_dir="."):
header = ["Filename"]
with io.open(os.path.join(work_dir, 'MissingFiles.csv'), 'w') as fd:
writer = csv.writer(fd)
for xml_path in missing_files:
Finally, you can write a function which searches for the XML files and calls the previous functions:
def parse_work_dir(work_dir=".", start=1, stop=190):
    """Export the numbered XML files of *work_dir* and log the missing ones.

    Candidate files are ``<idx>.xml`` for ``idx`` in ``range(start, stop)``.
    Existing paths go to ``populate_data_labels``; the others go to
    ``populate_missing_files``.

    Args:
        work_dir: Directory holding the XML files; also passed on as the
            output directory.
        start: First file index (inclusive).
        stop: Last file index (exclusive).
    """
    existing_files = []
    missing_files = []
    # Probe each path once so both lists come from the same snapshot.
    for idx in range(start, stop):
        path = os.path.join(work_dir, "{0}.xml".format(idx))
        if os.path.exists(path):
            existing_files.append(path)
        else:
            missing_files.append(path)
    populate_data_labels(existing_files, work_dir)
    populate_missing_files(missing_files, work_dir)