Writing a CSV file into a PDF document

Writing a CSV file into a PDF document - python

I have a python script that parses data from an SQLite DB file and writes it to a .csv file. I would like to write this data to a forensic style report in PDF format. I have already been able to create a template pdf with a heading, date, case number, and short paragraph on details. I was wondering how should I write the .csv file data into a table in the PDF. As shown i have tried iterating through the .csv file after reading it with csv.reader. I can write the initial headings into the file but it will not pull the data from the .csv file and write it. Can anyone point me in the right direction.
# Script to generate a PDF report after data has been parsed into .csv file
# import statements
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import portrait
import csv
# PDF document layout
canvas = canvas.Canvas("H:\College Fourth Year\Development Project\Final Year Project 2018\Forensic Reports\SMS Report.pdf", pagesize=letter)
canvas.setLineWidth(.3)
canvas.setFont('Helvetica', 12)
canvas.drawString(30,750,'LYIT MOBILE FORENSICS DIVISION')
canvas.drawString(500,750,"Date: 12/02/2018")
canvas.line(500,747,595,747)
canvas.drawString(500,725,'Case Number:')
canvas.drawString(580,725,"10")
canvas.line(500,723,595,723)
# Introduction text
line1 = 'This forensic report on SMS data has been compiled by the forensic'
line2 = 'examiner in conclusion to the investigation into the RTA'
line3 = 'case which occured on 23/01/2018'
textObject = canvas.beginText(30, 700)
lines = [line1, line2, line3]
for line in lines:
textObject.textLine(line)
canvas.drawText(textObject)
# File that must be written to report
data_file = 'H:\College Fourth Year\Development Project\Final Year Project 2018\ExtractedEvidence\smsInfo.csv'
c = canvas
# Function for importing data
def import_Data(data_file):
smsInfo = csv.reader(open(data_file, "r"))
for row in smsInfo:
ID = row[0]
Incoming_Number = row[1]
Date_And_Time = row[2]
Read = row[3]
Sent_Replied = row[4]
Body = row[5]
Seen = [6]
pdf_filename = 'SMS Data Report.pdf'
generate_report(ID, Incoming_Number, Date_And_Time, Read, Sent_Replied, Body, Seen)
def generate_report(ID, Date_And_Time, Read, Sent_Replied, Body, Seen, pdf_filename):
#c = canvas.Canvas(pdf_filename, pagesize=portrait(letter))
import_Data(data_file)
canvas.save()
print("Forensic Report Generated!")

Maybe you could try something different. Recently I had to make a similar PDF files using data from csv file. I hope this help you:
# import statements
import requests
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4, inch, landscape, legal, letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Image, Spacer, PageBreak, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet
import csv
import os
# Get de work directory
cwd = os.getcwd()
# Introduction text
line1 = 'This forensic report on SMS data has been compiled by the forensic'
line2 = 'examiner in conclusion to the investigation into the RTA'
line3 = 'case which occured on 23/01/2018'
#PDF document layout
table_style = TableStyle([('ALIGN',(1,1),(-2,-2),'RIGHT'),
('TEXTCOLOR',(1,1),(-2,-2),colors.red),
('VALIGN',(0,0),(0,-1),'TOP'),
('TEXTCOLOR',(0,0),(0,-1),colors.blue),
('ALIGN',(0,-1),(-1,-1),'CENTER'),
('VALIGN',(0,-1),(-1,-1),'MIDDLE'),
('TEXTCOLOR',(0,-1),(-1,-1),colors.green),
('INNERGRID', (0,0), (-1,-1), 0.25, colors.black),
('BOX', (0,0), (-1,-1), 0.25, colors.black),
])
styles = getSampleStyleSheet()
styleNormal = styles['Normal']
styleHeading = styles['Heading1']
styleHeading2 = styles['Heading2']
styleHeading.alignment = 1 # centre text (TA_CENTRE)
#Configure style and word wrap
s = getSampleStyleSheet()
s = s["BodyText"]
s.wordWrap = 'CJK'
# File that must be written to report
with open ('smsInfo.csv', 'rb') as csvfile:
reader = csv.reader(csvfile)
lista = list(reader)
headers = lista[0]
conteo = 1
for numRecord in range(1,len(lista)):
record1 = lista[numRecord]
data = list()
emptyRecords = list()
records = list()
header = list()
countRecords = 0
for line in record1:
if line == '':
emptyRecords.append(line)
else:
records.append(line)
header.append(headers[countRecords])
data.append([str(headers[countRecords]), str(line)])
countRecords = countRecords + 1
data2 = [[Paragraph(cell, s) for cell in row] for row in data]
t = Table(data2)
t.setStyle(table_style)
elements = []
# Name of file
fileName = cwd + '\\' + 'SMS Data Report' + '-' + str(conteo) + '.pdf'
conteo = conteo + 1
archivo_pdf = SimpleDocTemplate(fileName, pagesize = letter, rightMargin = 40, leftMargin = 40, topMargin = 40, bottomMargin = 28)
#Send the data and build the file
elements.append(Paragraph(line1, styleNormal))
elements.append(Paragraph(line2, styleNormal))
elements.append(Paragraph(line3, styleNormal))
elements.append(Spacer(inch, .25*inch))
elements.append(t)
archivo_pdf.build(elements)
print 'Forensic Report Generated:', fileName
This code generate files like this one:
PDF report

Related

Extract Text into Column using Regex

I want to extract data (S no, Item Code, Price and Size) from the attached PDF Document in to columns.
The re.compile works for the S no, Item Code and Price, but as soon as I put the Size - it gives a limited output. I am unable to figure out why? Can you please help
(Attached picture of the PDF page)
Import pandas as pd
Import re
Import PyPDF2
file = open("Petchem.pdf", "rb")
pdfReader = PyPDF2.PdfFileReader(file)
my_dict = {"S no":[], "Item Code":[], "Price":[], "Size":[]}
for page in range (1,25):
pageObj = pdfReader.getPage(page)
data = pageObj.extractText()
size = re.compile(r'((\d{2,4}?)(\d{10})EA\s(\d?\d?,?\d?\d?\d.\d\d)[\s\w\d,:/.()-])')
for number in size.findall(data):
S_No = my_dict["S No"].append(number[1])
Item_Code = my_dict["Item Code"].append(number[2])
Price = my_dict["Price"].append(number[3])
Size = my_dict["Size"].append(number[4])
print(number[1])
a_file = open("Column_Breakup.csv", "w")
datadf = pd.DataFrame(my_dict)
datadf.to_csv("Column_Breakup.csv")
a_file.close()

Using Textract, how do you extract tables from a pdf file and output it into a csv file via .py script?

I want to use textract (via aws cli) to extract tables from a pdf file (located in an s3 location) and export it into a csv file. I have tried writing a .py script but am struggling to read from the file.
Any suggestions for writing the .py script is welcome.
This is my current script. I run into the error:
File "extract-table.py", line 63, in get_table_csv_results
bash: File: command not found
blocks=response['Blocks']
KeyError: 'Blocks'
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
def get_rows_columns_map(table_result, blocks_map):
rows = {}
for relationship in table_result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
cell = blocks_map[child_id]
if cell['BlockType'] == 'CELL':
row_index = cell['RowIndex']
col_index = cell['ColumnIndex']
if row_index not in rows:
# create new row
rows[row_index] = {}
# get the text value
rows[row_index][col_index] = get_text(cell, blocks_map)
return rows
def get_text(result, blocks_map):
text = ''
if 'Relationships' in result:
for relationship in result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
word = blocks_map[child_id]
if word['BlockType'] == 'WORD':
text += word['Text'] + ' '
if word['BlockType'] == 'SELECTION_ELEMENT':
if word['SelectionStatus'] =='SELECTED':
text += 'X '
def get_table_csv_results(file_name):
with open(file_name, 'rb') as file:
img_test = file.read()
bytes_test = bytearray(img_test)
print('Image loaded', file_name)
# process using image bytes
# get the results
client = boto3.client('textract')
#Response
response = client.start_document_text_detection(
DocumentLocation={
'S3Object': {
'Bucket': s3BucketName,
'Name': documentName
}
})
# Get the text blocks
blocks=response['Blocks']
pprint(blocks)
blocks_map = {}
table_blocks = []
for block in blocks:
blocks_map[block['Id']] = block
if block['BlockType'] == "TABLE":
table_blocks.append(block)
if len(table_blocks) <= 0:
return "<b> NO Table FOUND </b>"
csv = ''
for index, table in enumerate(table_blocks):
csv += generate_table_csv(table, blocks_map, index +1)
csv += '\n\n'
return csv
def generate_table_csv(table_result, blocks_map, table_index):
rows = get_rows_columns_map(table_result, blocks_map)
table_id = 'Table_' + str(table_index)
# get cells.
csv = 'Table: {0}\n\n'.format(table_id)
for row_index, cols in rows.items():
for col_index, text in cols.items():
csv += '{}'.format(text) + ","
csv += '\n'
csv += '\n\n\n'
return csv
def main(file_name):
table_csv = get_table_csv_results(file_name)
output_file = 'output.csv'
# replace content
with open(output_file, "wt") as fout:
fout.write(table_csv)
# show the results
print('CSV OUTPUT FILE: ', output_file)
# Document
s3BucketName = "chrisyou.sagemi.com"
documentName = "DETAIL.pdf"
if __name__ == "__main__":
file_name = sys.argv[1]
main(file_name)

There is a much simpler way using the Amazon Textractor Textractor library. pip install amazon-textract-textractor
This will create a csv per table in your pdf document. e.g output_p0_t0.csv
from textractor import Textractor
def extract_tables(s3_file_path, output_directory, s3_output_path):
extractor = Textractor(profile_name="default")
document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
for j, page in enumerate(document.pages):
for i, table in enumerate(document.tables):
with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
csv_file.write(table.to_csv())
return document
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')

I had to make slight changes to #Thomas answer by importing profile `
extractor = Textractor(profile_name="default") right after importing Textractor as shown below to avoid getting this error -> NameError: name 'textractor' is not defined.
from textractor import Textractor
extractor = Textractor(profile_name="default")
def extract_tables(s3_file_path, output_directory, s3_output_path):
document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
for j, page in enumerate(document.pages):
for i, table in enumerate(document.tables):
with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
csv_file.write(table.to_csv())
return document
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
Hope it helps someone out there.

How to save pdfs

How could I save dfa as a pdf?
So far I was able to have each of the elements saved separately in each file. How now can I have the whole file saved as a single pdf?
dfa = pd.DataFrame({'STREAM':['EAGLE','HAWK','HAWK','HAWK','EAGLE','HAWK','EAGLE'],'MAT':['A','D','F','D','C','C','E'],'KIS':['B','D','E','D','A','C','D'],'GEO':['B','C','E','E','F','A','B']})
dfa.to_csv('results.csv',index=False)
students_data = csv.reader(open("results.csv", 'r'))
for row in students_data:
STREAM = row[0]
MAT = row[1]
GEO = row[2]
KIS = row[3]
c = canvas.Canvas(MAT +".pdf")
c.drawString(60, 700, "STREAM: " + STREAM)
c.drawString(60, 600, "MAT: " + MAT)
c.drawString(60, 500, "KIS: " + KIS)
c.drawString(60, 400, "GEO: " + GEO)
c.save()

My suggestion would be to create a blank PDF as your template page and then
merge the new PDF to it
packet = io.BytesIO()
# Create the initial canvas.
c = canvas.Canvas(packet)
# your code for adding to the canvas
packet.seek(0)
new_pdf = PdfFileReader(packet)
# Import The Template
template = PdfFileReader(open('path_to_template'), "rb")
output = PdfFileWriter()
# add the created PDF as a watermark to the template
page = template.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = open(output_path, "wb")
output.write(outputStream)
outputStream.close()
you will need these imports
from PyPDF2 import PdfFileWriter, PdfFileReader
import io

Only portions of data are being written to a cvs file, rest is missing

I am parsing through many xml files and putting certain information into a csv file. Because my xml files are named: "1.xml", "2.xml", etc... I am using a for loop to cycle through my different Xml file titles. However, based on the range that I use on my for loop, my csv file contains different data. For example, when my for loop range is 1:200 my csv file includes info from my xml files 1 to 199. However, when I change my range to 1:300, my csv file only contains info for my xml files 217 to 249. The info actually stored on my csv file changes based on what I put in as my range for my for loop. Has anyone else had this error and do you have any solutions?
My code is below:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path
# open a file for writing
data_labels = open('DataLabels.csv', 'w', newline='')
missing_files = open('MissingFiles.csv', 'w', newline = '')
# create the csv writer object
csvwriter = csv.writer(data_labels)
csvwriter2 = csv.writer(missing_files)
data_head = []
data = []
missingfiles = 0
missfiles = []
MediaId = "Media Id"
#data_head.append (MediaId)
Family = "Family"
#data_head.append (Family)
Species = "Species"
#data_head.append (Species)
Genus = "Genus"
Content = "Content"
ClassId = "ClassId"
#data_head.append (Genus)
data_head.append(MediaId)
# Family = member.find('Family').tag
data_head.append(Content)
data_head.append(ClassId)
data_head.append(Family)
# Species = member.find('Species').tag
data_head.append(Species)
# Genus = member.find('Genus').tag
data_head.append(Genus)
csvwriter.writerow(data_head)
for i in range (1, 190):
#print (i)
data = []
inputfilename = str(i)+ ".xml"
my_file = Path(inputfilename)
if my_file.is_file():
data_labels = open('DataLabels.csv', 'w', newline='')
tree = ET.parse(inputfilename)
root = tree.getroot()
MediaId = root [2].text
Content = root[4].text
ClassId = root[5].text
Family = root[6].text
Species = root[7].text
Genus = root[8].text
#print (vote)
#count = 0
#for Image in root.find('MediaId'):
#print (child.tag, child.attrib)
#name = child.find('MediaId').text
# print (Image.find ('MediaId').text)
##csvwriter.writerow (data_head)
#data = []
#if count == 0:
# print ("count is zero i'm in loop")
# MediaId = member.find('MediaId').tag
# count = count + 1
#else:
#MediaId = root.findall('MediaId').text
data.append(MediaId)
data.append (Content)
data.append (ClassId)
#Family = member.find('Family').text
data.append(Family)
#Species = member.find('Species').text
data.append(Species)
#Genus = member.find('Genus').text
data.append(Genus)
csvwriter.writerow(data)
data_labels.close()
#print (data)
else:
missingfiles = missingfiles +1
missfiles = []
missfiles.append(inputfilename)
csvwriter2.writerow(missfiles)
print ("missing", missingfiles, "files")
data_labels.close()
missing_files.close()
print ("done")

Open the csv in append mode ,else you are just overwriting the same file.

I think you need to divide your script in small readable functions.
First, you can create a function to parse a XML file:
import xml.etree.ElementTree as ET
def parse_xml_file(xml_path):
""" Parse an XML file and return the data. """
# type: (str) -> list
tree = ET.parse(xml_path)
root = tree.getroot()
return [
root[2].text,
root[4].text,
root[5].text,
root[6].text,
root[7].text,
root[8].text]
This function parse a XML file and return one record containing a list of values.
Then, you can create a function to iterate a list of XML files (existing files) dans populate the CSV file:
import csv
import io
import os
def populate_data_labels(xml_path_list, work_dir="."):
header = ["Media Id", "Family", "Species", "Genus", "Content", "ClassId"]
with io.open(os.path.join(work_dir, 'DataLabels.csv'), 'w') as fd:
writer = csv.writer(fd)
writer.writerow(header)
for xml_path in xml_path_list:
writer.writerow(parse_xml_file(xml_path))
This function use parse_xml_file() to extract each record.
You can create a function to log the missing files. You can use CSV format (or a simple text file):
def populate_missing_files(missing_files, work_dir="."):
header = ["Filename"]
with io.open(os.path.join(work_dir, 'MissingFiles.csv'), 'w') as fd:
writer = csv.writer(fd)
writer.writerow(header)
for xml_path in missing_files:
writer.writerow([os.path.basename(xml_path)])
Finally, you can write a function which search the XML files and call the previous functions:
def parse_work_dir(work_dir="."):
all_files = [os.path.join(work_dir, "{0}.xml".format(idx))
for idx in range(1, 190)]
existing_files = (path for path in all_files if os.path.exists(path))
populate_data_labels(existing_files, work_dir)
missing_files = (path for path in all_files if not os.path.exists(path))
populate_missing_files(missing_files, work_dir)
Usage:
parse_work_dir("/path/to/your/working/dir")

Search and replace for text within a pdf, in Python

This question already has answers here:
How can I replace text in a PDF using Python?
(4 answers)
Closed 14 hours ago.
I am writing mailmerge software as part of a Python web app.
I have a template called letter.pdf which was generated from a MS Word file and includes the text {name} where the resident's name will go. I also have a list of c. 100 residents' names.
What I want to do is to read in letter.pdf do a search for "{name}" and replace it with the resident's name (for each resident) then write the result to another pdf. I then want to gather all these pdfs together into a big pdf (one page per letter) which my web app's users will print out to create their letters.
Are there any Python libraries that will do this? I've looked at pdfrw and pdfminer but I couldn't see where they would be able to do it.
(NB: I also have the MS Word file, so if there was another way of using that, and not going through a pdf, that would also do the job.)

This can be done with PyPDF2 package. The implementation may depend on the original PDF template structure. But if the template is stable enough and isn't changed very often the replacement code shouldn't be generic but rather simple.
I did a small sketch on how you could replace the text inside a PDF file. It replaces all occurrences of PDF tokens to DOC.
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject
def replace_text(content, replacements = dict()):
lines = content.splitlines()
result = ""
in_text = False
for line in lines:
if line == "BT":
in_text = True
elif line == "ET":
in_text = False
elif in_text:
cmd = line[-2:]
if cmd.lower() == 'tj':
replaced_line = line
for k, v in replacements.items():
replaced_line = replaced_line.replace(k, v)
result += replaced_line + "\n"
else:
result += line + "\n"
continue
result += line + "\n"
return result
def process_data(object, replacements):
data = object.getData()
decoded_data = data.decode('utf-8')
replaced_data = replace_text(decoded_data, replacements)
encoded_data = replaced_data.encode('utf-8')
if object.decodedSelf is not None:
object.decodedSelf.setData(encoded_data)
else:
object.setData(encoded_data)
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True, help="path to PDF document")
args = vars(ap.parse_args())
in_file = args["input"]
filename_base = in_file.replace(os.path.splitext(in_file)[1], "")
# Provide replacements list that you need here
replacements = { 'PDF': 'DOC'}
pdf = PdfFileReader(in_file)
writer = PdfFileWriter()
for page_number in range(0, pdf.getNumPages()):
page = pdf.getPage(page_number)
contents = page.getContents()
if isinstance(contents, DecodedStreamObject) or isinstance(contents, EncodedStreamObject):
process_data(contents, replacements)
elif len(contents) > 0:
for obj in contents:
if isinstance(obj, DecodedStreamObject) or isinstance(obj, EncodedStreamObject):
streamObj = obj.getObject()
process_data(streamObj, replacements)
writer.addPage(page)
with open(filename_base + ".result.pdf", 'wb') as out_file:
writer.write(out_file)
The results are
UPDATE 2021-03-21:
Updated the code example to handle DecodedStreamObject and EncodedStreamObject which actually contian data stream with text to update.

If #Dmytrio solution do not alter final PDF
Dymitrio's updated code example to handle DecodedStreamObject and EncodedStreamObject which actually contain data stream with text to update could run fine, but with a file different from example, was not able to alter pdf text content.
According to EDIT 3, from How to replace text in a PDF using Python?:
By inserting page[NameObject("/Contents")] = contents.decodedSelf before writer.addPage(page), we force pyPDF2 to update content of the page object.
This way I was able to overcome this problem and replace text from pdf file.
Final code should look like this:
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject, NameObject
def replace_text(content, replacements = dict()):
lines = content.splitlines()
result = ""
in_text = False
for line in lines:
if line == "BT":
in_text = True
elif line == "ET":
in_text = False
elif in_text:
cmd = line[-2:]
if cmd.lower() == 'tj':
replaced_line = line
for k, v in replacements.items():
replaced_line = replaced_line.replace(k, v)
result += replaced_line + "\n"
else:
result += line + "\n"
continue
result += line + "\n"
return result
def process_data(object, replacements):
data = object.getData()
decoded_data = data.decode('utf-8')
replaced_data = replace_text(decoded_data, replacements)
encoded_data = replaced_data.encode('utf-8')
if object.decodedSelf is not None:
object.decodedSelf.setData(encoded_data)
else:
object.setData(encoded_data)
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True, help="path to PDF document")
args = vars(ap.parse_args())
in_file = args["input"]
filename_base = in_file.replace(os.path.splitext(in_file)[1], "")
# Provide replacements list that you need here
replacements = { 'PDF': 'DOC'}
pdf = PdfFileReader(in_file)
writer = PdfFileWriter()
for page_number in range(0, pdf.getNumPages()):
page = pdf.getPage(page_number)
contents = page.getContents()
if isinstance(contents, DecodedStreamObject) or isinstance(contents, EncodedStreamObject):
process_data(contents, replacements)
elif len(contents) > 0:
for obj in contents:
if isinstance(obj, DecodedStreamObject) or isinstance(obj, EncodedStreamObject):
streamObj = obj.getObject()
process_data(streamObj, replacements)
# Force content replacement
page[NameObject("/Contents")] = contents.decodedSelf
writer.addPage(page)
with open(filename_base + ".result.pdf", 'wb') as out_file:
writer.write(out_file)
Important: from PyPDF2.generic import NameObject

Decompress the pdf to make parsing easier (solves many of the issues in the previous answer). I use pdftk. (If this step fails, one hack to pre-process the pdf is to open the pdf in OSX Preview, print it, and then choose save as pdf from the print menu. Then retry the command below.)
pdftk original.pdf output uncompressed.pdf uncompress
Parse and replace using PyPDF2.
from PyPDF2 import PdfFileReader, PdfFileWriter
replacements = [
("old string", "new string")
]
pdf = PdfFileReader(open("uncompressed.pdf", "rb"))
writer = PdfFileWriter()
for page in pdf.pages:
contents = page.getContents().getData()
for (a,b) in replacements:
contents = contents.replace(a.encode('utf-8'), b.encode('utf-8'))
page.getContents().setData(contents)
writer.addPage(page)
with open("modified.pdf", "wb") as f:
writer.write(f)
[Optional] Re-compress the pdf.
pdftk modified.pdf output recompressed.pdf compress

Here is a solution using the MS Word source file.
As trying to edit the pdf itself turned out to be too complicated for me because of the encoding errors, I went with the MS Word >> Pdf option.
Prepare MS Word template with {{input_fields}}
Fill in the template with data
Convert the filled in MS Word file to PDF
The DocxTemplate module uses jinja like syntax: {{variable_name}}
In my solution I use an intermediate temp file. I tried to get rid of this step using BytesIO/StringIO to virtualize this step only in memory, but haven't make that work yet.
Here is an easy and working solution to perform the required task:
import os
import comtypes.client
from pathlib import Path
from docxtpl import DocxTemplate
import random
# CFG
in_file_path = "files/template.docx"
temp_file_path = "files/"+str(random.randint(0,50))+".docx"
out_file_path = "files/output.pdf"
# Fill in text
data_to_fill = {'Field_name' : "John Tester",
'Field_ocupation' : "Test tester",
'Field_address' : "Test Address 123",
}
template = DocxTemplate(Path(in_file_path))
template.render(data_to_fill)
template.save(Path(temp_file_path))
# Convert to PDF
wdFormatPDF = 17
in_file = os.path.abspath(Path(temp_file_path))
out_file = os.path.abspath(Path(out_file_path))
word = comtypes.client.CreateObject('Word.Application')
doc = word.Documents.Open(in_file)
doc.SaveAs(out_file, FileFormat=wdFormatPDF)
doc.Close()
word.Quit()
# Get rid of the temp file
os.remove(Path(temp_file_path))

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Writing a CSV file into a PDF document - python

Related

Extract Text into Column using Regex

Using Textract, how do you extract tables from a pdf file and output it into a csv file via .py script?

How to save pdfs

Only portions of data are being written to a cvs file, rest is missing

Search and replace for text within a pdf, in Python

Categories

Resources