import win32com.client as win32
import os
#creating a word application object
wordApp = win32.gencache.EnsureDispatch('Word.Application') #create a word application object
wordApp.Visible = True # hide the word application
doc = wordApp.Documents.Add() # create a new application
#Formating the document
doc.PageSetup.RightMargin = 10
doc.PageSetup.LeftMargin = 10
doc.PageSetup.Orientation = win32.constants.wdOrientLandscape
# a4 paper size: 595x842
doc.PageSetup.PageWidth = 595
doc.PageSetup.PageHeight = 842
# Inserting Tables
my_dir="C:/Users/David/Documents/EGi/EGi Plots/FW_plots/Boxplots"
filenames = os.listdir(my_dir)
piccount=0
file_count = 0
for i in filenames:
if i[len(i)-3: len(i)].upper() == 'JPG': # check whether the current object is a JPG file
piccount = piccount + 1
print piccount, " images will be inserted"
total_column = 1
total_row = int(piccount/total_column)+2
rng = doc.Range(0,0)
rng.ParagraphFormat.Alignment = win32.constants.wdAlignParagraphCenter
table = doc.Tables.Add(rng,total_row, total_column)
table.Borders.Enable = False
if total_column > 1:
table.Columns.DistributeWidth()
#Collecting images in the same directory and inserting them into the document
piccount = 1
for index, filename in enumerate(filenames): # loop through all the files and folders for adding pictures
if os.path.isfile(os.path.join(os.path.abspath(my_dir), filename)): # check whether the current object is a file or not
if filename[len(filename)-3: len(filename)].upper() == 'JPG': # check whether the current object is a JPG file
piccount = piccount + 1
print filename, len(filename), filename[len(filename)-3: len(filename)].upper()
cell_column = (piccount % total_column + 1) #calculating the position of each image to be put into the correct table cell
cell_row = (piccount/total_column + 1)
#print 'cell_column=%s,cell_row=%s' % (cell_column,cell_row)
#we are formatting the style of each cell
cell_range= table.Cell(cell_row, cell_column).Range
cell_range.ParagraphFormat.LineSpacingRule = win32.constants.wdLineSpaceSingle
cell_range.ParagraphFormat.SpaceBefore = 0
cell_range.ParagraphFormat.SpaceAfter = 3
#this is where we are going to insert the images
current_pic=cell_range.InlineShapes.AddPicture(os.path.join(os.path.abspath(my_dir), filename))
#Currently this puts a lable in a cell after the pic, I want to put a proper ms word figure caption below the image instead.
table.Cell(cell_row, cell_column).Range.InsertAfter("\n"+"Appendix II Figure "+ str(piccount-1)+": "+filename[:len(filename)-4]+"\n"+"\n"+"\n")
else: continue
This code gets all the images in a chosen directory and puts them in a table in a word doc, and then puts the file name (stripped of the file extn) in the cell below. I would like a proper figure caption (so that these will update if I insert additional pictures) but everything I've tried has failed.
I just can't get the VB commands right, this:
table.Cell(cell_row, cell_column).Range.InsertAfter(InsertCaption(Label="Figure", Title=": "+filename[:len(filename)-4]))
gives me a list of figure captions at the end of the document, which isn't really what I want. I feel like I am close but I just cant quite get it. Thanks!
In order to use Word's built-in captioning instead of current_pic.InsertCaption use current_Pic.Range.InsertCaption. The InsertCaption method is a member of the Range not the InlineShape object. For me, this automatically inserts the caption below the picture, in its own paragraph. But if you want to specificy "below" use the Position argument, as well:
current_pic.Range.InsertCaption(Label="Figure", Title=": "+filename[:len(filename)-4]), Position=win32.constants.wdCaptionPositionBelow
Note: FWIW when I test the line of code (in VBA) that you say gives you a list of captions at the end of the document I do see the text in the same cell as the inserted picture.
Related
This is my first program so I imagine there are a lot of inefficiencies. First I created a GUI that works on a combined PDF. In attempting to convert the working code to a code that iterates through a directory of multiple single page PDF's, I get an error. On the "PageObj.scaleTo(1172, 1772)" line I get the error in the question title. A GUI takes the user inputs for the variables "x" (directory), "a" (paper size), and "s" (state). It is to resize the page to the selected size, merge with a template (not append but a single page "PDF sandwich" I have heard it described), then overwrite the existing file. This is to happen to every PDF in the specified directory. I have tried several version of defining my PageObj variable, but can't seem to get it right.
# Variables for User input values
x = values["-pdf_dir-"]
a = values["-paper_size-"]
s = values["-state-"]
# Location to find seal templates
state = f"G:/Drafting/Kain Mincey/Allen's seals/Correctly Sized/{a}/{s}.pdf"
Seal_pdf = PdfFileReader(open(state, "rb"), strict=False)
input_pdf = glob.glob(os.path.join(x, '*.pdf'))
output_pdf = PdfFileWriter()
page_count = len(fnmatch.filter(os.listdir(x), '*.pdf'))
i = 0
if a == "11x17":
for file in input_pdf:
sg.OneLineProgressMeter('My Meter', i, page_count, 'And now we Wait.....')
PageObj = PyPDF2.PdfFileReader(open(file, "rb"))
PageObj.scaleTo(11*72, 17*72)
PageObj.mergePage(Seal_pdf.getPage(0))
output_pdf.addPage(PageObj)
output_filename = f"{x[:-4]}"
i = i + 1
PdfFileReader returns the whole file. scaleTo applies to a page. You have to fetch the page you want with getPage. –Tim Roberts Mar 28 at 21:02
I'm trying to make a program that scans PDFs downloaded from a website with selectable text and highlights specific discrepancies. I can make it work for specific "bad words" and "good words" but I am stuck on how to make it find missing check boxes. They are no longer interactive fields in PDF form:
Here is my code for everything else so far:
import os
import fitz
source_folder = r"C:\Users\Sserb\Desktop\Test Files"
list_files = os.listdir(source_folder)
good_terms = ["trend", "decrease", "increase"]
bad_terms = ["school", "academic", "homework"] # words that should be in every pdf file (not every page)
pdf_files = [x for x in list_files if x.endswith(".pdf")]
highlight_summary = []
good_term_summary = []
for file_name in pdf_files:
# READ IN PDF
full_filename = os.path.join(source_folder, file_name)
doc = fitz.open(full_filename)
good_terms_not_found = good_terms.copy()
list_hl_pages = []
for page_num, page in enumerate(doc, 1):
# SEARCH
for text in bad_terms:
text_instances = page.search_for(text)
# HIGHLIGHT
for inst in text_instances:
highlight = page.addHighlightAnnot(inst)
highlight.update()
if page_num not in list_hl_pages:
list_hl_pages.append(page_num)
# Search for good terms- all must be found
words_found = []
for good_word in good_terms_not_found:
text_instances = page.search_for(good_word)
if text_instances:
words_found.append(good_word)
for word in words_found:
good_terms_not_found.remove(word)
highlight_summary.append([file_name, list_hl_pages.copy()])
if good_terms_not_found:
good_term_summary.append([file_name, good_terms_not_found.copy()])
# OUTPUT
if list_hl_pages:
out_file = file_name.replace(".pdf", "-errors.pdf")
doc.save(os.path.join(source_folder, "output", out_file), garbage=4, deflate=True, clean=True)
else:
doc.close()
#print(highlight_summary)
print(good_term_summary)
output_folder=r"C:\Users\Sserb\Desktop\Test Files\output"
new = os.path.join(output_folder,'outputfile.txt')
file = open(new, 'w')
value = str(good_term_summary) + '\n'
file.write(value)
file.close()
Both "value" and "export value" are always treated as text, but there are at least 8 different kinds of check-boxes in word. see how these are altered by the font used here Check boxes are shown as ☐ when unchecked, or ☑ or ☒ when checked, so search for ☑Client rather than ☐Client etc
I write a script for the Avidemux app to read a folder and get all video files in that folder then split them into a number of the part base on the file size.
the problem is that the input_ext variable which used in the get_folder_content function for checking the file format in the folder. it only gets one format and I can't set a list of formats to check it from the folder.
the input_ext variable only gets one format but I want to set a bunch of formats to it.
my code is this
adm = Avidemux()
gui = Gui()
# file set list of of input files
# input_ext = 'mp4'
input_ext = 'mkv'
# file extension for output files
output_ext = 'MKV'
def convert_file(input_file, output_folder):
file_size = get_file_size(input_file)
number_of_part = math.ceil(file_size/2037760000)#equal to 1990 MB 2037760000
if number_of_part >1:
for part in range(0,number_of_part):
file_name = " ".join(basename(input_file).split('.')[0:-1])
output_file = output_folder + '/' +file_name+'_part'+str(part+1)+'.mkv'
adm.loadVideo(input_file)
len = adm.markerB
adm.clearSegments()
adm.addSegment(0, 0, adm.markerB)
adm.markerA = (part/number_of_part)*len
adm.markerB = ((part+1)/number_of_part)*(len)
adm.videoCodec("Copy")
adm.audioClearTracks()
adm.setSourceTrackLanguage(0,"und")
if adm.audioTotalTracksCount() <= 0:
raise("Cannot add audio track 0, total tracks: " + str(adm.audioTotalTracksCount()))
adm.audioAddTrack(0)
adm.audioCodec(0, "copy")
adm.audioSetDrc(0, 0)
adm.audioSetShift(0, 0, 0)
adm.setContainer("MKV", "forceAspectRatio=False", "displayWidth=1280", "displayAspectRatio=2", "addColourInfo=False", "colMatrixCoeff=2", "colRange=0", "colTransfer=2", "colPrimaries=2")
adm.save(output_file)
def main():
input_folder = gui.dirSelect("Select the source folder")
# input_folder =
files = get_folder_content(input_folder, input_ext)
if files is None:
gui.displayError("Error", "Folder doesn't containt any ." + input_ext+ " file")
return 0
output_folder = gui.dirSelect("Select the output folder")
# output_folder =
for one_file in files:
convert_file(one_file, output_folder)
print("Done")
main()
I want to use input_ext like this :
input_ext = ['mkv','mp4']
Also, I can't find the lib document for tinypy to read function and find the solution
First time posting a question here, hopefully, someone who experienced/tried this please share your insights... I've been working to get this far in the last few days and nights... now I am getting nowhere to loop this script on every file in a directory.
Bascially, these two scripts work perfectly fine it brings a pdf file and changes it to an excel workbook. Now what I need to do is going through all files from a selected directory and do the same job.
I am keep getting stuck at the opening the file stage - is this saying that the data (the pdf page - data[0]) cant be called in? or should i add more stages in to bring the dataset in...?
Do I have to create a list for the dataset so I can call in the data as you would have more than a data to call in.. is this why python can read the data[0] ???
Revised Script
# import
import os
import glob
import pdftotext
import openpyxl
from pathlib import Path
from string import ascii_uppercase
# open a pdf file
def to_excel(pdf_file):
with open(pdf_file,'rb') as f:
data = pdftotext.PDF(f)
# operate data to get titles, values
datas = data[0].split('\r\n')
finalData = list()
for item in datas:
if item != '':
finalData.append(item)
finalDataRefined = list()
for item in finalData:
if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
finalDataRefined.append(item.strip())
titles = list()
values = list()
for num, item in enumerate(finalDataRefined):
if num % 2 == 0:
titles.append(item)
else:
values.append(item)
# get an output file name
OPRAST = values[1]
filename = work_dir / f"{OPRAST}.xlxs"
# create an excel workbook
excel_file = openpyxl.Workbook()
excel_sheet = excel_file.active
excel_sheet.append([])
alphaList = list(ascii_uppercase)
for alphabet in alphaList:
excel_sheet.column_dimensions[alphabet].width = 20
excel_sheet.append(titles)
excel_sheet.append(values)
# save the excel workbook
excel_file.save(filename)
excel_file.close
# run a python script every file in a directory
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
to_excel(pdf_file)
I basically know what you want to do, but your code's indent is not so readable... especially it's python.
Your goal is to create a excel for each pdf file in you prefix dir? or aggregate all the pdf files together to a single excel file?
The follow coding is for the first goal.
Code logic.
get all the pdf file
loop over all the pdf file, for each:
open pdf file
some operation
export to excel file
You full code maybe like this(just guess):
# ----------------import part-------------------
import os
import glob
import pdftotext
import openpyxl
from string import ascii_uppercase
from pathlib import Path
def to_excel(pdf_file):
with open(pdf_file, 'rb') as f: # this open the pdf file
data = pdftotext.PDF(f)
# ---------------operate the data, get title and value-----------
datas = data[0].split('\r\n')
finalData = list()
for item in datas:
if item != '':
finalData.append(item)
finalDataRefined = list()
for item in finalData:
if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
finalDataRefined.append(item.strip())
titles = list()
values = list()
for num, item in enumerate(finalDataRefined):
if num % 2 == 0:
titles.append(item)
else:
values.append(item)
# ------------------get output file name---------------------
OPRAST = values[1]
filename = work_dir / f"{OPRAST}.xlxs"
# ------------------create excel file sheet------------------
excel_file = openpyxl.Workbook()
excel_sheet = excel_file.active
excel_sheet.append([])
alphaList = list(ascii_uppercase)
for alphabet in alphaList:
excel_sheet.column_dimensions[alphabet].width = 20
excel_sheet.append(titles)
excel_sheet.append(values)
# --------------------save----------------
excel_file.save(filename)
excel_file.close
# -------------------main program---------------
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
to_excel(pdf_file)
Yesterday, I asked a question that was perhaps too broad.
Today, I've acted on my ideas in an effort to implement a solution.
Using ReportLab, pdfquery and PyPDF2, I'm trying to automate the process of generating barcodes on hundreds of pages in a PDF document.
Each page needs to have one barcode. However, if a page has a letter in the top right ('A' through 'E') then it needs to use the same barcode as the previous page. The files with letters on the top right are duplicate forms with similar information.
If there is no letter present, then a unique barcode number (incremented by one is fine) should be used on that page.
My code seems to work, but I'm having two issues:
The barcode moves around ever so slightly (minor issue).
The barcode value will not change (major issue). Only the first barcode number is set on all pages.
I can't seem to tell why the value isn't changing. Does anyone have an a clue?
Code is here:
import pdfquery
import os
from io import BytesIO
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.graphics.barcode import eanbc
from reportlab.graphics.shapes import Drawing
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import mm
from reportlab.pdfgen import canvas
from reportlab.graphics import renderPDF
pdf = pdfquery.PDFQuery("letters-test.pdf")
total_pages = pdf.doc.catalog['Pages'].resolve()['Count']
print("Total pages", total_pages)
barcode_value = 12345670
output = PdfFileWriter()
for i in range(0, total_pages):
pdf.load(i) # Load page i into memory
duplicate_letter = pdf.pq('LTTextLineHorizontal:in_bbox("432,720,612,820")').text()
if duplicate_letter != '':
print("Page " + str(i+1) + " letter " + str(duplicate_letter))
print(barcode_value)
packet = BytesIO()
c = canvas.Canvas(packet, pagesize=letter)
# draw the eanbc8 code
barcode_eanbc8 = eanbc.Ean8BarcodeWidget(str(barcode_value))
bounds = barcode_eanbc8.getBounds()
width = bounds[2] - bounds[0]
height = bounds[3] - bounds[1]
d = Drawing(50, 10)
d.add(barcode_eanbc8)
renderPDF.draw(d, c, 400, 700)
c.save()
packet.seek(0)
new_pdf = PdfFileReader(packet)
# read existing PDF
existing_pdf = PdfFileReader(open("letters-test.pdf", "rb"))
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(i)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
else:
# increment barcode value
barcode_value += 1
print("Page " + str(i+1) + " isn't a duplicate.")
print(barcode_value)
packet = BytesIO()
c = canvas.Canvas(packet, pagesize=letter)
# draw the eanbc8 code
barcode_eanbc8 = eanbc.Ean8BarcodeWidget(str(barcode_value))
bounds = barcode_eanbc8.getBounds()
width = bounds[2] - bounds[0]
height = bounds[3] - bounds[1]
d = Drawing(50, 10)
d.add(barcode_eanbc8)
renderPDF.draw(d, c, 420, 710)
c.save()
packet.seek(0)
new_pdf = PdfFileReader(packet)
# read existing PDF
existing_pdf = PdfFileReader(open("letters-test.pdf", "rb"))
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(i)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# Clear page i from memory and re load.
# pdf = pdfquery.PDFQuery("letters-test.pdf")
outputStream = open("newpdf.pdf", "wb")
output.write(outputStream)
outputStream.close()
And here is letters-test.pdf
as Kamil Nicki's answer pointed out, Ean8BarcodeWidget limiting effective digits to 7:
class Ean8BarcodeWidget(Ean13BarcodeWidget):
_digits=7
...
self.value=max(self._digits-len(value),0)*'0'+value[:self._digits]
you may change your encoding scheme or use EAN 13 barcode with Ean13BarcodeWidget, which has 12 digits usable.
The reason why your barcode is not changing is that you provided too long integer into eanbc.Ean8BarcodeWidget.
According to EAN standard EAN-8 barcodes are 8 digits long (7 digits + checkdigit)
Solution:
If you change barcode_value from 12345670 to 1234560 and run your script you will see that barcode value is increased as you want and checkdigit is appended as eighth number.
With that information in hand you should use only 7 digits to encode information in barcode.