How to run python on multiple folder to create pdf? - python

The following code combines multiple images into one PDF. I am trying to run this code on multiple folders, where each folder contains several images, so that each folder ends up with its own PDF.
import os
from PIL import Image
from fpdf import FPDF
# Combine the numbered PNG images (IMG001.png .. IMG099.png) found in `sdir`
# into a single PDF with one image per page.
pdf = FPDF()  # fallback document with fpdf defaults, used only if no image exists
sdir = "imageFolder/"
w, h = 0, 0
page_size_known = False
processed = 0
for i in range(1, 100):
    fname = sdir + "IMG%.3d.png" % i
    if not os.path.exists(fname):
        print("File not found:", fname)
        continue
    if not page_size_known:
        # Size the pages from the first image that actually exists, so a
        # missing IMG001.png no longer leaves the document at the default
        # format with w = h = 0.
        with Image.open(fname) as cover:
            w, h = cover.size
        pdf = FPDF(unit="pt", format=[w, h])
        page_size_known = True
    pdf.add_page()
    pdf.image(fname, 0, 0, w, h)
    processed += 1
# Report the number of images added, not the last loop index.
print("processed %d" % processed)
pdf.output("output.pdf", "F")
print("done")
I was thinking to create another loop to bring the folder path which will come before the first loop:
For j in range(1 to 70):
folderP=sdir+folder%1
And loop in each folder
Sorry I am still learning python. Any suggestion would be great!

You can use glob to get the paths of all pdfs and add them to a list, then you just iterate through the list and you wouldn't even need to check if they exist:
from glob import glob
# Collect the path of every PDF below sDir, descending into sub-folders.
sDir = 'imageFolder/'
pdfPaths = glob(f'{sDir}**/*.pdf', recursive=True)
for pdf in pdfPaths:
    # do stuff
    pass  # placeholder: a loop body consisting only of a comment is a SyntaxError

Related

Optimize execution time for retrieving data from xml and copy images

the purpose of the following code is to copy image files from one directory to another directory and reorganize the images in a hierarchical structure based on information extracted from XML files.
from bs4 import BeautifulSoup as bs
import shutil
import os
import glob
campaign_name = "CAMPAIGN_2020"
xml_directory = r'XML_DIRECTORY'
picture_directory = r'PICTURE_DIRECTORY'
output_directory = r'OUTPUT_DIRECTORY'
def _index_pictures_by_name(picture_files):
    """Group picture paths by their base file name for O(1) lookup."""
    index = {}
    for picture in picture_files:
        index.setdefault(os.path.basename(picture), []).append(picture)
    return index


def copy_files(content, picture_files):
    """Copy the pictures referenced by one XML document into the output tree.

    Args:
        content: Text of one XML file.
        picture_files: Either an iterable of picture paths, or a dict that
            maps a base file name to the list of paths carrying that name
            (as built by ``_index_pictures_by_name``). Passing the dict lets
            the caller index the pictures once instead of once per XML file.

    For every ``<img>`` inside an ``<images>`` element whose ``src`` base name
    matches a known picture, the target path is assembled from attributes of
    the ``<img>`` and of its grandparent element, the target directory is
    created under ``output_directory``, and the picture is copied there.
    """
    if isinstance(picture_files, dict):
        pictures_by_name = picture_files
    else:
        pictures_by_name = _index_pictures_by_name(picture_files)

    bs_content = bs(content, "lxml")
    # Walk the XML once and look each image up by name, instead of scanning
    # the whole document again for every picture on disk.
    for image_group in bs_content.find_all("images"):
        for image in image_group.find_all('img'):
            matches = pictures_by_name.get(os.path.basename(image['src']))
            if not matches:
                continue
            record = image.parent.parent  # element that carries data/assignment/date
            station = image['station']
            first_field = record.data['first_field']
            second_field = record.data['second_field']
            start = int(record.data['start'])
            end = int(record.data['end'])
            # NOTE(review): start - end is kept as in the original; confirm
            # that the sign is intended.
            length = start - end
            class_name = record.assignment['class_name']
            number = record.assignment['number']
            img_nr = int(image['img_nr'])
            location = record.assignment['location']
            date = record['date']
            # set the complete picture path; os.path.join avoids the invalid
            # "\{" escape sequences of a hand-written backslash path
            picture_path = os.path.join(
                f'{class_name}{number}',
                f'{first_field}_{second_field}_{length}_{start}_{end}',
                str(adjust_date(date)),
                str(campaign_name),
                str(adjust_location(location)),
                str(adjust_img_nr(img_nr)),
                f'{station.zfill(5)}.jpg',
            )
            dst_file = os.path.join(output_directory, picture_path)  # assembled target path
            # create new subdirectories if they do not already exist
            os.makedirs(os.path.dirname(dst_file), exist_ok=True)
            for src_file in matches:  # original picture path(s) with this name
                shutil.copy(src_file, dst_file)


# consider files in all subdirectories that end with .jpg, adjust if necessary
picture_list = glob.glob(os.path.join(picture_directory, '**', '*.jpg'), recursive=True)
# Build the name index a single time; it is shared by every XML file below.
picture_index = _index_pictures_by_name(picture_list)
for path in os.listdir(xml_directory):
    if path.endswith(".xml"):  # only consider files that end with .xml
        with open(os.path.join(xml_directory, path), "r") as file:
            xml_content = file.read()
        copy_files(xml_content, picture_index)
I tested the code and it works for the most part. To copy 20 pictures the tool needs around 2 hours, so I have to drastically improve the execution time. How can I do that?
To give you an idea: I have around 8k xml files and around 400k pictures :D

How do I select all images from a folder? -MoviePy, Python

I'm pretty new in python and trying to make a python script to put images together into a video(.mp4) using MoviePy.
However, I have multiple files and would like to be more efficient by naming the folder and selecting all images within it, rather than having to select each image individually.
Here's my Code:
from moviepy.editor import *
import os
# Build one 4-second clip for each of images0.jpg .. images9.jpg and join
# them into a single video. os.path.join replaces the 'imagesfolder\images0.jpg'
# literals, whose "\i" is an invalid escape sequence.
clips = [
    ImageClip(os.path.join('imagesfolder', f'images{index}.jpg')).set_duration(4)
    for index in range(10)
]
video_clip = concatenate_videoclips(clips, method='compose')
video_clip.write_videofile("memes.mp4", fps=24, remove_temp=True, codec="libx264",
                           audio_codec="aac")
You can use a function called glob to find all files in a directory which match a pattern.
Eg
from glob import glob
# The images in the question are .jpg files, so match that extension.
# glob returns paths in arbitrary order; sort them for a stable frame sequence
# (plain sorting is lexicographic, so "images10.jpg" sorts before "images2.jpg").
image_paths = sorted(glob("imagesfolder/*.jpg"))
clips = [ImageClip(path).set_duration(4) for path in image_paths]
video_clip = concatenate_videoclips(clips, method="compose")

Open all images in a folder and apply and merge it with a given folder?

I am adjusting a script.
I have 4427 images in a specified folder, named
(1).png
(2).png
(3).png
etc.
Along with those, I have another 14 images, named:
1.png
2.png
3.png
etc.
Basically the script should:
Take a specific image I tell it to open of the 4427
Then, open one of the 14 images at random
Merge the two and save it to a specified directory.
Code
import os
import random
from PIL import Image
# Composite one randomly chosen overlay from `path` onto "(1).png".
path = r"C:\Users\17379\Desktop\images\Low effort glasses"
overlay_names = [
    x for x in os.listdir(path)
    if os.path.isfile(os.path.join(path, x))
]
random_filename = random.choice(overlay_names)
print(random_filename)
x = Image.open(r"(1).png").convert("RGBA")
# os.listdir yields bare file names, so the folder has to be joined back on;
# alpha_composite also needs both images in RGBA mode.
y = Image.open(os.path.join(path, random_filename)).convert("RGBA")
z = Image.alpha_composite(x, y)
z.save(r"C:\Users\17379\Desktop\images\Resized punks\Resized punks\punk1.png")
How to do this to all 4427 images and then save each file to the specified directory?
The pseudo-code for your task is:
for file_name in source_image_list:
# 1. Take a specific image I tell it to open of the 4427
source_image = open_source(file_name)
# 2. Then, open one of the 14 images at random
random_image = open_random_of(random_image_list)
# 3. Merge the two and save it to a specified directory.
target_image = merge(source_image, random_image)
save(target_image, file_name, directory)
Translate this to Python:
import os
import glob
import random
from PIL import Image
def open_random(random_image_list):
    """Open one randomly chosen image from *random_image_list*.

    Args:
        random_image_list: Non-empty sequence of image file paths.

    Returns:
        The opened image converted to RGBA, the mode Image.alpha_composite
        requires for both of its inputs.

    Raises:
        IndexError: If *random_image_list* is empty (from random.choice).
    """
    # The original "random.choice(in random_image_list)" was a syntax error.
    random_filename = random.choice(random_image_list)
    print(f"Random: {random_filename}")
    return Image.open(random_filename).convert("RGBA")
def open_source(file_name):
    """Open the image stored at *file_name* and return it in RGBA mode."""
    opened = Image.open(file_name)
    return opened.convert("RGBA")
def merge(source, random):
    """Return the result of alpha-compositing *random* over *source*.

    Inside this function the parameter name ``random`` hides the ``random``
    module imported at the top of the script.
    """
    composite = Image.alpha_composite(source, random)
    return composite
def save(image, original_name, directory):
    """Save *image* into *directory* under the base name of *original_name*.

    Args:
        image: Object with a ``save(path)`` method (e.g. a PIL image).
        original_name: Path of the source file; only its base name is reused.
        directory: Target folder. It is created if it does not exist yet.
    """
    if directory:
        # Without this, saving fails when the output folder is missing.
        os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, os.path.basename(original_name))
    print(f"Saving: {target}")
    image.save(target)
if __name__ == '__main__':
    source_path = r"C:\Users\17379\Desktop\images"
    random_path = r"C:\Users\17379\Desktop\images\Low effort glasses"
    directory = r"C:\Users\17379\Desktop\images\Resized punks\Resized punks"
    # os.listdir yields bare names; join them with the folder so the files can
    # be opened regardless of the current working directory, and skip
    # anything that is not a regular file.
    random_image_list = [
        os.path.join(random_path, name)
        for name in os.listdir(random_path)
        if os.path.isfile(os.path.join(random_path, name))
    ]
    # glob understands shell-style wildcards, not regular expressions, so
    # match "(<anything>).png" and keep only names whose bracket content is
    # all digits, e.g. "(12).png".
    source_image_list = [
        candidate
        for candidate in glob.glob(os.path.join(source_path, "(*).png"))
        if os.path.basename(candidate)[1:-5].isdigit()
    ]
    for file_name in source_image_list:
        print(f"Source: {file_name}")
        # 1. Take a specific image I tell it to open of the 4427
        source_image = open_source(file_name)
        # 2. Then, open one of the 14 images at random
        #    (the function defined above is open_random, not open_random_of)
        random_image = open_random(random_image_list)
        # 3. Merge the two and save it to a specified directory.
        target_image = merge(source_image, random_image)
        save(target_image, file_name, directory)
Note: I replaced os.listdir(source_path) with glob.glob because it accepts a shell-style wildcard pattern (not a full regular expression) to filter for specific files only. See also
Python3 create list of image in a folder

How to convert latex to image in python

As part of my project for creating a dataset, I have a text file containing a list of LaTeX equations. Now I want to convert them into images in different font sizes using Python, but I don't know how to do it. Please help.
This is the list of latex symbols I am using:- https://docs.mathpix.com/#vocabulary
#
import shutil
import os
from pdflatex import PDFLaTeX
from pdf2image import convert_from_path
from PIL import Image
def crop(file, area=(300, 300, 800, 800)):
    """Crop the image stored at *file* to *area* and overwrite the file.

    Args:
        file: Path of the image to crop in place.
        area: Crop box as (left, upper, right, lower) in pixels. Defaults to
            the box that was previously hard-coded.
    """
    # Close the source handle before writing back to the same path.
    with Image.open(file) as img:
        cropped_img = img.crop(area)
        cropped_img.load()  # make sure the pixel data is read while the file is open
    cropped_img.save(file)
def save_images(images_names, pdf_path, images_path=""):
    """Render the pages of a PDF to cropped JPEG files.

    Args:
        images_names: Output base names (without extension), one per page.
        pdf_path: Path of the PDF to convert.
        images_path: Folder for the JPEG files; "" means the current
            directory.

    Pages and names are paired in order; if their counts differ, the surplus
    pages or names are ignored instead of raising IndexError.
    """
    if not images_names:
        print("names is empty")
        return
    # Store Pdf with convert_from_path function
    images = convert_from_path(pdf_path)
    for name, img in zip(images_names, images):
        # os.path.join keeps the default images_path="" relative; the old
        # images_path + "/" + name form produced the absolute path "/name.jpg".
        target = os.path.join(images_path, name + ".jpg")
        img.save(target, 'JPEG')
        crop(target)
    print("Successfully converted")
def create_image_from_latex(image_name, latex):
    """Compile one LaTeX snippet and store it as images_from_latex/<image_name>.jpg.

    Args:
        image_name: Base name (without extension) of the output image.
        latex: LaTeX source placed between \\begin{document} and \\end{document}.

    The snippet is written to a scratch folder "rough", compiled to PDF,
    converted through save_images, and the scratch folder is removed again
    even when a step fails.
    """
    os.makedirs("rough", exist_ok=True)
    os.makedirs("images_from_latex", exist_ok=True)
    tex_path = os.path.join("rough", "a.tex")
    pdf_path = os.path.join("rough", "a.pdf")
    try:
        with open(tex_path, "w") as f:
            f.write("\\documentclass{article}\n\\usepackage{chemfig}\n\\begin{document}\n")
            f.write(latex + "\n")
            f.write(r"\end{document}")
        pdfl = PDFLaTeX.from_texfile(tex_path)
        pdf, log, completed_process = pdfl.create_pdf(keep_pdf_file=True, keep_log_file=False)
        with open(pdf_path, "wb") as f:
            f.write(pdf)
        # The PDF lives in the scratch folder; the original passed "a.pdf",
        # which points at the current directory instead.
        save_images([image_name], pdf_path, "images_from_latex")
    finally:
        # rmtree removes the folder and everything in it, so no separate
        # os.remove of the PDF is needed.
        shutil.rmtree("rough")
#create_image_from_latex("new_image",lat)
def create_images_from_text_file_with_latexes(text_file):
    """Create one image per non-empty line of *text_file*.

    Args:
        text_file: Path of a text file holding one LaTeX expression per line.

    Images are named after the 1-based line number ("001_", "002_", ...).
    Blank lines are skipped but still counted, so a name always identifies
    its source line.
    """
    with open(text_file) as f:
        latexes = f.read().splitlines()
    for ind, lat in enumerate(latexes, start=1):
        lat = lat.strip()
        if not lat:
            continue  # nothing to render for an empty line
        create_image_from_latex("%0.3d_" % ind, lat)
#

Detect the content type of multiple PDF in a Folder

so far I am using PyPDF2 in anaconda platform to place a watermark in 20000+ pdfs. The code is working for the majority of PDF files but there are a few of them where the content is a poorly scanned image from reports.
I want to know if there is a tool within python or any other way where I can analyse the content of the PDF and determine if the PDF is an image or is a pdf file with text characters. This will allow me to know which files have this defect and place them in other folder.
Thanks
I added my code.
import PyPDF2 #this library requires to be installed
import os
if __name__ == "__main__":
    # Stamp the first page of every PDF found below ROOT_PATH with the first
    # page of the template PDF and write the result as "Reviewed <name>".
    ROOT_PATH = "."
    TEMPLATE_PATH = "."
    STAMP_PATH = "."
    # os.path.join is required here: "." + 'StampTemplate1109.pdf' would give
    # the wrong name ".StampTemplate1109.pdf".
    template_file = os.path.join(TEMPLATE_PATH, 'StampTemplate1109.pdf')
    count = 0
    for dirName, subdirList, fileList in os.walk(ROOT_PATH):
        print('Found directory: %s' % dirName)
        for fileName in fileList:
            # endswith avoids matching names that merely contain ".pdf"
            if not fileName.endswith('.pdf'):
                continue
            count += 1
            print('\tHandling %s - %s %s' % (count, dirName, fileName))
            #=======================main code part ==========================================
            # Files from sub-directories must be opened relative to dirName.
            source_file = os.path.join(dirName, fileName)
            result_path = os.path.join(STAMP_PATH, 'Reviewed ' + fileName)
            with open(source_file, 'rb') as file, open(template_file, 'rb') as water:
                reader = PyPDF2.PdfFileReader(file)
                page = reader.getPage(0)
                reader2 = PyPDF2.PdfFileReader(water)
                waterpage = reader2.getPage(0)
                #command to merge parent PDF first page with PDF watermark page
                page.mergeTranslatedPage(waterpage, 0, -20, expand=True)
                writer = PyPDF2.PdfFileWriter()
                writer.addPage(page)
                #add rest of PDF pages
                for pageNum in range(1, reader.numPages):  # this will give length of book
                    writer.addPage(reader.getPage(pageNum))
                #return the parent PDF file with the watermark
                # here we are writing so 'wb' is for write binary; the inputs
                # stay open until the writer has finished reading from them
                with open(result_path, 'wb') as resultFile:
                    writer.write(resultFile)
            #==============================================================================
    # print() call instead of the Python 2 print statement, matching the
    # other print calls in this script.
    print("TOTAL OF %s PROCESSED" % count)
Since you're already using PyPDF2 you may want to use the PageObject.extractText function to see if you get any text on each page of the PDF. If you get an empty string from a page then it's probably an image.

Categories

Resources