Merging PDF's with PyPDF2 with inputs based on file iterator - python

I have two folders containing PDFs with identical file names. I want to iterate through the first folder, take the first 3 characters of each file name as the 'current' page name, use that value to grab the 2 corresponding PDFs (one from each folder), merge them, and write the result to a third folder.
The script below works as expected for the first iteration, but after that each merged PDF also includes all the previous ones (ballooning quickly to 72 pages within 8 iterations).
Some of this could be due to poor code, but I can't figure out where the problem is, or how to clear the inputs/outputs that might be preventing the script from writing only 2 pages per iteration:
# For every file found under rootdir, take the first three characters of its
# name, open the same-named PDF in two folders, append page 0 of each to the
# merger and write the result to CombinedMaps/Sch_<name>.pdf.
import os
from PyPDF2 import PdfFileMerger
# NOTE(review): one merger object is created here, before the loop, and every
# iteration below appends two more inputs to this same object before writing.
merger = PdfFileMerger()
rootdir = 'D:/Python/Scatterplots/BoundaryEnrollmentPatternMap'
for subdir, dirs, files in os.walk(rootdir):
for currentPDF in files:
#print os.path.join(file[0:3])
# os.path.join with a single argument returns that argument unchanged, so
# pagename is simply the first three characters of the file name.
pagename = os.path.join(currentPDF[0:3])
print "pagename is: " + pagename
print "File is: " + pagename + ".pdf"
# Both input paths are rebuilt from pagename rather than from subdir/currentPDF.
input1temp = 'D:/Python/Scatterplots/BoundaryEnrollmentPatternMap/' + pagename + '.pdf'
input2temp = 'D:/Python/Scatterplots/TraditionalScatter/' + pagename + '.pdf'
input1 = open(input1temp, "rb")
input2 = open(input2temp, "rb")
# pages=(0,1) selects only the first page of each input.
merger.append(fileobj=input1, pages=(0,1))
merger.append(fileobj=input2, pages=(0,1))
outputfile = 'D:/Python/Scatterplots/CombinedMaps/Sch_' + pagename + '.pdf'
print merger.inputs
output = open(outputfile, "wb")
merger.write(output)
output.close()
#clear all inputs - necessary?
# NOTE(review): the assignments below only rebind names to empty lists; none of
# them calls close() on input1/input2. Only the `inputs` attribute of the merger
# is reset -- whether that also discards pages already appended depends on
# PdfFileMerger internals; confirm against the PyPDF2 documentation.
outputfile = []
output = []
merger.inputs = []
input1temp = []
input2temp = []
input1 = []
input2 = []
print "done"
My code / work is based on this sample:
https://github.com/mstamy2/PyPDF2/blob/master/Sample_Code/basic_merging.py

I think the error is that merger is initialized before the loop, so it accumulates all the documents. Try moving the line merger = PdfFileMerger() into the loop body. merger.inputs = [] doesn't seem to help in this case.
There are a few notes about your code:
input1 = [] doesn't close the file; it only rebinds the name, so the program ends up holding many open files. Call input1.close() instead.
[] is an empty list. It is better to use None if a variable should not hold any meaningful value.
To remove a variable (e.g. output), use del output.
In any case, clearing all the variables is not necessary; they will be freed by the garbage collector.
Use os.path.join to create input1temp and input2temp.

Related

Direct my output, a set of files, to a directory that was created within the script

I have this script which takes as input a file in a matrix-like format, specifically a sequence profile (.pssm), extracts one side of the matrix, normalises the values and puts them into a new file. Currently the script creates a new file called id.profile in the directory I am working from, and creates an EMPTY folder named "norm_seqprof_output".
I would like it to put that output into the directory which the script is creating. I have tried many different things, but I cannot connect the output to the newly created directory.
import sys
import os.path
from pathlib import Path
def pssm_list(infile):
    """Return the body lines of a .pssm file as a list of strings.

    The first 3 and the last 6 lines of the file are dropped; every remaining
    line is returned unchanged (trailing newline included). According to the
    original author these remaining rows hold the two matrices without the
    header.
    """
    with open(infile) as handle:
        every_line = list(handle)
    # Keep everything between the 3-line head and the 6-line tail.
    return every_line[3:-6]
def lines_to_list(infile1):
    """Return every line of *infile1* as a list of strings.

    Each entry keeps its trailing newline character, so callers that use the
    entries as file names have to strip it themselves.
    """
    with open(infile1, 'r') as source:
        return list(source)
def relevant_lines(infile2):
    """Extract the sequence-profile columns from a .pssm file.

    The file is read through pssm_list(); each returned row is split on
    whitespace and fields 22 up to (but not including) 42 are kept.

    Returns a list of lists, one inner list of field strings per row.
    """
    return [row.split()[22:42] for row in pssm_list(infile2)]
# # divide all values by 100
def write_normalized_profile(profile_final_list, ofile):
    """Append a profile matrix, scaled by 1/100, to the file *ofile*.

    profile_final_list is a list of rows, each row a list of values that
    int() accepts. Every value is divided by 100, converted to a string and
    followed by a tab; every row is terminated by a newline. The file is
    opened in append mode, so existing content is kept.
    """
    with open(ofile, "a") as wfile:
        for row in profile_final_list:
            # One tab-terminated cell per value, written in row order.
            wfile.writelines(str(int(value) / 100) + '\t' for value in row)
            wfile.write("\n")
# Script entry point: read an id list, and for each id write a normalised
# profile to "<id>.profile", then create the output directory and echo a file.
if __name__ == '__main__':
# NOTE(review): infile is a fixed path; it is not rebuilt per id in the loop.
infile = ('/Users/Desktop/PDB/d3ps.pssm') #sys.argv[1] # the idlist to loop on
#print(infile)
# Call the function by looping through an id list+'.pssm' extension
# name the outfile the same --> id list+'.profile'
idlist = lines_to_list('/Users/Desktop/PDB/idlist') # containing the id of the file but NOT the extension ".pssm"
#print(idlist)
for ids in idlist:
#print(ids)
# path may need to be added before "<the path>"+ids.rstrip()
# NOTE(review): part2 is built here but never read again in this block; the
# existence check and relevant_lines() below both use infile instead.
part2 = ids.rstrip() + '.pssm' # removing newlinecharacter, adding necessary extension
#print(part2)
if os.path.isfile(infile) == True: # does this file exist?
# ofile is a bare file name, so open() resolves it against the current
# working directory, not against norm_seqprof_output.
ofile = ids.rstrip() + '.profile' # outfile for each id with correct extension
#print(ofile)
profile_list = relevant_lines(infile)
#print(profile_list)
write_normalized_profile(profile_list, ofile)
#print(write_normalized_profile)
#print(profile_list)
else:
print("Error file: " + infile + " not found.")
# The directory is created here, but no path used in this block points into it.
if not os.path.isdir("/Users/Desktop/PDB/norm_seqprof_output"):
os.mkdir("/Users/Desktop/PDB/norm_seqprof_output")
with open(ofile, 'r') as G:
print(G.read())
# From here on print() writes to a file literally named "file" in the current
# working directory.
sys.stdout = open("file", "w")
# print(G) prints the file object itself, not its contents.
print(G)
# NOTE(review): the attribute is referenced without parentheses, so close()
# is not actually called.
sys.stdout.close
I would like it to put that output into that directory which the script is creating.
I assume you want to write a file containing the output to the directory?
Generally you take the directory that you want to save to ("/Users/Desktop/PDB/norm_seqprof_output"), add your separator "/" and then your file name ("id.profile").
You can use os.path.join to do this
output_folder = "/Users/Desktop/PDB/norm_seqprof_output"
file_name = "id.profile"
output_file = os.path.join(output_folder, file_name)
or just
output_file = output_folder + "/" + file_name
and then write your result to output_file

Python script iterates over whole folder but skips files in the folder

I tried running the following code. The code should read hdf5 files from a directory and create for every hdf5 file a png and a txt file with the same name (btw. I need it as input for the CNN YOLO).
The code does what I described, but only for 20 images! I added print(i) to check that the for-loop is working properly... and it is: it prints every file in the directory (over 200 files). But it creates only 20 .png and 20 .txt files.
# For every regular file in path1 whose name ends with ".hdf", save its 'image'
# dataset as a PNG and a table derived from its 'labels' dataset as a TXT file
# under path2.
def process_fpath(path1, path2):
sensor_dim = (101, 101)
onlyfiles = [f for f in os.listdir(path1) if isfile(join(path1, f))]
for i in onlyfiles:
if i.endswith(".hdf"):
print(i)
#cut ".hdf"
# NOTE(review): ".hdf" is 4 characters but 5 are sliced off here, so the last
# character of the stem is dropped as well; different input files can then
# map to the same output name and overwrite each other.
name = str(i[0:-5])
# create png
# path1 and path2 are concatenated directly with the file name, so both are
# presumably expected to end with a path separator -- confirm with the caller.
im = h5py.File(path1 + str(i), 'r')
labels_im = im['labels']
image = im['image']
plt.imsave(path2 + name + '.png', image)
# create txt
# Columns written: a zero column, then label columns 0, 1 and 3 divided by
# the sensor dimensions.
# NOTE(review): the last two columns are the same expression
# (labels_im[:,3]/sensor_dim[0]); confirm whether that is intended.
exp = np.column_stack((np.zeros(np.size(labels_im,0)) , labels_im[:,0]/sensor_dim[0], labels_im[:,1]/sensor_dim[1], labels_im[:,3]/sensor_dim[0], labels_im[:,3]/sensor_dim[0]))
np.savetxt(path2 + name + '.txt', exp, delimiter = ' ', fmt=['%d', '%8f', '%8f', '%8f', '%8f'])
continue
else:
continue
This is my first post so if something isn't proper please let me know.
Maybe it's because of the name variable? You remove 5 characters but you want to remove only 4: name = str(i[0:-4])
Not related to your question, the last 3 lines are useless. you can remove them.
continue
else:
continue
Try to run on a given file that is not working to understand what the problem is instead of looping on each of them.

compare two list of files between different directories in python

I am trying to compare two list of files from different directories. If there is a match found, the file should be written in a different directory. Below is my code.
# Compare every file in filelist with every file in filelist2 line by line and
# write equal lines to numbered files in the 'right' directory.
filelist= ['sample2\\output_1.txt','sample2\\output_2.txt','sample3\\asn_todlx_mf_output_3.txt']
filelist2 = ['sample\\output_1.txt','sample\\output_3.txt','sample\\output_7.txt','sample\\output_2.txt','sample1\\output_3.txt']
# Counter used to number the output files.
a = 1
for name in filelist:
a = a + 1
for x in filelist2 :
file1 = open(x, 'r')
file2 = open(name,'r')
# NOTE(review): the output file is opened in 'w' mode before any comparison is
# made, so it is created (or truncated) whether or not a matching line exists.
FO = open('right\\right_file'+str(a)+'.txt', 'w')
for line1 in file1:
# NOTE(review): if this loop is nested inside the one above (the original
# indentation is not visible here), file2 is consumed during the first line1
# and yields nothing for the following ones.
for line2 in file2:
if line1 == line2:
# line1 still carries its own newline, so "%s\n" adds a second one.
FO.write("%s\n" %(line1))
FO.close()
file1.close()
file2.close()
For instance, output1 from the 'sample' folder (filelist) is compared with every file in 'sample2' (filelist); if there is a match, it should be written to the 'right' folder as, e.g., 'right_file1.txt'. But the script generates 15 files, from 'right_file1.txt' to 'right_file15.txt'. It works well when I compare one file with the list of files. Please help me get this working.
That's how i would do it.
filelist1 = ['sample2\\output_1.txt','sample2\\output_2.txt','sample3\\asn_todlx_mf_output_3.txt']
filelist2 = ['sample\\output_1.txt','sample\\output_3.txt','sample\\output_7.txt','sample\\output_2.txt','sample1\\output_3.txt']

# Bare file names (last path component) of the first list, in their original
# order, plus a lookup from bare name back to the full path. Keeping the full
# path per file matters because the entries do not all share one directory.
names1 = [x.split('\\')[-1] for x in filelist1]
path_of = dict(zip(names1, filelist1))

# Bare names of the second list; a set makes the membership test cheap.
names2 = set(x.split('\\')[-1] for x in filelist2)

common = [x for x in names1 if x in names2]
print(common)
# ['output_1.txt', 'output_2.txt']

a = 1
for name in common:
    a += 1  # incremented before use, so numbering starts at 2 (as in the question)
    with open(path_of[name]) as f_in:
        # read() returns one string, which is what write() expects.
        contents = f_in.read()
    with open('right\\right_file' + str(a) + '.txt', 'w') as f_out:
        f_out.write(contents)
First I look for the files that are common to the two lists and store their names in common. Then, for every file in the common list, I create a copy in the other directory, as you mentioned. Notice the use of with, which handles the closing and flushing of the files. Use it instead of managing the files manually unless you have a reason not to.
Finally, I did not get the logic behind your counter a, so I just copied it from you. It starts with the value 2! If you want to take the number from the name of the copied file, you have to go about it differently. Your way makes the origin of the created file untraceable.
Let me know if that worked for you.

Python - reading a single file, extracting data from it, and then writing it to many files

I am trying to read 1 file, parse data from it then write each parsed column to another file in a different directory. Here is my code:
# Create the target directory; skip the call when it already exists so the
# script can be run more than once.
if not os.path.isdir("new_directory"):
    os.makedirs("new_directory")

output = "new_file.txt"
num = 3  # index of the column that is copied as the sample value

# Both files are opened in a with-block so they are closed (and the output is
# flushed) even if a line turns out to be malformed.
with open("original_file.txt", "r") as my_file, \
        open(os.path.join("new_directory", output), "w") as outputfile:
    for line in my_file:
        # Drop the line break first; otherwise it stays glued to the last
        # field and the output gets an extra blank line when that field is
        # the sample column.
        fields = line.rstrip("\n").split("\t")
        sample = fields[num]
        lst = fields[1] + "\t" + fields[2] + "\t" + sample + "\n"
        outputfile.write(lst)
This is as far as I have got. I want to write multiple output files with different information from the original file, and save them all in the new directory. How can I write a loop that changes the output file name each time? I was trying to increment 'num' and then maybe add it to the output variable:
output = "new_file" + num + ".txt"
outputfile = open("new_directory/"+output, "w")
Or something like this. Is there a better way?
You can create a list of files
# Open one output file per index and keep the handles in a list.
outputfiles = []
for num in range(number_of_output_files):
    output = "new_file%d.txt" % num
    outputfiles.append(open("new_directory/" + output, "w"))

# Write to every file through its stored handle.
for handle in outputfiles:
    handle.write('something\n')

# Close all handles once the writing is done.
for handle in outputfiles:
    handle.close()

Generating a list of files

I have this this code here... it generates a list of files and folders in a directory
import os

# Collect the names of all entries directly inside the folder, one per line.
# The raw string keeps the backslash literal ("\U" would otherwise start a
# unicode escape in Python 3).
text = ""  # must exist before the loop appends to it
for x in os.listdir(r"C:\Users"):
    text = text + "\n" + x

# The with-block closes the file even if the write fails.
with open("List.txt", "w") as f:
    f.write(text)
But how can I get it to do two things...
First, it should read what is inside each folder and keep descending until there are no more subfolders.
Secondly when it goes down a level it adds a tab. Like this
Level 1 (parent)
Level 2 (child)
How can I get it to add that tab in? For infinite amount of levels?
Use os.walk() instead:
def indented_listing(start):
    """Return the file names found under *start*, indented by directory depth.

    Files directly inside *start* get no indentation; files one directory
    further down are prefixed with one tab, and so on. Directories are
    visited in the order os.walk() yields them.
    """
    entries = []
    # os.walk yields (dirpath, dirnames, filenames) -- in that order.
    for path, dirnames, filenames in os.walk(start):
        relative = os.path.relpath(path, start)
        # relpath gives '.' for the start directory itself (depth 0); below
        # that, the number of path components is the depth.
        depth = 0 if relative == os.curdir else len(relative.split(os.sep))
        entries.extend([('\t' * depth) + f for f in filenames])
    return entries


start = r'C:\Users'
with open("List.txt", "w") as output:
    output.write('\n'.join(indented_listing(start)))
In each loop, path is the full path to the directory the filenames and dirnames are directly contained in. By using os.path.relpath we extract the 'local' path, count the depth and use that for the number of tabs to prepend each filename with.

Categories

Resources