read multiple files automatically no manual file naming - python

I have a directory containing 50 files. I want to read them one by one and compare each of them with another file, which is fixed. I am using glob.glob, but it doesn't work.
Here is how I am reading all the files. If, instead of path = '*.rbd', I give a single file name such as path = 'run-01.rbd', it works.
path = '*.rbd'
path = folder + path
files=sorted(glob.glob(path))
complete code
# Script as posted: walks every path matching files/*.rbd and compares it,
# character by character, with the reference readback file, counting the
# positions that differ.
import glob
from itertools import islice
import linecache
# NOTE(review): islice and linecache are imported but not referenced below.
num_lines_nonbram = 1891427
bits_perline = 32
total_bit_flips = 0
num_bit_diff_flip_zero = 0
num_bit_diff_flip_ones = 0
folder = "files/"
path = '*.rbd'
path = folder + path
# Sorted list of every path matching files/*.rbd.
# NOTE(review): that pattern also matches files/mull-original-readback.rbd itself.
files=sorted(glob.glob(path))
# The reference file is opened once, before the loop over `files`.
original=open('files/mull-original-readback.rbd','r')
#source1 = open(file1, "r")
for filename in files:
del_lines = 101
with open(filename,'r') as f:
# Read del_lines lines from both handles; the values read here are overwritten
# before they are used.
i=1
while i <= del_lines:
line1 = f.readline()
# NOTE(review): `original` is the handle opened before the loop and no seek()
# on it appears anywhere in this script.
lineoriginal=original.readline()
i+=1
i=0
# The per-category counters are set back to zero here.
num_bit_diff_flip_zero = 0
num_bit_diff_flip_ones = 0
num_lines_diff =0
i=0
j=0
k=0
a_write2 = ""
# Walk the remaining (num_lines_nonbram - del_lines) lines of both files.
while i < (num_lines_nonbram-del_lines):
line1 = f.readline()
lineoriginal = original.readline()
# Compare the first bits_perline characters of the two lines.
while k < bits_perline:
if ((lineoriginal[k] == line1[k])):
# a_write2 is appended to here but never read afterwards.
a_write2 += " "
else:
# Characters differ: classify by the character found in the reference line.
if (lineoriginal[k]=="0"):
#if ((line1[k]=="0" and line1[k]=="1")):
num_bit_diff_flip_zero += 1
if (lineoriginal[k]=="1"):
#if ((line1[k]=="0" and line1[k]=="1")):
num_bit_diff_flip_ones += 1
#if ((line1[k]==1 and line1[k]==0)):
#a_write_file2 = str(i+1) + " " + str(31-k) + "\n" + a_write_file2
#a_write2 += "^"
#num_bit_diff_flip_one += 1
# else:
# a_write2 += " "
k+=1
total_bit_flips=num_bit_diff_flip_zero+num_bit_diff_flip_ones
i+=1
k=0
i = 0
# Python 2 print statements: the list of matched paths, then the three counters.
print files
print "Number of bits flip zero= %d" %num_bit_diff_flip_zero +"\n" +"Number of bits flip one= %d" %num_bit_diff_flip_ones +"\n" "Total bit flips = %d " %total_bit_flips
f.close()
original.close()

You could use the os module to first list everything in a directory (both files and sub-directories), then use a list comprehension to keep only the files. You could then use a second list comprehension to keep only the files with a specific extension. There is probably a more efficient way of doing it but this works:
import os
def main():
    """Print the names of the regular files in the current directory that end in ``.txt``."""
    path = './'  # The path to current directory
    # os.listdir() returns files and sub-directories alike; keep only the
    # entries that are regular files.
    files = [name for name in os.listdir(path)
             if os.path.isfile(os.path.join(path, name))]
    # Match on the real suffix: a substring test (".txt" in name) would also
    # accept names such as "notes.txt.bak".
    specificExtensionFiles = [name for name in files if name.endswith(".txt")]
    # specificExtensionFiles is a plain list of .txt file names in the current
    # directory, which you can use in a for loop
    print(specificExtensionFiles)


if __name__ == '__main__':
    main()
For further reference:
How do I list all files of a directory?

The problem is that you're not going back to the beginning of the original file whenever you start comparing with the next file in the `for filename in files:` loop. The simplest solution is to put:
original.seek(0)
at the beginning of that loop.
You could also read the whole file into a list just once before the loop, and use that instead of reading the file repeatedly.
And if you only want to process part of the files, you can read the file into a list, and then use a list slice to get the lines you want.
You also shouldn't be setting num_bit_diff_flip_zero and num_bit_diff_flip_ones to 0 each time through the loop, since these are supposed to be the totals across all files.
# Read the reference file once and keep only the lines that take part in the
# comparison (the first del_lines lines are skipped).
with open('files/mull-original-readback.rbd', 'r') as original:
    original_lines = list(original)[del_lines:num_lines_nonbram]

for filename in files:
    # Open the file named by the loop variable, taking the same slice of lines.
    with open(filename, 'r') as f:
        lines = list(f)[del_lines:num_lines_nonbram]
    for lineoriginal, line1 in zip(original_lines, lines):
        for k in range(bits_perline):
            if lineoriginal[k] == line1[k]:
                continue
            # The characters differ: classify the flip by the reference value.
            if lineoriginal[k] == "0":
                num_bit_diff_flip_zero += 1
            else:
                num_bit_diff_flip_ones += 1

# The counters are never reset, so this is the total over all files.
total_bit_flips = num_bit_diff_flip_zero + num_bit_diff_flip_ones

Related

Unexpected end of data when zipping zip files in Python

Good day.
I wrote a little Python program to help me easily create .cbc files for Calibre, which is just a renamed .zip file with a text file called comics.txt for TOC purposes. Each chapter is another zip file.
The issue is that the last zip file zipped always has the error "Unexpected end of data". The file itself is not corrupt, if I unzip it and rezip it it works perfectly. Playing around it seems that the problem is that Python doesn't close the last zip file after zipping it, since I can't delete the last zip while the program is still running since it's still open in Python. Needless to say, Calibre doesn't like the file and fails to convert it unless I manually rezip the affected chapters.
The code is as follows, checking the folders for not-image files, zipping the folders, zipping the zips while creating the text file, and "changing" extension.
# Script as posted: zips chapter folders into "Chapter N.zip" archives, then
# adds those archives and a comics.txt listing to one zip named after the
# current directory, which is finally renamed to end in ".cbc".
import re, glob, os, zipfile, shutil, pathlib, gzip, itertools
Folders = glob.glob("*/")
items = len(Folders)
cn_list = []
cn_list_filtered = []
dirs_filtered = []
# Folder-name prefixes that introduce a chapter number.
ch_id = ["c", "Ch. "]
subdir_im = []
total = 0
# Names of the immediate sub-directories of the current directory.
Dirs = next(os.walk('.'))[1]
# Count the entries whose name contains .png or .jpg in each sub-directory.
for i in range(0, len(Dirs)):
# NOTE(review): `items` was assigned len(Folders) above and is rebound here.
for items in os.listdir("./" + Dirs[i]):
if items.__contains__('.png') or items.__contains__('.jpg'):
total+=1
else:
print(items + " not an accepted format.")
subdir_im.append(total)
total = 0
# Extract the chapter number from each folder name, trying the "c" prefix
# before the "Ch. " prefix and the two-part number form before the plain one.
for fname in Folders:
if re.search(ch_id[0] + r'\d+' + r'[\S]' + r'\d+', fname):
cn = re.findall(ch_id[0] + "(\d+[\S]\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[0] + r'\d+', fname):
cn = re.findall(ch_id[0] + "(\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[1] + r'\d+' + '[\S]' + r'\d+', fname):
cn = re.findall(ch_id[1] + "(\d+[\S]\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[1] + r'\d+', fname):
cn = re.findall(ch_id[1] + "(\d+)", fname)[0]
cn_list.append(cn)
else:
print('Warning: File found without proper filename format.')
# De-duplicate the chapter numbers and sort them (they are strings).
cn_list_filtered = set(cn_list)
cn_list_filtered = sorted(cn_list_filtered)
cwd = os.getcwd()
Dirs = Folders
subdir_zi = []
total = 0
# For every chapter number, zip the images of each folder whose name contains
# the prefix, the number and a following space.
for i in range(0, len(cn_list_filtered)):
for folders in Dirs:
if folders.__contains__(ch_id[0] + cn_list_filtered[i] + " ")\
or folders.__contains__(ch_id[1] + cn_list_filtered[i] + " "):
print('Zipping folder ', folders)
namezip = "Chapter " + cn_list_filtered[i] + ".zip"
# NOTE(review): current_zip is opened in append mode and no close() call on it
# appears anywhere in this script.
current_zip = zipfile.ZipFile(namezip, "a")
for items in os.listdir(folders):
if items.__contains__('.png') or items.__contains__('.jpg'):
current_zip.write(folders + "/" + items, items)
total+=1
subdir_zi.append(total)
total = 0
# Compare the per-folder image counts with the per-zip member counts.
print('Folder contents in order:', subdir_im, ' Total:', sum(subdir_im))
print("Number of items per zip: ", subdir_zi, ' Total:', sum(subdir_zi))
if subdir_im == subdir_zi:
print("All items in folders have been successfully zipped")
else:
print("Warning: File count in folders and zips do not match. Please check the affected chapters")
# Add every *.zip in the current directory to the final archive.
zips = glob.glob("*.zip")
namezip2 = os.path.basename(os.getcwd()) + ".zip"
zipfinal = zipfile.ZipFile(namezip2, "a")
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
# Build the comics.txt contents: one "<zip name>:<title>" line per chapter.
Data = []
for i in range (0,len(cn_list_filtered),1):
Datai = ("Chapter " + cn_list_filtered[i] + ".zip" + ":Chapter " + cn_list_filtered[i] + "\r\n")
Data.append(Datai)
Dataok = ''.join(Data)
# NOTE(review): namezip2 is opened a second time here while zipfinal, which
# refers to the same path, has not been closed yet.
with zipfile.ZipFile(namezip2, 'a') as myzip:
myzip.writestr("comics.txt", Dataok)
zipfinal.close()
os.rename(namezip2, namezip2 + ".cbc")
os.system("pause")
I am by no means a programmer, that is just a Frankenstein monster code I eventually managed to put together by checking threads, but this last issue has me stumped.
Some solutions I tried are:
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
zips[i].close()
Fails with:
zips[i].close()
AttributeError: 'str' object has no attribute 'close'
and:
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
zips[len(zips)].close()
Fails with:
zips[len(zips)].close()
IndexError: list index out of range
Thanks for the help.
This solved my issue:
def generate_zip(file_list, file_name=None):
    """Build a DEFLATE-compressed zip archive from in-memory entries.

    Args:
        file_list: iterable of ``(archive_name, data)`` pairs; each pair is
            handed to ``ZipFile.writestr``.
        file_name: path the finished archive is written to. When ``None``
            nothing is written to disk.

    Returns:
        The bytes of the finished archive.
    """
    import io
    import zipfile

    zip_buffer = io.BytesIO()
    # The archive must be closed before its bytes are used: closing is what
    # writes the final records, and the ``with`` block guarantees it happens
    # even if an entry fails.
    with zipfile.ZipFile(zip_buffer, mode="w",
                         compression=zipfile.ZIP_DEFLATED) as zf:
        for name, data in file_list:
            print(f"Filename: {name}\nData: {data}")
            zf.writestr(name, data)

    archive = zip_buffer.getvalue()
    if file_name is not None:
        # The ``with`` statement closes the output file; no explicit close().
        with open(file_name, 'wb') as f:
            f.write(archive)
    return archive

Issue with Python Not Writing Lines

So I'm writing a script to take large csv files and divide them into chunks. These files each have lines formatted accordingly:
01/07/2003,1545,12.47,12.48,12.43,12.44,137423
Where the first field is the date. The next field to the right is a time value. These data points are at minute granularity. My goal is to fill files with 8 days worth of data, so I want to write all the lines from a file for 8 days worth into a new file.
Right now, I'm only seeing the program write one line per "chunk," rather than all the lines. Code shown below and screenshots included showing how the chunk directories are made and the file as well as its contents.
For reference, day 8 shown and 1559 means it stored the last line right before the mod operator became true. So I'm thinking that everything is getting overwritten somehow since only the last values are being stored.
# Script as posted: for every name listed in filelist.txt, reads <name>.csv
# line by line and writes lines into "Divided Data/Chunk <n>/<name><n>.csv".
import os
import time
CWD = os.getcwd()
WRITEDIR = CWD+"/Divided Data/"
if not os.path.exists(WRITEDIR):
os.makedirs(WRITEDIR)
FILEDIR = CWD+"/SP500"
# All relative paths below are resolved inside the SP500 directory.
os.chdir(FILEDIR)
valid_files = []
# NOTE(review): no close() call on filelist appears in this script.
filelist = open("filelist.txt", 'r')
for file in filelist:
cur_file = open(file.rstrip()+".csv", 'r')
cur_file.readline() #skip first line
prev_day = ""
count = 0
chunk_count = 1
for line in cur_file:
# Characters 3-4 of the line; presumably the day field of the leading date.
day = line[3:5]
WDIR = WRITEDIR + "Chunk"
cur_dir = os.getcwd()
path = WDIR + " "+ str(chunk_count)
if not os.path.exists(path):
os.makedirs(path)
# count goes up each time the day field differs from the previous line's.
if(day != prev_day):
# print(day)
prev_day = day
count += 1
#Create new directory
# chunk_count advances when count is a multiple of 8.
if(count % 8 == 0):
chunk_count += 1
PATH = WDIR + " " + str(chunk_count)
if not os.path.exists(PATH):
os.makedirs(PATH)
print("Chunk count: " + str(chunk_count))
print("Global count: " + str(count))
temp_path = WDIR +" "+str(chunk_count)
os.chdir(temp_path)
fname = file.rstrip()+str(chunk_count)+".csv"
# NOTE(review): the output file is opened in 'w' mode every time a line is written.
with open(fname, 'w') as f:
try:
f.write(line + '\n')
# NOTE(review): bare except - any exception raised by the write is reported
# with the same message.
except:
print("Could not write to file. \n")
# Return to the directory that was current before the write.
os.chdir(cur_dir)
if(chunk_count >= 406):
continue
cur_file.close()
# count += 1
The answer is in the comment but let me give it here so that your question is answered.
You're opening your file in 'w' mode which overwrites all the previously written content. You need to open it in the 'a' (append) mode:
fname = file.rstrip()+str(chunk_count)+".csv"
with open(fname, 'a') as f:
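See more on the open function and its modes in the Python documentation. It describes 'w' as the mode "for writing (truncating the file if it already exists)", and says the same about the related 'w+' mode:
note that 'w+' truncates the file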

How to open multiple directories and read the files inside

I need to compare text files with two other files, and then get the result as an output. So I taught myself enough to write the following script which works fine and compares all of the files in a specific directory, however I have multiple directories with text files inside. What I need is to compare all of the text files in all of the directories and have an output file for each directory. Is there a way to improve the code below to do that:
# Script as posted: for every journal*.txt file in the current directory,
# counts the lines that occur as substrings of lines in chem.txt and bio.txt.
import glob
import os
import sys
# Everything printed below goes to citation.txt instead of the console.
sys.stdout = open("citation.txt", "w")
for filename in glob.glob('journal*.txt'):
# NOTE(review): f1, f2 and f3 are opened here and no close() call on them
# appears in this script.
f1 = open(filename,'r')
f1data = f1.readlines()
f2 = open('chem.txt')
f2data = f2.readlines()
f3 = open('bio.txt')
f3data = f3.readlines()
chem = 0
bio = 0
total = 0
for line1 in f1data:
i = 0
# `in` on two strings is a substring test: line1 (including its line ending)
# must appear inside line2.
for line2 in f2data:
if line1 in line2:
i+=1
total+=1
chem+=1
if i > 0:
print 'chem ' + line1 + "\n"
# NOTE(review): i is not reset between the chem loop and the bio loop.
for line3 in f3data:
if line1 in line3:
i+=1
total+=1
bio+=1
if i > 0:
print 'bio ' + line1 + "\n"
print filename
print total
print 'bio ' + str(bio)
# NOTE(review): `kimya` is not assigned anywhere in this script; the counter
# maintained above is named `chem`.
print 'chem ' + str(kimya)
Thanks in advance!
Just use a list of directories and a for loop
directories = ['folder1','folder2',...]
for i,folder in enumerate(directories):
sys.stdout = open("citation{}.txt".format(i), "w")
...
[put the rest of your code here]
This will name different output files as citation0.txt but you can do other formats if you want, just by changing how that name is declared.
And if you want each citation.txt to go into the actual directory, just change your code to this:
for folder in directories:
citation = os.path.join(folder, "citation.txt")
sys.stdout = open(citation, "w")
This will create a path for a new citation.txt file with each directory as the loop runs. Make sure to import os at the start of your file, if you haven't already.

python multiple inputs and multiple outputs

I have written a script in python, which works on a single file. I couldn't find an answer to make it run on multiple files and to give output for each file separately.
# Script as posted: reads a.sam and, for lines that do not start with '#',
# parses the sixth whitespace-separated field (samlist[5]) and writes one
# tab-separated record per matched I/D entry to a.out.
# NOTE(review): `re` is used below but no `import re` appears in the lines shown.
out = open('/home/directory/a.out','w')
# NOTE(review): no close() call on infile appears in this script.
infile = open('/home/directory/a.sam','r')
for line in infile:
if not line.startswith('#'):
samlist = line.strip().split()
# NOTE(review): this parses as `'I' or ('D' in samlist[5])`; a non-empty
# string literal is truthy, so the condition is always true.
if 'I' or 'D' in samlist[5]:
match = re.findall(r'(\d+)I', samlist[5]) # remember to chang I and D here aswell
intlist = [int(x) for x in match]
## if len(intlist) < 10:
for indel in intlist:
if indel >= 10:
## print indel
###intlist contains lengths of insertions in for each read
#print intlist
read_aln_start = int(samlist[3])
indel_positions = []
# Each match yields: a number, the letter I or D, an optional second number
# and an optional following letter.
for num1, i_or_d, num2, m in re.findall('(\d+)([ID])(\d+)?([A-Za-z])?', samlist[5]):
if num1:
read_aln_start += int(num1)
if num2:
read_aln_start += int(num2)
indel_positions.append(read_aln_start)
#print indel_positions
out.write(str(read_aln_start)+'\t'+str(i_or_d) + '\t'+str(samlist[2])+ '\t' + str(indel) +'\n')
out.close()
I would like my script to take multiple files with names like a.sam, b.sam, c.sam and for each file give me the output : aout.sam, bout.sam, cout.sam
Can you please pass me either a solution or a hint.
Regards,
Irek
Loop over filenames.
input_filenames = ['a.sam', 'b.sam', 'c.sam']
output_filenames = ['aout.sam', 'bout.sam', 'cout.sam']
# Pair every input name with its output name and process one pair at a time.
for infn, outfn in zip(input_filenames, output_filenames):
    # The with-statement closes both files when the body finishes, so no
    # handle from one iteration is still open during the next.
    with open('/home/directory/{}'.format(outfn), 'w') as out, \
            open('/home/directory/{}'.format(infn), 'r') as infile:
        ...
UPDATE
Following code generate output_filenames from given input_filenames.
import os


def get_output_filename(fn):
    """Return `fn` with ``out`` inserted before its extension.

    For example ``a.sam`` becomes ``aout.sam``; a name without an extension
    simply gets ``out`` appended.
    """
    filename, ext = os.path.splitext(fn)
    return filename + 'out' + ext


input_filenames = ['a.sam', 'b.sam', 'c.sam']  # or glob.glob('*.sam')
# Build a real list: on Python 3 map() returns a one-shot iterator, which
# would be empty the second time it is looped over.
output_filenames = [get_output_filename(fn) for fn in input_filenames]
I'd recommend wrapping that script in a function, using the def keyword, and passing the names of the input and output files as parameters to that function.
def do_stuff_with_files(infile, outfile):
    """Process the file at path `infile` and write the result to path `outfile`.

    Args:
        infile: path of the file to read; it is opened read-only.
        outfile: path of the file to write; it is created or overwritten.
    """
    # Only the OUTPUT path is opened with 'w'. Opening the input path with
    # 'w' would truncate the data before it could be read.
    with open(infile, 'r') as source, open(outfile, 'w') as out:
        # the rest of your script: read lines from `source`, write to `out`
        pass
# the rest of your script
Now you can call this function for any combination of input and output file names.
do_stuff_with_files('/home/directory/a.sam', '/home/directory/a.out')
If you want to do this for all files in a certain directory, use the glob library. To generate the output filenames, just replace the last three characters ("sam") with "out".
import glob

# Both directory names end in "/" so that they can be joined to a file name
# by plain concatenation.
indir, outdir = '/home/directory/', '/home/directory/out/'
# glob.glob() is the documented API and returns each match with the
# directory prefix attached; strip the prefix to get bare file names.
files = [match[len(indir):] for match in glob.glob(indir + '*.sam')]
infiles = [indir + f for f in files]
# Replace the trailing "sam" with "out": "a.sam" -> "a.out".
outfiles = [outdir + f[:-3] + "out" for f in files]
for infile, outfile in zip(infiles, outfiles):
    do_stuff_with_files(infile, outfile)
The following script allows working with an input and output file. It will loop over all files in the given directory with the ".sam" extension, perform the specified operation on them, and output the results to a separate file.
import os

# Define the directory containing the files you are working with
path = '/home/directory'

# Get all the files in that directory with the desired
# extension (in this case ".sam")
files = [f for f in os.listdir(path) if f.endswith('.sam')]

# Loop over the files with that extension
for filename in files:
    # splitext() splits at the LAST dot, so a name such as "run.1.sam"
    # becomes "run.1out.sam" and keeps its real extension.
    stem, ext = os.path.splitext(filename)
    # Open the input file
    with open(os.path.join(path, filename), 'r') as infile:
        # Open the output file
        # NOTE(review): the output lands in the same directory, also ends in
        # ".sam" and is opened in append mode, so running the script a second
        # time picks up earlier outputs as inputs and appends to existing
        # output files.
        with open(os.path.join(path, stem + 'out' + ext), 'a') as outfile:
            # Loop over the lines in the input file
            for line in infile:
                # If a line in the input file can be characterized in a
                # certain way, write a different line to the output file.
                # Otherwise write the original line (from the input file)
                # to the output file
                if line.startswith('Something'):
                    outfile.write('A different kind of something')
                else:
                    outfile.write(line)
    # Note the absence of either a infile.close() or an outfile.close()
    # statement. The with-statement handles that for you

Python: Conditional logic if entry is file or directory not working

I have the following directory structure on my file system:
/home/myUser/
stuff_home/
fizz/
a.txt
b.txt
buzz/
1.pdf
widgets/
c.txt
2.pdf
3.pdf
4.pdf
I want to traverse stuff_home/ recursively and count the number of subdirectories, .txt files and .pdf documents it contains. I have written a small Python script:
# Script as posted: intended to count directories, .txt files and .pdf files
# under a root directory using three module-level counters.
import os
dirCnt = 0
txtCnt = 0
pdfCnt = 0
# NOTE(review): no call to main() appears in the lines shown.
def main():
get_counts("/home/myUser/stuff_home")
t = str(txtCnt)
p = str(pdfCnt)
d = str(dirCnt)
print "\nRESULTS\Text Files:\t" + t + "\nPDF Files:\t" + p + "\nDirectories:\t" + d + "\n\n"
# NOTE(review): get_counts never calls itself, so only the entries directly
# inside `root` are examined.
def get_counts(root):
contents = os.listdir(root)
for file in contents:
# NOTE(review): os.listdir() returns bare entry names; `file` is passed to
# isdir() without being joined to `root`.
if os.path.isdir(file):
# NOTE(review): the counters are assigned here without a `global` declaration.
dirCnt = dirCnt + 1
# NOTE(review): os.path.splitext() returns the extension with its leading dot
# (for example ".txt"), while the comparisons below use "txt" and "pdf".
elif os.path.splitext(file)[1] == "txt":
txtCnt = txtCnt + 1
elif os.path.splitext(file)[1] == "pdf":
pdfCnt = pdfCnt + 1
else:
print "Encountered unknown file: " + file
When I run this, I get no errors, but the script is clearly coded wrong. Here is the output I get:
Encountered unkown file: fizz
Encountered unkown file: buzz
Encountered unkown file: widgets
RESULTS
Text Files: 0
PDF Files: 0
Directories: 0
Anything jump out to you Pythonians out there? It looks like none of my logic (for detecting file vs. directory, as well as using splitext to grabs the file extension) is working here...thanks in advance!
This seems like a job for os.walk (if I understand correctly):
def count_pdf_txt(top):
    """Count the PDF files, text files and directories below `top`.

    The whole tree is walked recursively; `top` itself is not counted as a
    directory.

    Args:
        top: path of the directory to walk.

    Returns:
        A tuple ``(npdf, ntxt, ndir)``.
    """
    npdf = 0
    ntxt = 0
    ndir = 0
    for root, dirs, files in os.walk(top):
        # Every directory appears exactly once in its parent's `dirs` list.
        ndir += len(dirs)
        for f in files:
            # Include the dot so that a name such as "mytxt" is not counted.
            if f.endswith('.txt'):  # use `splitext` if you like.
                ntxt += 1
            elif f.endswith('.pdf'):
                npdf += 1
            else:
                print("unknown")
    return npdf, ntxt, ndir
Note that your version gives a wrong result because of lines like:
pdfCnt = pdfCnt + 1
inside your get_counts function. Assigning to a name inside a function makes that name local to the function, so the statement can never update the global variable (and because the local name has no value yet when the right-hand side is evaluated, the line raises UnboundLocalError as soon as it actually runs). In order to have the function change the global variables, you need to declare them as global, e.g. global pdfCnt. However, the presence of a global keyword in your function should always make you think "there's got to be a better way to do this".

Categories

Resources