Converting multiple gz file within subdirectories into csv - python

I have many subdirectories in my main directory and would like to write a script to unzip and convert all the files within it. If possible, I would also like to combine all the CSV within a single directory into a single CSV. But more importantly, I need help with my nested loop.
import gzip
import csv
import os
subdirlist = os.listdir('/home/user/Desktop/testloop')
subtotal = len(subdirlist)
subcounter = 0
for dirlist in subdirlist:
print "Working On " + dirlist
total = len(dirlist)
counter = 0
for dir in dirlist:
print "Working On " + dir
f = gzip.open('/' + str(subdirlist) + '/' + dir, 'rb')
file_content = f.read()
f.close()
print "25% Complete"
filename = '/' + str(subdirlist) + '/temp.txt'
target = open(filename, 'w')
target.write(file_content)
target.close()
print "50% Complete!"
csv_file = '/' + str(subdirlist) + '/' + str(dir) + '.csv'
in_txt = csv.reader(open(filename, "rb"), delimiter = '\t')
out_csv = csv.writer(open(csv_file, 'wb'))
out_csv.writerows(in_txt)
os.remove(filename)
os.remove('/' + str(subdirlist) + '/' + dir)
counter+=1
print str(counter) + "/" + str(total) + " " + str(dir) + " Complete!"
print "SubDirectory Converted!"
print str(subcounter) + "/" + str(subtotal) + " " + str(subdirlist) + " Complete!"
subcounter+=1
print "All Files Converted!"
Thanks in advance

To get lists of files and subdirectories, you can use os.walk. Below is an implementation I wrote to get all files (optionally, of certain type(s)) in arbitrarily nested subdirectories:
from os import walk, sep
from functools import reduce # in Python 3.x only
def get_filelist(root, extensions=None):
"""Return a list of files (path and name) within a supplied root directory.
To filter by extension(s), provide a list of strings, e.g.
get_filelist(root, ["zip", "csv"])
"""
return reduce(lambda x, y: x+y,
[[sep.join([item[0], name]) for name in item[2]
if (extensions is None or
name.split(".")[-1] in extensions)]
for item in walk(root)])

Related

Unexpected end of data when zipping zip files in Python

Good day.
I wrote a little Python program to help me easily create .cbc files for Calibre, which is just a renamed .zip file with a text file called comics.txt for TOC purposes. Each chapter is another zip file.
The issue is that the last zip file zipped always has the error "Unexpected end of data". The file itself is not corrupt, if I unzip it and rezip it it works perfectly. Playing around it seems that the problem is that Python doesn't close the last zip file after zipping it, since I can't delete the last zip while the program is still running since it's still open in Python. Needless to say, Calibre doesn't like the file and fails to convert it unless I manually rezip the affected chapters.
The code is as follows, checking the folders for not-image files, zipping the folders, zipping the zips while creating the text file, and "changing" extension.
import re, glob, os, zipfile, shutil, pathlib, gzip, itertools
Folders = glob.glob("*/")
items = len(Folders)
cn_list = []
cn_list_filtered = []
dirs_filtered = []
ch_id = ["c", "Ch. "]
subdir_im = []
total = 0
Dirs = next(os.walk('.'))[1]
for i in range(0, len(Dirs)):
for items in os.listdir("./" + Dirs[i]):
if items.__contains__('.png') or items.__contains__('.jpg'):
total+=1
else:
print(items + " not an accepted format.")
subdir_im.append(total)
total = 0
for fname in Folders:
if re.search(ch_id[0] + r'\d+' + r'[\S]' + r'\d+', fname):
cn = re.findall(ch_id[0] + "(\d+[\S]\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[0] + r'\d+', fname):
cn = re.findall(ch_id[0] + "(\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[1] + r'\d+' + '[\S]' + r'\d+', fname):
cn = re.findall(ch_id[1] + "(\d+[\S]\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[1] + r'\d+', fname):
cn = re.findall(ch_id[1] + "(\d+)", fname)[0]
cn_list.append(cn)
else:
print('Warning: File found without proper filename format.')
cn_list_filtered = set(cn_list)
cn_list_filtered = sorted(cn_list_filtered)
cwd = os.getcwd()
Dirs = Folders
subdir_zi = []
total = 0
for i in range(0, len(cn_list_filtered)):
for folders in Dirs:
if folders.__contains__(ch_id[0] + cn_list_filtered[i] + " ")\
or folders.__contains__(ch_id[1] + cn_list_filtered[i] + " "):
print('Zipping folder ', folders)
namezip = "Chapter " + cn_list_filtered[i] + ".zip"
current_zip = zipfile.ZipFile(namezip, "a")
for items in os.listdir(folders):
if items.__contains__('.png') or items.__contains__('.jpg'):
current_zip.write(folders + "/" + items, items)
total+=1
subdir_zi.append(total)
total = 0
print('Folder contents in order:', subdir_im, ' Total:', sum(subdir_im))
print("Number of items per zip: ", subdir_zi, ' Total:', sum(subdir_zi))
if subdir_im == subdir_zi:
print("All items in folders have been successfully zipped")
else:
print("Warning: File count in folders and zips do not match. Please check the affected chapters")
zips = glob.glob("*.zip")
namezip2 = os.path.basename(os.getcwd()) + ".zip"
zipfinal = zipfile.ZipFile(namezip2, "a")
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
Data = []
for i in range (0,len(cn_list_filtered),1):
Datai = ("Chapter " + cn_list_filtered[i] + ".zip" + ":Chapter " + cn_list_filtered[i] + "\r\n")
Data.append(Datai)
Dataok = ''.join(Data)
with zipfile.ZipFile(namezip2, 'a') as myzip:
myzip.writestr("comics.txt", Dataok)
zipfinal.close()
os.rename(namezip2, namezip2 + ".cbc")
os.system("pause")
I am by no means a programmer, that is just a Frankenstein monster code I eventually managed to put together by checking threads, but this last issue has me stumped.
Some solutions I tried are:
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
zips[i].close()
Fails with:
zips[i].close()
AttributeError: 'str' object has no attribute 'close'
and:
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
zips[len(zips)].close()
Fails with:
zips[len(zips)].close()
IndexError: list index out of range
Thanks for the help.
This solved my issue:
def generate_zip(file_list, file_name=None):
zip_buffer = io.BytesIO()
zf = zipfile.ZipFile(zip_buffer, mode="w", compression=zipfile.ZIP_DEFLATED)
for file in file_list:
print(f"Filename: {file[0]}\nData: {file[1]}")
zf.writestr(file[0], file[1])
**zf.close()**
with open(file_name, 'wb') as f:
f.write(zip_buffer.getvalue())
f.close()

Finding the latest file in each subdirectory

I have the a similar folder structure for which I need to pull the latest .jpg from each subdirectory:
+ C:\\myfiles
+ parentdir1
+ subdir1
+ somename1.jpg
+ somename2.jpg
+ ...
+ subdir2
+ somename3.jpg
+ somename4.jpg
+ ...
+ ...
+ parentdir2
+ subdir1
+ somename5.jpg
+ somename6.jpg
+ ...
+ subdir2
+ somename7.jpg
+ somename8.jpg
+ ...
+ ...
+ parentdir3
+ subdir1
+ somename9.jpg
+ somename10.jpg
+ ...
+ subdir2
+ somename11.jpg
+ somename12.jpg
+ ...
+ ...
+ ...
I don't know any of the names of the folders or files but I need to access the last 2 .jpg files in each subdir.
For the sake of making this simple, let's just assume I need to print the last 2 files created in the subdir.
I wrote a script that will search all subdir's in a given parentdir, but I actually need to go iterate through all parentdir's as well
import os
path = 'C:\\myfiles'
filelist = []
for i in range(len(os.listdir(path))):
subpath = path + '\\' + os.listdir(path)[i]
for root, dirs, files in os.walk(subpath):
for file in os.listdir(subpath):
filelist.append(os.path.join(root, file))
sorted_filelist = sorted(filelist, key=os.path.getctime)
print('the latest jpg file in ' + root + ' is: ' + sorted_filelist[-1])
print('the 2nd last jpg file in ' + root + ' is: ' + sorted_filelist[-2])
filelist.clear()
I think this will do what you want. Note that I sort the files by their last modification times, rather than their creation time because I think that's the way to determine which are "most recent".
import glob
import os
N_MOST_RECENT = 2
path = 'C:\\myfiles'
for entry in os.listdir(path):
subpath = os.path.join(path, entry)
if os.path.isdir(subpath):
for subentry in os.listdir(subpath):
subentrypath = os.path.abspath(os.path.join(subpath, subentry))
if os.path.isdir(subentrypath):
jpg_files = glob.iglob(os.path.join(subentrypath, '*.jpg'))
sorted_filenames = sorted(jpg_files, key=os.path.getmtime)
# Create list of filenames of the N most recent files.
most_recent = [os.path.split(name)[-1] # Extract filename from path.
for name in sorted_filenames[-N_MOST_RECENT:]]
print(f'{N_MOST_RECENT} most recent .jpg files in "{subentrypath}":\n'
f' {most_recent}')
Try iterating through the parent directory, and then through all the sub-directories, using os.listdir().
import os
parent_dir = 'path/to/parent/dir'
for subdir in os.listdir(parent_dir):
if not os.path.isdir(subdir):
continue
sorted_filelist = sorted(
[os.path.join(parent_dir, subdir, f) for f in os.listdir(subdir)
if os.path.splitext(f)[1] == '.jpg'],
key=os.path.getctime, reverse=True)
print(sorted_filelist[:2])

How to check if folder has suboflders and then list the directories with listdir()?

I want to write a file searching code where I don't know if the directory I'm searching in has subdirectories and I want to check that so I don't get an error like this:
[Error 267]The directory name is invalid: 'C:/Path/To/Directory'.
I wrote a code like this where if it finds the file it breaks and stopps the program but if not it goes down a layer and so on.
filename = raw_input('> ')
path = 'C:/Path/Of/Directory/You/Want/To/Search/In'
fldr = os.listdir(path)
for f in fldr:
p = path + '/' + f
sfldr = os.listdir(p)
if os.path.exists(p + '/' + filename):
print 'Found file!!', p + '/' + filename
break
else:
for sf in sfldr:
pp = p + '/' + sf
ssfldr = os.listdir(pp)
if os.path.exists(pp + '/' + filename):
print 'Found file!!', pp + '/' + filename
break
else:
for ssf in ssfldr:
ppp = pp + '/' + ssf
sssfldr = os.listdir(ppp)
if os.path.exists(ppp + '/' + filename):
print 'Found file!!', ppp + '/' + filename
break
The easy to notice error is that when the directory doesn't have 3 layers of subfolders the program just breaks and gives an error message.So if I could check if the folder has subfolders before entering it,that would be neat.
Use os.scandir(). Provides better speed over os.walk()
Link to docs here!
Alternatively use, glob
>>> from glob import glob
>>> paths = glob('*/')
>>> paths
['bin/', 'content/', 'include/', 'lib/', 'output/']
>>>

Python File Integrity Monitoring

I have written the following code and can't get it working and I can't see why. The code:
Reads list of target files
Loops through the directories
Runs MD5 hash on the file
Checks the activefile for previous md5 hashes for said file
If the file is new it will log it as new
If the log is existing but changed it will write the change and log the change
If not new and no change then do nothing
Here is the code:
import hashlib
import logging as log
import optparse
import os
import re
import sys
import glob
import shutil
def md5(fileName):
"""Compute md5 hash of the specified file"""
try:
fileHandle = open(fileName, "rb")
except IOError:
return
m5Hash = hashlib.md5()
while True:
data = fileHandle.read(8192)
if not data:
break
m5Hash.update(data)
fileHandle.close()
return m5Hash.hexdigest()
req = open("requested.txt")
for reqline in req:
reqName = reqline[reqline.rfind('/') + 1:len(reqline) - 1]
reqDir = reqline[0:reqline.rfind('/') + 1]
tempFile = open("activetemp.txt", 'w')
for name in glob.glob(reqDir + reqName):
fileHash = md5(name)
actInt = 0
if fileHash != None:
actFile = open("activefile.txt")
for actLine in actFile:
actNameDir = actLine[0:actLine.rfind(' : ')]
actHash = actLine[actLine.rfind(' : ') + 3:len(actLine) -1]
if actNameDir == name and actHash == fileHash:
tempFile.write(name + " : " + fileHash + "\n")
actInt = 1
print fileHash
print actHash
print name
print actNameDir
if actNameDir == name and actHash != fileHash:
fimlog = open("fimlog.txt", 'a')
tempFile.write(name + " : " + actHash + "\n")
actInt = 1
fimlog.write("FIM Log: The file " + name + " was modified: " + actHash + "\n")
if actInt == 0:
fimlog = open("fimlog.txt", 'a')
fimlog.write("FIM Log: The file " + name + " was created: " + fileHash + "\n")
tempFile.write(name + " : " + fileHash + "\n")
shutil.copyfile("activetemp.txt", "activefile.txt")
You never really described the problem, but one possible culprit is that you never close the tempFile (or the other files for that matter), so the file copy at the end may fail.

Avoid duplicate file names in a folder in Python

I want to download some files and save them in a folder and there may be some duplication in file names, so I want to avoid this to happen.
I think it needs an auto-naming system but now i don't know how to make it.
I used shutil and urllib2 to write my function.
This is a part of my code :
path = 'C:/DL/Others/'+filename+file_ext
with open(path, 'wb') as fp:
shutil.copyfileobj(req, fp)
As you know we can check that if a file exists or not by os.path.exists('path').
I wanna to rename my files and save them to avoid duplicated names using a pattern, for example by adding a number to file name.So if there was 4 files with same name, "fname", I want 4 files in this pattern :
fname - fname(1) - fname(2) - fname(3)
Something like this is probably reasonable:
path = 'c:/DL/Others/%s%s' % (filename, file_ext)
uniq = 1
while os.path.exists(path):
path = 'c:/DL/Others/%s_%d%s' % (filename, uniq, file_ext)
uniq += 1
If the original path doesn't exist you get no _1, but if it does exist it'll count up until it finds one that's free.
Track each filename's count as you create it:
fname_counts = {}
# ... whatever generates filename and file_ext goes here...
if filename + file_ext in fname_counts:
fname_counts[filename + file_ext] += 1
else:
fname_counts[filename + file_ext] = 0
# now check if it's a dupe when you create the path
if fname_counts[filename + file_ext]:
path = 'C:/DL/Others/%s_%s.%s' % (filename, fname_counts[filename + file_ext], file_ext)
else:
path = 'C:/DL/Others/' + filename + file_ext
Example at work with two duplicates ("test.txt"):
>>> filenames_and_exts = [('test', '.txt'), ('test', '.txt'), ('test2', '.txt'), ('test', '.cfg'), ('different_name', '.txt')]
>>> fname_counts = {}
>>> for filename, file_ext in filenames_and_exts:
if filename + file_ext in fname_counts:
fname_counts[filename + file_ext] += 1
else:
fname_counts[filename + file_ext] = 0
if fname_counts[filename + file_ext]:
path = 'C:/DL/Others/%s_%s%s' % (filename, fname_counts[filename + file_ext], file_ext)
else:
path = 'C:/DL/Others/' + filename + file_ext
print path
C:/DL/Others/test.txt
C:/DL/Others/test_1.txt
C:/DL/Others/test2.txt
C:/DL/Others/test.cfg
C:/DL/Others/different_name.txt

Categories

Resources