How to get file metadata from external drive in Python? - python

I wrote a Python script that collects file metadata (filename, creation date, creation time, last modified data, last modified time) from a file directory. However, when the directory is a path that is located in an external hard drive the script doesn't work. I can't figure out why.
Here is the code:
import os
from os.path import basename
import datetime
import time
def getSize(filename):
    """Return the size of *filename* in bytes.

    Also prints the full os.stat result for debugging.
    """
    st = os.stat(filename)
    print(st)  # debug: full stat tuple; parenthesized so it runs on Python 2 and 3
    return st.st_size
#get last modified date
def getMTime(filename):
    """Return the last-modification timestamp of *filename* (seconds since epoch)."""
    return os.path.getmtime(filename)
#get creation date
def getCTime(filename):
    """Return st_ctime of *filename* (creation time on Windows; metadata-change
    time on Unix)."""
    return os.path.getctime(filename)
#get data from directory
# NOTE(review): the raw string (r"...") is the fix for the external-drive
# failure -- without it "\0" and "\1" in the path are interpreted as escape
# sequences and the directory is never found.
MyDirectory = r"H:\0_tempfiles\150115_Portfolio\Work\Work\BarBackUp"
MyExtension = ".jpg"
#write to file
WorkingDirectory = "C:\\Users\\Admin\\Downloads\\demo\\"
MyTxtFile = WorkingDirectory + "fileData6.txt"
delim = ";"
# Text mode ('w'), not 'wb': we write str lines, which breaks on a binary
# handle under Python 3.
with open(MyTxtFile, 'w') as f:
    f.write(delim.join(["FILENAME", "FILESIZE", "mDATE", "mTIME",
                        "cDATE", "cTIME"]) + "\n")
    for root, dirs, files in os.walk(MyDirectory):
        for file in files:
            if file.endswith(MyExtension):
                #get File Name
                filename = os.path.join(root, file)
                MyFileName = basename(filename)
                #get File Size (approx. kilobytes)
                MyFileSize = getSize(filename) / 1000
                print(MyFileName + " >>> file size: " + str(MyFileSize) + "Kb")
                # The strftime pattern embeds the delimiter, so each value
                # below expands to "DATE;TIME" -- two columns in the output,
                # which is why 4 joined values fill the 6-column header.
                modTimeV2 = time.strftime("%Y/%d/%m;%I:%M:%S %p",
                                          time.localtime(getMTime(filename)))
                print("time modified: " + str(modTimeV2))
                creTime = time.strftime("%Y/%d/%m;%I:%M:%S %p",
                                        time.localtime(getCTime(filename)))
                print("time created: " + str(creTime))
                #write data to file
                entry = delim.join([str(MyFileName), str(MyFileSize),
                                    str(modTimeV2), str(creTime)]) + "\n"
                f.write(entry)
print("<<<<<<everything went fine>>>>>>")

Your code works fine for me. Your "MyDirectory" variable has escape characters in it. Try adding an r in front of the quotations:
MyDirectory = r"H:\0_tempfiles\150115_Portfolio\Work\Work\BarBackUp"
or
MyDirectory = "H:/0_tempfiles/150115_Portfolio/Work/Work/BarBackUp"
or
MyDirectory = "H:\\0_tempfiles\\150115_Portfolio\\Work\\Work\\BarBackUp"

Related

Why does this script move all files instead of just the newest files?

import os
import datetime
import shutil
source = 'C:/Users/user/Desktop/Files to move/'
destination = 'C:/Users/user/Desktop/Delete Logs/'
today = datetime.datetime.today() # Get current time
# Create log file with datestamp
# NOTE(review): logging_path is never defined in this snippet -- this line
# raises NameError unless it is set elsewhere.
file=open(logging_path+datetime.datetime.today().strftime('%d-%m-%Y')+'.txt', 'a')
# Move files
allfiles = os.listdir(source)
for f in allfiles:
    # Check last modified time
    # NOTE(review): arguments are swapped -- should be os.path.join(source, f).
    # Index 8 of the os.stat() result is st_mtime.
    t = os.stat(os.path.join(f, source))[8]
    # filetime is (mtime - now): negative for any file modified in the past,
    # so .days is -1 or lower for every existing file...
    filetime = datetime.datetime.fromtimestamp(t) - today
    # Is file less than a day old? If yes, move.
    # NOTE(review): ...which makes this condition true for EVERY file -- the
    # reason all files get moved instead of only the newest ones.
    if filetime.days <= 1:
        print(os.path.join(f, source), filetime.days)
        file.write(os.path.join(f, source) + ' created ' + str(-1 * filetime.days)+' day(s) ago has moved\n')
        shutil.move(source + f, destination + f)
Like the title says, I wrote this trying to move files less than a day old to a new location on a different disk but it moves all files in the location instead of the newest files. What am I doing wrong?
Your code works fine, you just need to add '-' sign before the last condition, because the output of filetime.days is -1, your condition will be :
if -filetime.days <= 1:
You have also swapped source and the filename in os.path.join just after the for loop; it should be os.path.join(source, f).
import os
import datetime
import shutil

source = 'f2/'
destination = 'f1/'
today = datetime.datetime.today()  # current time, used to compute file age

# Create log file with datestamp
# NOTE(review): logging_path must be defined by the surrounding program.
file = open(logging_path + datetime.datetime.today().strftime('%d-%m-%Y') + '.txt', 'a')

# Move files modified less than a day ago
allfiles = os.listdir(source)
for f in allfiles:
    # last modified time; index 8 of os.stat() is st_mtime
    t = os.stat(os.path.join(source, f))[8]
    # (mtime - now) is a negative timedelta for past files, so negate .days
    filetime = datetime.datetime.fromtimestamp(t) - today
    print(filetime.days, type(filetime.days), t, f)
    # Is file less than a day old? If yes, move.
    if -filetime.days <= 1:
        # fixed: join(source, f) here too -- the original still had the
        # arguments swapped in the print/log lines
        print(os.path.join(source, f), filetime.days)
        file.write(os.path.join(source, f) + ' created ' + str(-1 * filetime.days) + ' day(s) ago has moved\n')
        shutil.move(source + f, destination + f)
file.close()  # flush and release the log handle (the original leaked it)

Folder Compare using Python

I have to compare 100s of files in two folders(directories). There is a way in which we can derive the second file based on the first file and vice versa. I was asked to develop a script so that we can do this task quickly. Following were the requirements
a) HTML Report showing the differences
b) Txt file showing the basic information i.e., count,header,trailer info.
I have written the following script using python but after processing 14 files, there is no movement.
#Take two folders as input and compare the same files in them using pandas and sqlite
#!/usr/bin/env python3
# Path: folder_compare.py
import os
import pandas as pd
import sqlite3
import logging
import difflib
import sys
#function to write the message sent to the txt file passed as an argument
def write_to_txt(file_name, message):
    """Append *message* to Differences/<file_name>/<file_name>.txt, creating
    the directory (and file) on first use.

    file_name: base name of the compared file; also used as the report name.
    message:   text to append, including any trailing newline.
    """
    d_path = 'C:/Upgrade/File-Compare/Differences/' + os.path.basename(file_name)
    os.makedirs(d_path, exist_ok=True)
    file_path = d_path + '/' + file_name + '.txt'
    # 'a' mode creates the file if missing, so the original's explicit
    # exists-check/creation was redundant; the with-statement closes the
    # handle (the original leaked it).
    with open(file_path, 'a') as f:
        f.write(message)
def convert_windows_path_to_python(path):
    """Normalize a Windows-style path: every backslash becomes a forward slash."""
    return path.replace("\\", "/")
#get the folders as input from the user
# folder1/folder2 are the normalized (forward-slash) forms used below
fol1 = input("Enter the first folder path: ")
fol2 = input("Enter the second folder path: ")
folder1 = convert_windows_path_to_python(fol1)
folder2 = convert_windows_path_to_python(fol2)
#function to derive the second file name from the first file name
def get_file_name(file_name):
    """Map a folder1 filename to its folder2 counterpart by swapping the
    BZ1CV marker for BZ1DV; names without the marker pass through unchanged."""
    return file_name.replace('BZ1CV', 'BZ1DV')
#function to compare the two files and write the differences to an HTML report
def compare_files(file1, file2):
    """Write an HTML diff of file1 vs file2 to the Differences directory.

    Pairs where either file exceeds 10MB are compared with a pandas
    concat/drop_duplicates table (rows unique to one side); smaller pairs use
    difflib.HtmlDiff side-by-side output.
    """
    #read the two files
    f1 = pd.read_table(file1, encoding='unicode_escape', header=None)
    f2 = pd.read_table(file2, encoding='unicode_escape', header=None)
    #Get the filesize of the two files
    f1_size = os.path.getsize(file1)
    f2_size = os.path.getsize(file2)
    d_path = 'C:/Upgrade/File-Compare/Differences/' + os.path.basename(file1)
    os.makedirs(d_path, exist_ok=True)
    if f1_size > 10485760 or f2_size > 10485760:
        # rows present in exactly one of the two frames
        difference = pd.concat([f1, f2]).drop_duplicates(keep=False)
        difference.to_html(d_path + '_diff.html')
    else:
        # NOTE(review): HtmlDiff can be extremely slow on large/dissimilar
        # inputs -- a plausible cause of the reported stall after 14 files.
        # with-blocks close both handles (the original leaked them).
        with open(file1) as a, open(file2) as b:
            first_file_lines = a.readlines()
            second_file_lines = b.readlines()
        diff = difflib.HtmlDiff().make_file(first_file_lines, second_file_lines,
                                            file1, file2, context=True, numlines=0)
        with open(d_path + '_diff.html', 'w') as diff_report:
            diff_report.writelines(diff)
    logging.info('The files are compared successfully')
#Now start logging findings of the files
#Loop through the files in folder1 and compare each with its derived
#counterpart in folder2, logging header/rowcount/trailer info per file.
#(The stray closing brace that ended the original snippet was removed --
#it was a paste artifact and a syntax error.)
for file in os.listdir(folder1):
    file1 = folder1 + '/' + file
    file2 = folder2 + '/' + get_file_name(file)
    #if the second file does not exist in folder 2, then log the error and continue
    if not os.path.isfile(file2):
        logging.error('File not found: ' + os.path.basename(file2))
        continue
    f1 = pd.read_table(file1, encoding='unicode_escape', header=None)
    f2 = pd.read_table(file2, encoding='unicode_escape', header=None)
    base = os.path.basename(file1)  # hoisted: used by every write below
    # header = first row, trailer = last row of each frame
    write_to_txt(base, 'The headers of the first file are: ' + str(f1.iloc[0]) + '\n')
    write_to_txt(base, 'The headers of the second file are: ' + str(f2.iloc[0]) + '\n')
    write_to_txt(base, 'The rowcount of the first file(including header and trailer rows) is: ' + str(f1.shape[0]) + '\n')
    write_to_txt(base, 'The rowcount of the second file(including header and trailer rows) is: ' + str(f2.shape[0]) + '\n')
    write_to_txt(base, 'The trailer of the first file are: ' + str(f1.iloc[-1]) + '\n')
    write_to_txt(base, 'The trailer of the second file are: ' + str(f2.iloc[-1]) + '\n')
    compare_files(file1, file2)

Unexpected end of data when zipping zip files in Python

Good day.
I wrote a little Python program to help me easily create .cbc files for Calibre, which is just a renamed .zip file with a text file called comics.txt for TOC purposes. Each chapter is another zip file.
The issue is that the last zip file zipped always has the error "Unexpected end of data". The file itself is not corrupt, if I unzip it and rezip it it works perfectly. Playing around it seems that the problem is that Python doesn't close the last zip file after zipping it, since I can't delete the last zip while the program is still running since it's still open in Python. Needless to say, Calibre doesn't like the file and fails to convert it unless I manually rezip the affected chapters.
The code is as follows, checking the folders for not-image files, zipping the folders, zipping the zips while creating the text file, and "changing" extension.
# NOTE(review): several of these imports (shutil, pathlib, gzip, itertools)
# appear unused in this snippet.
import re, glob, os, zipfile, shutil, pathlib, gzip, itertools
Folders = glob.glob("*/")          # chapter folders in the working directory
items = len(Folders)
cn_list = []                       # raw chapter numbers found in folder names
cn_list_filtered = []
dirs_filtered = []
ch_id = ["c", "Ch. "]              # the two recognized chapter-name prefixes
subdir_im = []                     # image count per folder
total = 0
Dirs = next(os.walk('.'))[1]
# Count the images in each folder, warning about non-image files.
for i in range(0, len(Dirs)):
    for items in os.listdir("./" + Dirs[i]):
        if items.__contains__('.png') or items.__contains__('.jpg'):
            total+=1
        else:
            print(items + " not an accepted format.")
    subdir_im.append(total)
    total = 0
# Extract the chapter number from each folder name ("c12.5", "Ch. 12", ...).
for fname in Folders:
    if re.search(ch_id[0] + r'\d+' + r'[\S]' + r'\d+', fname):
        cn = re.findall(ch_id[0] + "(\d+[\S]\d+)", fname)[0]
        cn_list.append(cn)
    elif re.search(ch_id[0] + r'\d+', fname):
        cn = re.findall(ch_id[0] + "(\d+)", fname)[0]
        cn_list.append(cn)
    elif re.search(ch_id[1] + r'\d+' + '[\S]' + r'\d+', fname):
        cn = re.findall(ch_id[1] + "(\d+[\S]\d+)", fname)[0]
        cn_list.append(cn)
    elif re.search(ch_id[1] + r'\d+', fname):
        cn = re.findall(ch_id[1] + "(\d+)", fname)[0]
        cn_list.append(cn)
    else:
        print('Warning: File found without proper filename format.')
cn_list_filtered = set(cn_list)    # dedupe, then sort chapter numbers
cn_list_filtered = sorted(cn_list_filtered)
cwd = os.getcwd()
Dirs = Folders
subdir_zi = []                     # zipped-image count per chapter
total = 0
# Zip each matching folder's images into "Chapter <n>.zip".
for i in range(0, len(cn_list_filtered)):
    for folders in Dirs:
        if folders.__contains__(ch_id[0] + cn_list_filtered[i] + " ")\
        or folders.__contains__(ch_id[1] + cn_list_filtered[i] + " "):
            print('Zipping folder ', folders)
            namezip = "Chapter " + cn_list_filtered[i] + ".zip"
            # BUG (the reported "unexpected end of data"): this ZipFile is
            # never closed, so the last chapter's central directory is never
            # flushed to disk before the file is re-read below. Use a
            # with-statement or call current_zip.close() after the inner loop.
            current_zip = zipfile.ZipFile(namezip, "a")
            for items in os.listdir(folders):
                if items.__contains__('.png') or items.__contains__('.jpg'):
                    current_zip.write(folders + "/" + items, items)
                    total+=1
    subdir_zi.append(total)
    total = 0
print('Folder contents in order:', subdir_im, ' Total:', sum(subdir_im))
print("Number of items per zip: ", subdir_zi, ' Total:', sum(subdir_zi))
if subdir_im == subdir_zi:
    print("All items in folders have been successfully zipped")
else:
    print("Warning: File count in folders and zips do not match. Please check the affected chapters")
# Bundle the chapter zips plus a comics.txt TOC into one archive.
zips = glob.glob("*.zip")
namezip2 = os.path.basename(os.getcwd()) + ".zip"
zipfinal = zipfile.ZipFile(namezip2, "a")
for i in range(0, len(zips), 1):
    zipfinal.write(zips[i],zips[i])
Data = []
for i in range (0,len(cn_list_filtered),1):
    Datai = ("Chapter " + cn_list_filtered[i] + ".zip" + ":Chapter " + cn_list_filtered[i] + "\r\n")
    Data.append(Datai)
Dataok = ''.join(Data)
# NOTE(review): a second handle (myzip) is opened on the archive while
# zipfinal is still open on the same file -- appending through two handles is
# fragile; close zipfinal first, or do all writes through one handle.
with zipfile.ZipFile(namezip2, 'a') as myzip:
    myzip.writestr("comics.txt", Dataok)
zipfinal.close()
os.rename(namezip2, namezip2 + ".cbc")
os.system("pause")
I am by no means a programmer, that is just a Frankenstein monster code I eventually managed to put together by checking threads, but this last issue has me stumped.
Some solutions I tried are:
for i in range(0, len(zips), 1):
    zipfinal.write(zips[i],zips[i])
    # BUG: zips[i] is a filename string from glob.glob, not a ZipFile object,
    # hence "AttributeError: 'str' object has no attribute 'close'".
    zips[i].close()
Fails with:
zips[i].close()
AttributeError: 'str' object has no attribute 'close'
and:
for i in range(0, len(zips), 1):
    zipfinal.write(zips[i],zips[i])
    # BUG: len(zips) is one past the last valid index, so this raises
    # IndexError -- and the elements are strings anyway (see previous attempt),
    # so .close() would still fail even with a valid index.
    zips[len(zips)].close()
Fails with:
zips[len(zips)].close()
IndexError: list index out of range
Thanks for the help.
This solved my issue:
def generate_zip(file_list, file_name=None):
    """Build a zip archive in memory from (name, data) pairs and write it to
    *file_name*.

    Closing the ZipFile BEFORE reading the buffer is what finalizes the
    archive (writes the central directory); skipping it produces exactly the
    "unexpected end of data" truncation described in the question. The
    with-statements replace the garbled **zf.close()** line and the redundant
    manual f.close() from the original.
    """
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for file in file_list:
            print(f"Filename: {file[0]}\nData: {file[1]}")
            zf.writestr(file[0], file[1])
    with open(file_name, 'wb') as f:
        f.write(zip_buffer.getvalue())

How to use a CFS_Config textfile to create path directories to auto generate text files in python?

Below is the data in CFS_Config.txt. What this textfile does is to know where the documents have stored and to avoid hardcodes in the program. For instance, if the program is moved to other environment, we only need to change the directory paths in the CFS_Config.txt file.
Folder Path = ../dataprep/source_documents
ED Notes name = ED Notes
ED Notes output = ../dataprep/ED_Notes
This below codes in a python file what it actually does is to read configuration from the CFS_Config.txt mentioned earlier and also to do an auto generated textfile.
The problem encountered is that an error says the ../dataprep/ED_Notes path was not found. Please take a look at the code; if more code is needed I will try my best to provide it, thanks!
from preprocessing import ednotes_extractor
import os
def read_config():
    """Parse ..\\CFS_Config.txt and return (root_dir, ednotes_name, ednotes_output).

    Expected line order: "Folder Path = ...", "ED Notes name = ...",
    "ED Notes output = ..."; a .txt extension is appended to the output path.
    Values for missing/mismatched lines stay "".
    """
    root_dir = ""
    ednotes_name = ""
    ednotes_output = ""
    # raw string so the backslash is not treated as an escape sequence; the
    # with-block closes the handle (the original leaked it)
    with open(r"..\CFS_Config.txt", "r") as cfs_config_txt:
        file_list = cfs_config_txt.readlines()
    if "Folder Path = " in file_list[0]:
        root_dir = file_list[0].replace("Folder Path = ", "").replace("\n", "")
    if "ED Notes name = " in file_list[1]:
        ednotes_name = file_list[1].replace("ED Notes name = ", "").replace("\n", "")
    if "ED Notes output = " in file_list[2]:
        # append .txt first, then strip the newline -- same net result as the
        # original sequence of operations
        ednotes_output = file_list[2].replace("ED Notes output = ", "") + ".txt"
        ednotes_output = ednotes_output.replace("\n", "")
    return root_dir, ednotes_name, ednotes_output
def convert_txt(choices):
    """Generate the ED Notes text file from every document in the configured
    source folder (choices == 1); choices == 2 is a placeholder.

    Output format: a "cat_id|content" header followed by one extracted item
    per line. The output path comes from read_config(); note it is relative
    (../dataprep/ED_Notes.txt), so the parent directory must already exist
    relative to the current working directory -- TODO confirm, this matches
    the "path was not found" error reported in the question.
    """
    root_dir, ednotes_name, ednotes_output = read_config()
    if choices == 1:
        # one handle for header and rows: the original opened the file with
        # 'w', then re-opened it with 'a' on EVERY loop iteration and never
        # closed any of the handles
        with open(ednotes_output, 'w', encoding='utf-8') as text_file:
            text_file.write("cat_id|content\n")
            for filename in os.listdir(root_dir):
                source_directory = root_dir + '/' + filename
                arr = ednotes_extractor.get_ednotes(source_directory)
                for item in arr:
                    text_file.write("%s\n" % item)
    elif choices == 2:
        print("Hi")

Python: downloading s3 file based name and date

I'm trying to Pull the file from s3 based on id and date of the filename:
Naming Convention:
The naming convention are as follows:
**
ID_NAME_DATE.csv : filename follow that same pattern
example : 9919USEN_File_20180216.csv
example : 9919GBEN_File_20180211.csv
**
Code:
import boto3
import re
def downloadFiletest():
    #connect to s3
    client = boto3.resource(u's3', aws_access_key_id=u'KEY',
                            aws_secret_access_key=u'TOKEN')
    #used for downloading
    s3 = boto3.client(u's3', aws_access_key_id=u'KEY',
                      aws_secret_access_key=u'TOKEN')
    dateIdReg = '[0-9]{8}'
    # NOTE(review): 'date' is undefined here -- this line raises NameError;
    # presumably dateIdReg was meant. This is why the function stops before
    # ever reaching the loop below.
    dateSuffix = re.compile(date)
    print (u"= S3 Client Connected =")
    # configure s3 bucket
    bucket = client.Bucket(u'us-eu-Bucket')
    b_folder = "/folder/example/"
    c_folder = b_folder.lower() + '/'
    files_not_found = True
    for cList in bucket.objects.filter(Prefix=b_folder):
        cFiles= cList.key
        print ('file : ', cFiles)
        # NOTE(review): this iterates the key string one CHARACTER at a time,
        # so fileId is a single character -- the ID/date matching below can
        # never behave as intended.
        for fileId in cFiles.lower():
            files_not_found = False
            f = fileId.rstrip()
            print(f)
            fileidreg= '[0-9]{4}[a-zA-Z]{4}'
            FileID = re.compile(fileidreg)
            if FileID.match(f) and dateSuffix.match(f):
                print(u'cList.key.lower(): ', cList.key.lower())
                old_file = cList.key
                dot_index = old_file.find(u'.')
                print (u'old dot file name: ', dot_index)
                file_ext = old_file[dot_index:]
                # NOTE(review): dateSuffix is a compiled pattern object, so
                # concatenating it into a str raises TypeError; cVal below is
                # also undefined in this snippet.
                cfile = fileId + '_file_' + dateSuffix + file_ext
                tmp_path = "/tmp/folder/" + cfile
                b_path = cVal + cfile
                print (u'b path : ', b_path)
                s3.download_file("us-eu-Bucket", b_path, tmp_path)
                print ("TEMP PATH: ", tmp_path)
    if files_not_found:
        print("ALERT", "No file in {0}/{1}".format(bucket, b_folder))
downloadFiletest()
Error:
It Skips over for fileId in cFiles.lower(): and closes the script.
Goal:
Pull file from S3 and Download it to tmp_path to be used as desired.
When pulling file i'd like the script to pick file based on ID and Date. For instance:
Rule: Pseudo:
If S3 has file 9919USEN_File_20180216.csv and 9919USEN_File_20180217.csv then pick 9919USEN_File_20180217.csv to download. Also IF 991USEN_File_2018.csv in S3 then don't pick file as it doesn't match rule, fileidreg = '[0-9]{4}[a-zA-Z]{4}' and dateIdReg = '[0-9]{8}'.
Rule: Visual:
9919USEN_File_20180217.csv > 9919USEN_File_20180216.csv [due to date]
9919USEN_File_20180217.csv > 991USEN_File_2018.csv [Due to Incorrect ID and Date]
Solution
The issue was the way it was structured. I've reorganized it and put it inside a try/except conditional loop. I've also used fileIDPrefix.search instead of fileIDPrefix.match, since match only checks the start of the string, which wasn't appropriate for this task.
final solution.
import boto3
import re
#connect to s3
# NOTE(review): hard-coded credentials -- prefer environment variables or an
# AWS profile; never commit real keys.
client = boto3.resource(u's3', aws_access_key_id=u'KEY',
                        aws_secret_access_key=u'TOKEN')
#used for downloading (download_file lives on the low-level client)
s3 = boto3.client(u's3', aws_access_key_id=u'KEY',
                  aws_secret_access_key=u'TOKEN')
def downloadFiletest():
    """Download the first object under *folder* whose key carries a valid
    8-character ID and 8-digit date, returning
    (filename, tmp_path, fileID, fileDate).

    Fixes over the posted version: the NameErrors it raised as written
    (dates -> date, cList -> cuList, cu -> filenames,
    fileDateSuffix -> dateSuffix) and the Python-2 `print >>` statements,
    which are a SyntaxError under Python 3.
    """
    import sys  # local import: only used on the error path below
    date = '[0-9]{8}'                   # fileDate regex
    dateSuffix = re.compile(date)       # checks the date portion of the key
    reg = '[0-9]{4}[a-zA-Z]{4}'         # fileID regex
    fileIDPrefix = re.compile(reg)      # checks the ID portion of the key
    folder = u"/folder/example/"        # S3 prefix to scan
    # NOTE(review): bucketname must be defined by the surrounding program.
    bucket = client.Bucket(bucketname)
    try:
        for cuList in bucket.objects.filter(Prefix=folder):
            filenames = cuList.key      # full key of the candidate object
            print(filenames)
            # fixed character positions of the ID and the date inside the key
            fileID = filenames[33:41]
            fileDate = filenames[51:59]
            # lengths checked by the diagnostics below
            lenf = len(fileID)
            lenG = len(fileDate)
            old_file = cuList.key
            dot_index = old_file.find(u'.')
            file_ext = old_file[dot_index:]
            # proceed only when both the ID and the date patterns occur
            # anywhere in the key (search, not match -- see the note above)
            if fileIDPrefix.search(filenames) and dateSuffix.search(filenames):
                filename = fileID + u'_file_' + fileDate + file_ext
                tmp_path = "/tmp/mpcmt/" + filename
                file_path = folder + filename
                s3.download_file(bucketname, file_path, tmp_path)
                return filename, tmp_path, fileID, fileDate
            # diagnostics for keys that do not match the expected layout
            if dot_index > 59 or dot_index < 59:
                print('File has wrong fileID or Wrong Date')
            if lenG > 8 or lenG < 8:
                print('File has wrong fileDate Format')
            if lenf > 8 or lenf < 8:
                print('File has wrong fileID')
    except Exception as e:  # report and exit when listing/downloading fails
        print("ALERT", "No file in {0}/{1}".format(bucket, folder))
        print("No file in {0}/{1}".format(bucket, folder), file=sys.stderr)
        print("Exception: %s" % str(e), file=sys.stderr)
        sys.exit(1)
downloadFiletest()

Categories

Resources