I have to compare hundreds of files in two folders (directories). There is a rule by which the second file name can be derived from the first file name and vice versa. I was asked to develop a script so that we can do this task quickly. These were the requirements:
a) an HTML report showing the differences
b) a txt file showing the basic information, i.e., row count and header/trailer info.
I have written the following script using Python, but after processing 14 files it makes no further progress.
#!/usr/bin/env python3
# Path: folder_compare.py
# Take two folders as input and compare the matching files in them using pandas and difflib
import os
import difflib
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)

# append a message to the .txt report named after the file passed as an argument
def write_to_txt(file_name, message):
    # one report folder per compared file under the Differences directory
    d_path = 'C:/Upgrade/File-Compare/Differences/' + os.path.basename(file_name)
    os.makedirs(d_path, exist_ok=True)
    file_path = d_path + '/' + file_name + '.txt'
    # append mode creates the file if it does not exist
    with open(file_path, 'a') as f:
        f.write(message)

def convert_windows_path_to_python(path):
    return path.replace("\\", "/")

# get the folders as input from the user
fol1 = input("Enter the first folder path: ")
fol2 = input("Enter the second folder path: ")
folder1 = convert_windows_path_to_python(fol1)
folder2 = convert_windows_path_to_python(fol2)

# derive the second file name from the first file name
def get_file_name(file_name):
    return file_name.replace('BZ1CV', 'BZ1DV')

# compare the two files and write the differences to an HTML report
def compare_files(file1, file2):
    # read the two files
    f1 = pd.read_table(file1, encoding='unicode_escape', header=None)
    f2 = pd.read_table(file2, encoding='unicode_escape', header=None)
    # get the file size of the two files
    f1_size = os.path.getsize(file1)
    f2_size = os.path.getsize(file2)
    d_path = 'C:/Upgrade/File-Compare/Differences/' + os.path.basename(file1)
    os.makedirs(d_path, exist_ok=True)
    # if either file is larger than 10 MB, diff via pandas concat and drop_duplicates
    if f1_size > 10485760 or f2_size > 10485760:
        # keep only the rows that appear in exactly one of the two files
        difference = pd.concat([f1, f2]).drop_duplicates(keep=False)
        difference.to_html(d_path + '_diff.html')
    # otherwise produce a side-by-side report with difflib.HtmlDiff
    else:
        first_file_lines = open(file1).readlines()
        second_file_lines = open(file2).readlines()
        diff = difflib.HtmlDiff().make_file(first_file_lines, second_file_lines, file1, file2, context=True, numlines=0)
        with open(d_path + '_diff.html', 'w') as diff_report:
            diff_report.writelines(diff)
    logging.info('The files are compared successfully')

# loop through the files in folder1 and compare each with its counterpart in folder2
for file in os.listdir(folder1):
    file1 = folder1 + '/' + file
    file2 = folder2 + '/' + get_file_name(file)
    # if the counterpart does not exist in folder2, log the error and continue
    if not os.path.isfile(file2):
        logging.error('File not found: ' + os.path.basename(file2))
        continue
    f1 = pd.read_table(file1, encoding='unicode_escape', header=None)
    f2 = pd.read_table(file2, encoding='unicode_escape', header=None)
    # write the first row (header) of each data frame to the .txt report
    f1_header = f1.iloc[0]
    f2_header = f2.iloc[0]
    write_to_txt(os.path.basename(file1), 'The header of the first file is: ' + str(f1_header) + '\n')
    write_to_txt(os.path.basename(file1), 'The header of the second file is: ' + str(f2_header) + '\n')
    # write the row count of each data frame to the .txt report
    f1_rowcount = f1.shape[0]
    f2_rowcount = f2.shape[0]
    write_to_txt(os.path.basename(file1), 'The rowcount of the first file (including header and trailer rows) is: ' + str(f1_rowcount) + '\n')
    write_to_txt(os.path.basename(file1), 'The rowcount of the second file (including header and trailer rows) is: ' + str(f2_rowcount) + '\n')
    # write the last row (trailer) of each data frame to the .txt report
    f1_footer = f1.iloc[-1]
    f2_footer = f2.iloc[-1]
    write_to_txt(os.path.basename(file1), 'The trailer of the first file is: ' + str(f1_footer) + '\n')
    write_to_txt(os.path.basename(file1), 'The trailer of the second file is: ' + str(f2_footer) + '\n')
    compare_files(file1, file2)
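A note on the stall: difflib.HtmlDiff is a plausible bottleneck here, since its running time grows steeply with the number of lines, so a single large file under the 10 MB threshold can look like a hang. A minimal sketch of a guard, assuming a byte-level filecmp check is acceptable, that would replace the final compare_files(file1, file2) call and skip the expensive diff when the two files are identical:

import filecmp

# cheap byte-for-byte comparison first; only run the expensive diff when needed
if filecmp.cmp(file1, file2, shallow=False):
    write_to_txt(os.path.basename(file1), 'The files are identical.\n')
else:
    compare_files(file1, file2)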
I'm new to Python and I've been trying to simplify a manual task that I do on a daily basis. I have a text file with a list of file names separated into groups by a blank line, like this:
fileName1
fileName2
fileName3
fileName4
fileName5
fileName6
fileName7
fileName8
fileName9
fileName10
fileName11
fileName12
All of these files are in one folder, and I want to find each group of files and move them into separate folders; the name of each new folder should be the name of the first file of its group.
I've done my research and I found how to do each step separately using the os and shutil modules, but I can't find a way to join them together into one clean script. Any help I can get from you guys will be awesome, thanks!!
Here's a little script that can do that.
I've made two assumptions:
The file with the list of files is stored in the same directory as source files
There is a blank line after the last file so the script can grab the last group
import os
from shutil import move
from itertools import groupby

#Where the files are originally stored
src_path = "C:\\temp\\src\\"
#Where the group folders will go
dest_path = "C:\\temp\\dest\\"

#Open up the file containing the list of files
with open(src_path + "list_of_files.txt") as txt:
    lines = txt.readlines() #Read the file

#Split the contents of the file into runs based on the newline character "\n"
i = (list(g) for _, g in groupby(lines, key='\n'.__ne__))
list_of_groups = [a + b for a, b in zip(i, i)]

#Iterate through each group
for group in list_of_groups:
    folder_name = dest_path + group[0].replace("\n", "") + "\\"
    if not os.path.exists(folder_name):
        #Create a folder for each group if it doesn't already exist
        os.mkdir(folder_name)
    #Move over each file in the group. The last element in the group is a newline character
    for file in group:
        if file != "\n":
            move(src_path + file.replace("\n", ""), folder_name + file.replace("\n", ""))
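To see what the groupby/zip pairing produces, here is a small self-contained illustration with a made-up lines list: the key '\n'.__ne__ splits the lines into alternating runs of file names and blank lines, and zipping the generator with itself stitches each run of names back together with its trailing blank line.

from itertools import groupby

# hypothetical file contents: two groups separated by blank lines
lines = ['fileName1\n', 'fileName2\n', '\n', 'fileName3\n', '\n']
i = (list(g) for _, g in groupby(lines, key='\n'.__ne__))
print([a + b for a, b in zip(i, i)])
# [['fileName1\n', 'fileName2\n', '\n'], ['fileName3\n', '\n']]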
When reading a file you can inspect each line's characters: a blank line consists of just the newline character, \n.
import os
import shutil

filepath1 = os.getcwd() # current working directory, where the source files live
filepath2 = "path\\to\\where\\you\\want\\dir"
filename = os.path.join(filepath1, "yourfilename.txt")
dirName = []
groupName = "filegroup"
idx = 1
newPath = ""
init = True

file = open(filename, "r")
for line in file:
    if init == True: # create the initial group folder
        newPath = os.path.join(filepath2, groupName + str(idx))
        os.mkdir(newPath)
        dirName.append(groupName + str(idx))
        init = False
    if line == "\n": # line in file is empty: start a new group folder
        idx += 1
        newName = groupName + str(idx)
        dirName.append(newName)
        newPath = os.path.join(filepath2, dirName[idx - 1])
        os.mkdir(newPath)
    else:
        # move the named file into the current group folder
        shutil.move(os.path.join(filepath1, line.rstrip()), os.path.join(newPath, line.rstrip()))
file.close()
I am trying to loop over folders and subfolders to access and read CSV files before transforming them into JSON. Here is the code I am working on:
cursor = conn.cursor()
try:
    # Specify the folder containing the needed files
    folderPath = 'C:\\Users\\myUser\\Desktop\\toUpload' # Or using input()
    fwdPath = 'C:/Users/myUser/Desktop/toUpload'
    for countries in os.listdir(folderPath):
        for sectors in os.listdir(folderPath + '\\' + countries):
            for file in os.listdir(folderPath + '\\' + countries + '\\' + sectors):
                data = pd.DataFrame()
                filename, _ext = os.path.splitext(os.path.basename(folderPath + '\\' + countries + '\\' + file))
                print(file + ' ' + filename + ' ' + sectors + ' ' + countries)
                data = pd.read_csv(file)
                # cursor.execute('SELECT * FROM SECTORS')
                # print(list(cursor))
finally:
    cursor.close()
    conn.close()
The following print line returns the file, its filename without the extension, and then the sectors and countries folder names:
print(file + ' ' + filename+ ' ' + sectors + ' ' + countries)
myfile.csv myfile WASHSector CTRYIrq
Now when it comes to reading the CSV, it takes a very long time, and at the end I get the following error:
[Errno 2] File myfile.csv does not exist
You need to give pd.read_csv the full path of the file, so change it to:
data = pd.read_csv(folderPath+'\\'+countries+'\\'+sectors + '\\' +file)
Before reading the csv file, you should compose the whole path to the file, otherwise, pandas won't be able to read that file.
import os
# ...
path = os.path.join(folderPath, countries, sectors, file)
data = pd.read_csv(path)
Also, instead of using three nested for loops, I recommend using the os.walk method. It will automatically recurse through directories:
>>> folderPath = 'C:\\Users\\myUser\\Desktop\\toUpload'
>>> for root, _, files in os.walk(folderPath):
...     for f in files:
...         pd.read_csv(os.path.join(root, f))
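If the countries and sectors names are still needed per file, they can be recovered from root; a sketch assuming the two-level countries/sectors layout from the question:

import os
import pandas as pd

folderPath = 'C:\\Users\\myUser\\Desktop\\toUpload'
for root, _, files in os.walk(folderPath):
    for f in files:
        if not f.endswith('.csv'):
            continue
        # e.g. root = 'C:\\Users\\myUser\\Desktop\\toUpload\\CTRYIrq\\WASHSector'
        parts = os.path.relpath(root, folderPath).split(os.sep)
        if len(parts) == 2:
            countries, sectors = parts
            data = pd.read_csv(os.path.join(root, f))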
I am using the Scala code below to rename a CSV file to a TXT file and move the TXT file. I need to translate this code to Python/PySpark but I am having problems (I am not well versed in Python). I would highly appreciate your help. Thanks in advance!
//Prepare to rename file
import org.apache.hadoop.fs._
import org.apache.hadoop.fs.{FileSystem, Path}
val fs = FileSystem.get(sc.hadoopConfiguration)
//Create variables
val table_name = dbutils.widgets.get("table_name") // getting table name
val filePath = "dbfs:/mnt/datalake/" + table_name + "/" // path where original csv file name is located
val fileName = fs.globStatus(new Path(filePath+"part*"))(0).getPath.getName // getting original csv file name
val newfilename = table_name + ".txt" // renaming and transforming csv into txt
val curatedfilePath = "dbfs:/mnt/datalake/" + newfilename // curated path + new file name
//Move to curated folder
dbutils.fs.mv(filePath + fileName, curatedfilePath)
Here is the Python code:
%python
# Create variables
table_name = dbutils.widgets.get("table_name") # getting table name
filePath = "dbfs:/mnt/datalake/" + table_name + "/" # path where original csv file name is located
newfilename = table_name + ".txt" # renaming and transforming csv into txt
curatedfilePath = "dbfs:/mnt/datalake/" + newfilename # curated path + new file name

# Save CSV file
df_curated.coalesce(1).replace("", None).write.mode("overwrite").save(filePath, format='csv', delimiter='|', header=True, nullValue=None)

# getting original csv file name
for f in filePath:
    if f[1].startswith("part-00000"):
        original_file_name = f[1]

# move to curated folder
dbutils.fs.mv(filePath + fileName, curatedfilePath)
I am having a problem with the "getting original csv file name" part. It throws the following error:
IndexError: string index out of range
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<command-3442953727364942> in <module>()
11 # getting original csv file name
12 for f in filePath:
---> 13 if f[1].startswith("part-00000"):
14 original_file_name = f[1]
15
IndexError: string index out of range
In the Scala code, you're using hadoop.fs.globStatus to list the part files from the folder where you save the DataFrame.
In Python you can do the same by accessing hadoop.fs via the JVM like this:
conf = sc._jsc.hadoopConfiguration()
Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
part_files = Path(filePath).getFileSystem(conf).globStatus(Path(filePath + "/part*"))
file_name = part_files[0].getPath().getName()
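With file_name recovered, the final move carries over from the Scala version directly, assuming the same filePath and curatedfilePath variables as in the question's snippet:

# move the part file to the curated folder under its new name
dbutils.fs.mv(filePath + file_name, curatedfilePath)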
My code currently unzips one zip archive, finds the file called file.txt, and extracts it. Now I need to unzip multiple archives that have the .zip extension, find the file called file.txt in each of them, and extract only that file, storing each file.txt in a separate folder named after the archive it came from. Thank you in advance for your time.
import re
import os
from zipfile import ZipFile

def pain():
    print("\t\t\tinput_files.zip has been unzipped")
    with ZipFile('input_files.zip', 'r') as zipObj:
        zipObj.extractall()
        listOfFileNames = zipObj.namelist()
        for fileName in listOfFileNames:
            if fileName.endswith('.txt'):
                zipObj.extract(fileName, 'storage')
    outfile = "output2.txt" #this will be the filename that the code will write to
    baconFile = open(outfile, "wt")
    file_name1 = "file.txt"
    #header row, printed to the shell and written to the output file
    print('Filename\tLine\tnumber of numbers\tstring separated by a comma\twhite space found\ttab found\tcarriage return found\n')
    baconFile.write('Filename\tLine\tnumber of numbers\tstring separated by a comma\twhite space found\ttab found\tcarriage return found\n')
    #for filename in os.listdir(os.getcwd() + "/input_files"):
    for filename in os.listdir(r'C:\Users\M29858\Desktop\TestPy\Version10\input_files'):
        with open("input_files/" + filename, 'r') as f:
            if file_name1 in filename:
                output_contents(filename, f, baconFile)
    baconFile.close() #closes the output file the code is writing to

def output_contents(filename, f, baconFile): #read the open file and report per-line stats
    index = 0
    for line in f:
        #create a list of the comma-separated values in the line
        content = line.split(',') #used to count the values before and after each comma
        whitespace_found = False
        tab_found = False
        false_string = "False (end of file)"
        carriage_found = false_string
        sigfigs = ""
        index += 1 #line counter
        if " " in line: #checking for whitespace
            whitespace_found = True
        if "\t" in line: #checking for tabs
            tab_found = True
        if '\n' in line: #checking for a newline at the end of the line
            carriage_found = True
        sigfigs = (','.join(str(len(g)) for g in re.findall(r'\d+\.?(\d+)?', line))) #counts the digits after the decimal point
        print(filename + "\t{0:<4}\t{1:<17}\t{2:<27}\t{3:17}\t{4:9}\t{5:21}"
              .format(index, len(content), sigfigs, str(whitespace_found), str(tab_found), str(carriage_found)))
        baconFile.write('\n')
        baconFile.write(filename + "\t{0:<4}\t{1:<17}\t{2:<27}\t{3:17}\t{4:9}\t{5:21}"
              .format(index, len(content), sigfigs, str(whitespace_found), str(tab_found), str(carriage_found)))

if __name__ == '__main__':
    pain()
#THIS WORKS
import glob
import os
from zipfile import ZipFile

def main():
    for fname in glob.glob("*.zip"): # get all the zip files
        with ZipFile(fname) as archive:
            # if there's no file.txt, ignore and go on to the next zip file
            if 'file.txt' not in archive.namelist():
                continue
            # make a new directory named after the zip file
            dirname = fname.rsplit('.', 1)[0]
            os.mkdir(dirname)
            # extract file.txt into the directory you just created
            archive.extract('file.txt', path=dirname)

if __name__ == '__main__':
    main()
import os, unicodecsv as csv

# open and store the csv file
IDs = {}
with open('labels.csv', 'rb') as csvfile:
    timeReader = csv.reader(csvfile, delimiter=',')
    # build dictionary with associated IDs
    for row in timeReader:
        IDs[row[0]] = row[1]

# move files
path = 'train/'
tmpPath = 'train2/'
for oldname in os.listdir(path):
    # ignore files in path which aren't in the csv file
    if oldname in IDs:
        try:
            os.rename(os.path.join(path, oldname), os.path.join(tmpPath, IDs[oldname]))
        except:
            print('File ' + oldname + ' could not be renamed to ' + IDs[oldname] + '!')
I am trying to sort my files according to this csv file, but the file contains many ids with the same name. Is there a way to move files with the same name into one folder, or to add a number to a file's name if a file with the same name already exists in the directory?
Example:
id                 name
001232131hja1.jpg  golden_retreiver
0121221122ld.jpg   black_hound
0232113222kl.jpg   golden_retreiver
0213113jjdsh.jpg   alsetian
05hkhdsk1233a.jpg  black_hound
I actually want to move all the files whose ids correspond to golden_retreiver into one folder, and so on.
Based on what you describe, here is my approach:
import csv
import os

SOURCE_ROOT = 'train'
DEST_ROOT = 'train2'

with open('labels.csv') as infile:
    next(infile)  # Skip the header row
    reader = csv.reader(infile)
    seen = set()
    for dogid, breed in reader:
        # Create a new directory if needed
        if breed not in seen:
            os.mkdir(os.path.join(DEST_ROOT, breed))
            seen.add(breed)
        src = os.path.join(SOURCE_ROOT, dogid + '.jpg')
        dest = os.path.join(DEST_ROOT, breed, dogid + '.jpg')
        try:
            os.rename(src, dest)
        except WindowsError as e:
            print(e)
Notes
For every line in the data file, I create the breed directory at the destination. I use the set seen to make sure that I only create each directory once.
After that, it is a trivial matter of moving the files into place.
One possible move error: the file does not exist in the source dir. In that case, the code just prints the error and ignores it.
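For the other part of the question, adding a number when a file with the same name already exists at the destination, a small helper along these lines could be applied to dest before the os.rename call (a sketch; uniquify is a hypothetical name, not something used above):

def uniquify(dest):
    # append _1, _2, ... before the extension until the name is free
    base, ext = os.path.splitext(dest)
    counter = 1
    while os.path.exists(dest):
        dest = '%s_%d%s' % (base, counter, ext)
        counter += 1
    return dest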