I would like to save a file by adding a suffix to the existing filename; by "extension" I do not mean changing .csv to .html.
What I mean is: if I have an existing file file1.csv,
I would like to save the processed file as file1_processed.csv.
I tried doing this
data = pd.read_csv("file1.csv")
df = x
df.to_csv(os.path.basename(data) + '_' + 'processed' + '.csv')
However, it raises the following error:
TypeError: expected str, bytes or os.PathLike object, not DataFrame
filename = "file1"
data = pd.read_csv(filename + ".csv")
# ...
df.to_csv(filename + '_processed.csv')
data holds the contents of file1.csv, not the file name. You need to assign the file name to a variable if the name of the file is not always the same.
file_name = "file1.csv"
data = pd.read_csv(file_name )
df = x
name, extension = os.path.splitext(os.path.basename(file_name))
df.to_csv(name + '_' + 'processed' + extension)
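If you prefer pathlib, the same suffix insertion needs no manual splitting; a minimal sketch (file name illustrative, df as above):
from pathlib import Path

src = Path("file1.csv")
# stem is the name without the suffix; with_name rebuilds "file1_processed.csv"
dst = src.with_name(src.stem + "_processed" + src.suffix)
df.to_csv(dst)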
Related
I have to compare hundreds of files in two folders (directories). There is a way to derive the second file's name from the first file's name and vice versa. I was asked to develop a script so that we can do this task quickly. These were the requirements:
a) An HTML report showing the differences
b) A txt file showing the basic information, i.e., count, header, and trailer info.
I have written the following script using Python, but after processing 14 files it makes no further progress.
#!/usr/bin/env python3
# Path: folder_compare.py
# Take two folders as input and compare the same files in them using pandas and sqlite
import os
import pandas as pd
import sqlite3
import logging
import difflib
import sys

# function to write the message sent to the txt file passed as an argument
def write_to_txt(file_name, message):
    # path to the file
    d_path = 'C:/Upgrade/File-Compare/Differences/' + os.path.basename(file_name)
    os.makedirs(d_path, exist_ok=True)
    file_path = d_path + '/' + file_name + '.txt'
    # Create the file if it does not exist
    if not os.path.exists(file_path):
        open(file_path, 'w').close()
    f = open(file_path, 'a')
    f.write(message)
    f.close()

def convert_windows_path_to_python(path):
    path = path.replace("\\", "/")
    return path

# get the folders as input from the user
fol1 = input("Enter the first folder path: ")
fol2 = input("Enter the second folder path: ")
folder1 = convert_windows_path_to_python(fol1)
folder2 = convert_windows_path_to_python(fol2)

# function to derive the second file name from the first file name
def get_file_name(file_name):
    # file_name = file_name.split('.')
    # file_name = file_name[0].replace('BZ1CV','BZ1DV') + '.' + file_name[1]
    file_name = file_name.replace('BZ1CV', 'BZ1DV')
    return file_name

# function to compare the two files and write the difference to an html file
def compare_files(file1, file2):
    # read the two files
    f1 = pd.read_table(file1, encoding='unicode_escape', header=None)
    f2 = pd.read_table(file2, encoding='unicode_escape', header=None)
    # Get the filesize of the two files
    f1_size = os.path.getsize(file1)
    f2_size = os.path.getsize(file2)
    d_path = 'C:/Upgrade/File-Compare/Differences/' + os.path.basename(file1)
    os.makedirs(d_path, exist_ok=True)
    # if either file is larger than 10MB, compare using pandas concat and
    # drop_duplicates, where both files can be viewed side by side
    if f1_size > 10485760 or f2_size > 10485760:
        difference = pd.concat([f1, f2]).drop_duplicates(keep=False)
        difference.to_html(d_path + '_diff.html')
    # otherwise compare the files using difflib.HtmlDiff
    else:
        first_file_lines = open(file1).readlines()
        second_file_lines = open(file2).readlines()
        diff = difflib.HtmlDiff().make_file(first_file_lines, second_file_lines, file1, file2, context=True, numlines=0)
        diff_report = open(d_path + '_diff.html', 'w')
        diff_report.writelines(diff)
        diff_report.close()
    logging.info('The files are compared successfully')

# Now start logging findings of the files: row counts, headers and trailers
# go to a .txt file named after the first file.
# Loop through the files in folder1 and compare them with the files in folder2
for file in os.listdir(folder1):
    file1 = folder1 + '/' + file
    file2 = folder2 + '/' + get_file_name(file)
    # if the second file does not exist in folder2, log the error and continue
    if not os.path.isfile(file2):
        logging.error('File not found: ' + os.path.basename(file2))
        continue
    f1 = pd.read_table(file1, encoding='unicode_escape', header=None)
    f2 = pd.read_table(file2, encoding='unicode_escape', header=None)
    # write the first row (header) of both data frames to the text file
    f1_header = f1.iloc[0]
    f2_header = f2.iloc[0]
    write_to_txt(os.path.basename(file1), 'The headers of the first file are: ' + str(f1_header) + '\n')
    write_to_txt(os.path.basename(file1), 'The headers of the second file are: ' + str(f2_header) + '\n')
    # write the row counts of both data frames to the text file
    f1_rowcount = f1.shape[0]
    f2_rowcount = f2.shape[0]
    write_to_txt(os.path.basename(file1), 'The rowcount of the first file (including header and trailer rows) is: ' + str(f1_rowcount) + '\n')
    write_to_txt(os.path.basename(file1), 'The rowcount of the second file (including header and trailer rows) is: ' + str(f2_rowcount) + '\n')
    # write the last row (trailer) of both data frames to the text file
    f1_footer = f1.iloc[-1]
    f2_footer = f2.iloc[-1]
    write_to_txt(os.path.basename(file1), 'The trailer of the first file is: ' + str(f1_footer) + '\n')
    write_to_txt(os.path.basename(file1), 'The trailer of the second file is: ' + str(f2_footer) + '\n')
    compare_files(file1, file2)
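One standard-library detail worth knowing here (an observation, not a confirmed diagnosis of the stall): difflib.HtmlDiff runs SequenceMatcher line by line, which can slow down dramatically on large or very dissimilar files, so a run may look frozen. A lighter sketch that writes a plain unified diff instead of an HTML table:
import difflib

def write_unified_diff(file1, file2, out_path):
    # read both files as lists of lines, matching the encoding used above
    with open(file1, encoding='unicode_escape') as fh1, open(file2, encoding='unicode_escape') as fh2:
        lines1, lines2 = fh1.readlines(), fh2.readlines()
    # unified_diff returns a generator, so the diff is streamed to disk line by line
    with open(out_path, 'w') as out:
        out.writelines(difflib.unified_diff(lines1, lines2, fromfile=file1, tofile=file2))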
I have been creating a little script that queries a database and returns the result. I then use pandas.to_csv() to write the result out to a CSV tempfile before uploading it to a cloud location. The trouble I am running into is ensuring that pandas.to_csv() has finished writing the CSV tempfile before I upload it. The only way I have consistently ensured that the data makes it to the temp file before the upload is by keeping the
print(temp.tell())
line of code in the example below. If I comment it out, no data gets uploaded.
Example code below:
def write_to_temporary_csv_file(df, file_name, token, folder_id):
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp:
        print("DataFrame: ", df)
        df.to_csv(temp, index=False, encoding='utf-8')
        print("temp.tell() size: ", temp.tell())
        print("File size: ", str(round((os.stat(temp.name).st_size/1024), 2)), "kb")
        new_file_path = tempfile.gettempdir() + '/' + customer_name + '_' + file_name + '_' + current_date + '.csv'
        ## Check if the renamed temp file already exists; if it does, remove it so it can be recreated
        remove_temporary_file(new_file_path)
        os.link(temp.name, new_file_path)
        upload_response = upload_file(token, folder_id, new_file_path)
        ## Remove both the temp file and the newly created renamed temp file
        remove_temporary_file(temp.name)
        remove_temporary_file(new_file_path)
Image 1 (with temp.tell() included):
Image 2 (with temp.tell() commented out):
I think it might be caused by the fact that you keep your file open for as long as you are inside the with block, which can leave the content not yet flushed to disk.
def write_to_temporary_csv_file(df, file_name, token, folder_id):
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp:
        print("DataFrame: ", df)
        df.to_csv(temp, index=False, encoding='utf-8')
        print("temp.tell() size: ", temp.tell())
    # at this point the file has been closed (and flushed) because we exited the with block
    print("File size: ", str(round((os.stat(temp.name).st_size/1024), 2)), "kb")
    new_file_path = tempfile.gettempdir() + '/' + customer_name + '_' + file_name + '_' + current_date + '.csv'
    ## Check if the renamed temp file already exists; if it does, remove it so it can be recreated
    remove_temporary_file(new_file_path)
    os.link(temp.name, new_file_path)
    upload_response = upload_file(token, folder_id, new_file_path)
    ## Remove both the temp file and the newly created renamed temp file
    remove_temporary_file(temp.name)
    remove_temporary_file(new_file_path)
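If you would rather keep the file open across the upload, an alternative (a sketch under that assumption, not the poster's original flow) is to flush explicitly before handing the name to the uploader:
import os
import tempfile

def write_df_to_temp(df):
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp:
        df.to_csv(temp, index=False, encoding='utf-8')
        temp.flush()             # push Python's userspace buffer down to the OS
        os.fsync(temp.fileno())  # ask the OS to commit its cache to disk
        # temp.name is now safe to link or upload even though the handle is open
        return temp.name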
I am trying to loop over folders and subfolders to access and read CSV files before transforming them into JSON. Here is the code I am working on:
cursor = conn.cursor()
try:
    # Specify the folder containing the needed files
    folderPath = 'C:\\Users\\myUser\\Desktop\\toUpload'  # Or using input()
    fwdPath = 'C:/Users/myUser/Desktop/toUpload'
    for countries in os.listdir(folderPath):
        for sectors in os.listdir(folderPath + '\\' + countries):
            for file in os.listdir(folderPath + '\\' + countries + '\\' + sectors):
                data = pd.DataFrame()
                filename, _ext = os.path.splitext(os.path.basename(folderPath + '\\' + countries + '\\' + file))
                print(file + ' ' + filename + ' ' + sectors + ' ' + countries)
                data = pd.read_csv(file)
                # cursor.execute('SELECT * FROM SECTORS')
                # print(list(cursor))
finally:
    cursor.close()
    conn.close()
The following print line returns the file, its filename without the extension, and then the sectors and countries folder names:
print(file + ' ' + filename + ' ' + sectors + ' ' + countries)
myfile.csv myfile WASHSector CTRYIrq
Now when it comes to reading the CSV, it takes a very long time, and at the end I get the following error:
[Errno 2] File myfile.csv does not exist
You need to give pd.read_csv the full path of the file, so change it to:
data = pd.read_csv(folderPath+'\\'+countries+'\\'+sectors + '\\' +file)
Before reading the CSV file, you should compose the whole path to the file; otherwise, pandas won't be able to read it.
import os
# ...
path = os.path.join(folderPath, countries, sectors, file)
data = pd.read_csv(path)
Also, instead of using three nested for loops, I recommend using the os.walk method; it automatically recurses through directories:
>>> folderPath = 'C:\\Users\\myUser\\Desktop\\toUpload'
>>> for root, _, files in os.walk(folderPath):
...     for f in files:
...         pd.read_csv(os.path.join(root, f))
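os.walk yields a (root, dirs, files) tuple for every directory under folderPath, so os.path.join(root, f) always forms the full path regardless of nesting depth; if you still need the countries and sectors names, they can be recovered by splitting root.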
I am using the Scala code below to rename a CSV file to a TXT file and move the TXT file. I need to translate this code to Python/PySpark, but I am having problems (I am not well versed in Python). I would highly appreciate your help. Thanks in advance!
//Prepare to rename file
import org.apache.hadoop.fs._
import org.apache.hadoop.fs.{FileSystem, Path}
val fs = FileSystem.get(sc.hadoopConfiguration)
//Create variables
val table_name = dbutils.widgets.get("table_name") // getting table name
val filePath = "dbfs:/mnt/datalake/" + table_name + "/" // path where original csv file name is located
val fileName = fs.globStatus(new Path(filePath+"part*"))(0).getPath.getName // getting original csv file name
val newfilename = table_name + ".txt" // renaming and transforming csv into txt
val curatedfilePath = "dbfs:/mnt/datalake/" + newfilename // curated path + new file name
//Move to curated folder
dbutils.fs.mv(filePath + fileName, curatedfilePath)
Here is the Python code:
%python
#Create variables
table_name = dbutils.widgets.get("table_name") # getting table name
filePath = "dbfs:/mnt/datalake/" + table_name + "/" # path where original csv file name is located
newfilename = table_name + ".txt" # transforming csv into txt
curatedfilePath = "dbfs:/mnt/datalake/" + newfilename # curated path + new file name
#Save CSV file
df_curated.coalesce(1).replace("", None).write.mode("overwrite").save(filePath,format='csv', delimiter='|', header=True, nullValue=None)
# getting original csv file name
for f in filePath:
    if f[1].startswith("part-00000"):
        original_file_name = f[1]
#move to curated folder
dbutils.fs.mv(filePath + fileName, curatedfilePath)
I am having a problem with the "getting original file name" part. It throws the following error:
IndexError: string index out of range
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<command-3442953727364942> in <module>()
11 # getting original csv file name
12 for f in filePath:
---> 13 if f[1].startswith("part-00000"):
14 original_file_name = f[1]
15
IndexError: string index out of range
In the Scala code, you're using hadoop.fs.globStatus to list the part files from the folder where you save the DataFrame.
In Python you can do the same by accessing hadoop.fs via the JVM like this:
conf = sc._jsc.hadoopConfiguration()
Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
part_files = Path(filePath).getFileSystem(conf).globStatus(Path(filePath + "/part*"))
file_name = part_files[0].getPath().getName()
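With file_name in hand, the move mirrors the last line of the Scala version (assuming the same filePath and curatedfilePath variables as in your code):
dbutils.fs.mv(filePath + file_name, curatedfilePath)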
I have just created a CSV file with the code below. But how do I set the location of the new file? For example, I want it created on the desktop.
csv = pd.DataFrame(all_feature_array, columns=feature_name)
csv['start_row_of_file'] = start_row_col
csv['timestamp'] = timestamp_col
csv['class'] = obj_class
csv.to_csv(folder[-14:] + '-' + obj_class + '.csv')
csv.head()
You just need to prepend the location to the file name when saving; you would do it like this:
location = "C:/User/Desktop/"
filename = folder[-14:] + '-' + obj_class + '.csv'
csv.to_csv(location + filename)
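If the desktop path should follow the current user rather than being hard-coded, a slightly more portable sketch (the Desktop folder layout is an assumption) is:
import os

# assumed location: the current user's Desktop
location = os.path.join(os.path.expanduser("~"), "Desktop")
filename = folder[-14:] + '-' + obj_class + '.csv'
csv.to_csv(os.path.join(location, filename))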