How could I find files with the same filename in multiple folders, and then perform the same operations?
def findCommonDeep(path1, path2):
    """Return the relative paths of files that exist under both directory trees.

    Each tree is walked recursively and every file is recorded by its path
    relative to that tree's root, so ``path1/a/x.csv`` and ``path2/a/x.csv``
    both count as ``a/x.csv`` (using the platform's path separator).

    Args:
        path1: Root of the first directory tree.
        path2: Root of the second directory tree.

    Returns:
        set[str]: Relative file paths present in both trees (empty if none).
    """
    # Imported locally because this snippet never imports ``os`` itself.
    import os

    def relative_files(top):
        # All files below ``top``, expressed relative to ``top``.
        return {
            os.path.relpath(os.path.join(root, name), top)
            for root, _, names in os.walk(top)
            for name in names
        }

    return relative_files(path1) & relative_files(path2)
lista = []
for x in [2010, 2017, 2020]:
if x > 2015:
filepath = rf'My Documents\Analysis\{x}\LateAnalysis\*.csv'
else:
filepath = rf'My Documents\Analysis\{x}\Early_Analysis\*.csv'
fname = os.path.basename(filepath, filepath)
findCommonDeep(fname)
for file in glob.glob(filename_common):
df = pd.read_csv(file)
df = df.set_index('date')
lista.append(df)
As you've tagged pandas, let's use pandas and pathlib to return a dictionary of files with similar names:
from pathlib import Path
import pandas as pd
def return_similair_files(start_dir: str, pattern: str = '*.csv') -> dict:
    """Group the files found below *start_dir* by their file name.

    Args:
        start_dir: Directory to search recursively.
        pattern: Glob pattern passed to ``Path.rglob``; defaults to CSV files.

    Returns:
        A dict mapping each file name to the list of ``Path`` objects that
        carry that name, e.g. ``{'file_1.csv': [Path(...), Path(...)]}``.
        Returns an empty dict when nothing matches.
    """
    # Sort for a reproducible order and skip directories that match the pattern.
    all_files = sorted(p for p in Path(start_dir).rglob(pattern) if p.is_file())
    if not all_files:
        return {}
    df = pd.DataFrame({'files': all_files})
    df['name'] = df['files'].map(lambda p: p.name)  # pathlib attribute
    return df.groupby('name')['files'].agg(list).to_dict()
This will return a dictionary of files, like so:
{'file_1.csv' : [list_of_paths]}
You can then operate on them as you need.
Related
My current process involves looping through my source directory and adding the name of each file to my dataframe in python. I want to get the datemodified for each of these files as well
import datetime
import os
import pandas as pd
#set src directory
os.chdir('C:/Users/jj/Desktop/do/Claims/globmove')
def read_files(filenames):
    """Load every file in *filenames* and stack the results into one DataFrame.

    Each file is loaded with ``read_sheets`` (defined outside this snippet;
    presumably it returns a DataFrame -- confirm) and tagged with a
    ``'Filename'`` column holding the name it was loaded from.

    Args:
        filenames: Iterable of file names/paths to load.

    Returns:
        pandas.DataFrame: All loaded frames concatenated with a fresh index,
        or an empty frame with only a ``'Filename'`` column when *filenames*
        is empty.
    """
    result = []
    for filename in filenames:
        file = read_sheets(filename)
        file['Filename'] = filename
        result.append(file)
    if not result:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=['Filename'])
    return pd.concat(result, ignore_index=True)
def modification_date(filename):
    """Return the last-modification time of *filename* as a ``datetime``.

    The timestamp from ``os.path.getmtime`` is converted with
    ``datetime.fromtimestamp`` without a timezone argument.
    """
    return datetime.datetime.fromtimestamp(os.path.getmtime(filename))
folder_path = os.path.abspath('C:/Users/jj/Desktop/do/Claims/globmove')
files = [file for file in os.listdir(folder_path) if file.endswith(".xlsx")]
dfooc = read_files(files)
I am able to run this without errors, but the modification timestamp is not added to the final dataframe, dfooc. How can I add each file's modification date as a column?
Edit: Getting an indent error after changing order of my original code above
def read_files(filenames):
result = []
for filename in filenames:
file = read_sheets(filename)
file['Filename'] = filename
def modification_date(filename):
t = os.path.getmtime(filename)
return datetime.datetime.fromtimestamp(t)
file['ModificationDate'] = filename
result.append(file)
return pd.concat(result, ignore_index=True)
return pd.concat(result, ignore_index=True)
^
IndentationError: unexpected indent
Here's how I do it.
import os
from pathlib import Path
import pandas as pd
import pendulum
class FileDates:
    """Collect file names and last-modified dates for files below a directory.

    Keyword Args:
        file_type: Suffix a file name must end with, e.g. ``".xlsx"``.
            ``None`` keeps every file.
        file_path: Directory to search recursively (``str`` or ``Path``).
        path: Stored as given; not used by this class.
    """

    def __init__(self, **kwargs):
        self.file_type = kwargs.get("file_type")
        self.file_path = kwargs.get("file_path")
        self.path = kwargs.get("path")
        # Timezone name reported by pendulum.now(); used to render the dates.
        self.tz = pendulum.now().timezone.name

    def main(self) -> pd.DataFrame:
        """Return a DataFrame with one row per file: columns ``file`` and ``date``."""
        files = self.get_files()
        dates = self.get_dates(files)
        # Path.name is separator-agnostic; splitting on "/" returns the whole
        # path on Windows, where str(Path) uses backslashes.
        names = [Path(x).name for x in files]
        return pd.DataFrame(list(zip(names, dates)), columns=["file", "date"])

    def get_files(self) -> list:
        """Return the sorted paths (as strings) of all matching files."""
        suffix = self.file_type or ""
        # Match on the end of the file name only, so "x.xlsx.bak" or files
        # inside a folder called "old.xlsx" are not picked up by accident.
        return sorted(
            str(x)
            for x in Path(self.file_path).rglob("*")
            if x.is_file() and x.name.endswith(suffix)
        )

    def get_dates(self, files: list) -> list:
        """Return each file's modification date as a date string in ``self.tz``."""
        return [
            pendulum.from_timestamp(os.path.getmtime(Path(x))).in_tz(self.tz).to_date_string()
            for x in files
        ]
file_type = ".xlsx"
file_path = Path(f"{Path.home()}/Desktop/do/Claims/globmove/")
data = FileDates(file_type=file_type, file_path=file_path).main()
How can I select the last row of each text file using a for loop?
This is my first attempt at the code:
import glob
import pandas as pd
path = input("Insert location:")
file_list = glob.glob(path + "/*.txt")
txt_list = []
for file in file_list:
txt_list.append(pd.read_csv(file))
for file in file_list:
txt_list[-7::3]
excl_merged = pd.concat(txt_list, ignore_index=True)
excl_merged.to_excel('Total.xlsx', index=False) ]
Your code is incorrect. Here is a version that should work:
import glob
import pandas as pd
path = input("Insert location:")
file_list = glob.glob(path + "/*.txt")
df_list = []
for file in file_list:
    df = pd.read_csv(file)
    df_list.append(df.tail(3))  # last 3 rows from each file dataframe
if df_list:
    excl_merged = pd.concat(df_list, ignore_index=True)
    excl_merged.to_excel('Total.xlsx', index=False)
else:
    # pd.concat raises ValueError on an empty list, so report and stop instead.
    print(f"No .txt files found in {path!r}; nothing written.")
Explanation: the tail() method returns the last n rows of a dataframe, where n is the number passed as its argument.
In the code below, all the output files are written into the T1 folder. How can I separate those output files into subfolders with the same names as the original subfolders (where the original CSV files were)? Thanks.
import pandas as pd
import numpy as np
import glob
import os
path = '/root/Desktop/TT1/'
mystep = 0.4
#define the function
def data_splitter(df, name):
max_time = df['Time'].max() # get max value of Time for the current csv file (df)
myrange= np.arange(0, max_time, mystep) # build the threshold range
for k in range(len(myrange)):
# build the upper values
temp = df[(df['Time'] >= myrange[k]) & (df['Time'] < myrange[k] + mystep)]
temp.to_csv("/root/Desktop/T1/{}_{}.csv".format(name, k))
# use os.walk(path) on the main path to get ALL subfolders inside path
for root,dirs,_ in os.walk(path):
for d in dirs:
path_sub = os.path.join(root,d) # this is the current subfolder
for filename in glob.glob(os.path.join(path_sub, '*.csv')):
df = pd.read_csv(filename)
name = os.path.split(filename)[1] # get the name of the current csv file
data_splitter(df, name)
This should help
Demo:
import pandas as pd
import numpy as np
import glob
import os
path = '/root/Desktop/TT1/'
mystep = 0.4
#define the function
def data_splitter(df, name, dest_folder, basepath="/root/Desktop/", step=None):
    """Split *df* into consecutive 'Time' windows and write one CSV per window.

    Window ``k`` holds the rows with ``k*step <= Time < (k+1)*step`` and is
    written to ``<basepath>/<dest_folder>/<name>_<k>.csv``.

    Args:
        df: DataFrame with a numeric ``'Time'`` column.
        name: Prefix for the output file names.
        dest_folder: Sub-folder of *basepath* that receives the files; it is
            created (including parents) if missing.
        basepath: Root folder for the output.
        step: Window width; defaults to the module-level ``mystep``.
    """
    if step is None:
        step = mystep
    dest_f = os.path.join(basepath, dest_folder)
    # Create the folder once; makedirs also copes with nested dest_folder values.
    os.makedirs(dest_f, exist_ok=True)
    if df.empty:
        return
    max_time = df['Time'].max()  # max value of Time for the current csv file (df)
    if pd.isna(max_time):
        return
    # One window more than arange(0, max_time, step) yields when max_time is a
    # multiple of step, so the row(s) at max_time are no longer dropped.
    n_windows = int(np.floor(max_time / step)) + 1
    for k in range(n_windows):
        lower = k * step
        temp = df[(df['Time'] >= lower) & (df['Time'] < lower + step)]
        temp.to_csv(os.path.join(dest_f, "{}_{}.csv".format(name, k)))
# use os.walk(path) on the main path to get ALL subfolders inside path
for root, dirs, files in os.walk(path):
    # Handle only the CSV files of the folder currently being visited.
    for f in (entry for entry in files if entry.endswith(".csv")):
        filename = os.path.join(root, f)
        df = pd.read_csv(filename)
        # File name including its ".csv" extension.
        name = os.path.basename(filename)
        # Name of the folder that directly contains the file.
        dest_folder = os.path.basename(os.path.dirname(filename))
        data_splitter(df, name, dest_folder)
A similar approach should work here:
import pandas as pd
import numpy as np
import glob
import os
input_root = '/root/Desktop/TT1'
output_root = '/root/Desktop/T1'
mystep = 0.4
#define the function
def data_splitter(input_file, output_path, output_basename, step=None):
    """Split one CSV into consecutive 'Time' windows, one output CSV per window.

    Window ``k`` holds the rows with ``k*step <= Time < (k+1)*step`` and is
    written to ``<output_path>/<output_basename>_<k>.csv``.

    Args:
        input_file: Path of the CSV to read; it must have a ``'Time'`` column.
        output_path: Existing folder that receives the output files.
        output_basename: Prefix for the output file names.
        step: Window width; defaults to the module-level ``mystep``.
    """
    if step is None:
        step = mystep
    df = pd.read_csv(input_file)
    if df.empty:
        return
    max_time = df['Time'].max()  # max value of Time for the current csv file (df)
    if pd.isna(max_time):
        return
    # One window more than arange(0, max_time, step) yields when max_time is a
    # multiple of step, so the row(s) at max_time are no longer dropped.
    n_windows = int(np.floor(max_time / step)) + 1
    for k in range(n_windows):
        lower = k * step
        temp = df[(df['Time'] >= lower) & (df['Time'] < lower + step)]
        temp.to_csv(os.path.join(output_path, f"{output_basename}_{k}.csv"))
# use os.walk(path) on the main path to get ALL subfolders inside path
for dirpath, dirnames, filenames in os.walk(input_root):
    # Mirror the current sub-folder (relative to input_root) below output_root.
    # relpath works whether or not input_root ends with a path separator.
    sub_folders = os.path.relpath(dirpath, input_root)
    output_path = os.path.normpath(os.path.join(output_root, sub_folders))
    for filename in filenames:
        if not filename.lower().endswith('.csv'):
            continue
        input_file = os.path.join(dirpath, filename)
        os.makedirs(output_path, exist_ok=True)  # Ensure the output folder exists
        # Bare stem only: data_splitter adds the folder and the "_<k>.csv" suffix.
        output_basename = os.path.splitext(filename)[0]
        data_splitter(input_file, output_path, output_basename)
This should result in the folder structure being recreated under your output root folder.
Could anyone advise me how to apply this code to several CSV files in one folder, and then save each modified CSV separately to another folder? In short, I need to automate it.
I need to automatically load the csv file, execute the code, save the newly modified csv file, and then repeat it to the next csv file in the folder.
import pandas as pd
import datetime as dt
import numpy as np
from numpy import nan as Nan
path = "C://Users//Zemi4//Desktop//csv//A-001.csv"
df = pd.read_csv(path,delimiter=";")
df['ta'] = pd.to_numeric(df['ta'])
df['tw'] = pd.to_numeric(df['tw'])
df["time_str"] = [dt.datetime.strptime(d, "%d.%m.%Y %H:%M:%S") for d in df["time"]]
df["time_str"] = [d.date() for d in df["time_str"]]
df["time_str"] = pd.to_datetime(df["time_str"])
df["time_zaokrouhleny"]=df["time_str"]
# Build the table for one day: select the rows of the module-level `df` whose
# "time_str" equals `pozadovane_data`, bring them to 24 rows, add derived
# columns, rename the columns and return the frame indexed by 'Cas'.
# NOTE(review): indentation was lost in this listing; where a for-body ends
# cannot be read from the text -- confirm against the original file.
def analyza(pozadovane_data):
new_list = []
# Placeholder frame; replaced by the selection on the next line.
new_df = pd.DataFrame(new_list)
new_df=df.loc[df["time_str"] == pozadovane_data,["ta","tw", "zone", "time_zaokrouhleny"]]
# Number of non-null "ta" values in the selection.
counter = new_df.ta.count()
# Fewer than 24 values: add all-NaN rows labelled counter..23, then fill the gaps.
if counter < 24:
for i in range(counter,24):
# NOTE(review): new_df keeps df's index labels; if label i already exists this
# overwrites that row instead of adding one -- confirm.
new_df.loc[i] = [Nan for n in range(4)]
# Missing ta/tw/zone become 0; missing dates take the smallest date present.
new_df["ta"]= new_df.ta.fillna(0)
new_df["tw"] = new_df.tw.fillna(0)
new_df["zone"] = new_df.zone.fillna(0)
new_df["time_zaokrouhleny"]=new_df.time_zaokrouhleny.fillna(new_df.time_zaokrouhleny.min())
# More than 24 values: drop the rows at positions 24 and beyond.
elif counter > 24:
counter_list = list(range(24,counter))
new_df = new_df.drop(new_df.index[counter_list])
# Each row's date combined with the time 01:00.
new_df["time_oprava"] = [dt.datetime.combine(d.date(),dt.time(1,0)) for d in new_df["time_zaokrouhleny"]]
# Shift successive rows by 0, 1, 2, ... hours.
s = 0
cas_list = []
for d in new_df["time_oprava"]:
d =d + dt.timedelta(hours=s)
#print(d)
#print(s)
cas_list.append(d)
s = s + 1
se = pd.Series(cas_list)
# Assign by position (.values) so the Series' own 0..n index is ignored.
new_df['time_oprava'] = se.values
# True where both temperatures are non-zero.
new_df['Validace'] = (new_df['ta'] != 0) & (new_df['tw'] != 0)
new_df['Rozdil'] = new_df['ta'] - new_df['tw']
new_df.rename(columns={"ta": "Skutecna teplota", "tw": "Pozadovana teplota", "time_oprava": "Cas", "zone": "Mistnost"}, inplace = True)
new_df.index = new_df['Cas']
return new_df
start = dt.datetime(2010,10,6)
end = dt.datetime(2010,12,27)
# One entry per day from start up to, but not including, end.
date_range = [start + dt.timedelta(days=x) for x in range(0,(end-start).days)]
# DataFrame.append was removed in pandas 2.0: collect the daily frames and
# concatenate them once (also avoids re-copying the result on every iteration).
daily_frames = [analyza(d) for d in date_range]
vysledek_df = pd.concat(daily_frames, ignore_index=True)
vysledek_df.pop('time_zaokrouhleny')
vysledek_df.to_csv('C://Users//Zemi4//Desktop//zpr//A-001.csv', encoding='utf-8', index=False)
The code itself works correctly. Thank you for your advice.
Simplest way is to use glob. Just give the folder_path and output_path as per your requirements and use the sample code below. I commented the code to help you understand the code.
import os
import glob
import pandas as pd  # required by read_csv below; the snippet only imported os and glob

folder_path = 'path/to/folder/'  # path to folder containing .csv files
output_path = 'path/to/output/folder/'  # path to output folder
os.makedirs(output_path, exist_ok=True)  # to_csv does not create missing folders
# os.path.join works whether or not the folder paths end with a separator.
for file in glob.glob(os.path.join(folder_path, '*.csv')):  # only loads .csv files from the folder
    df = pd.read_csv(file, delimiter=";")  # read .csv file
    # Do something
    out_file = os.path.join(output_path, 'modified_' + os.path.basename(file))
    df.to_csv(out_file, encoding='utf-8', index=False)  # saves modified .csv file to output_path
You want to use os.listdir() to find the contents of the directory, then parameterize the file path in a new function. You can then loop over a list of directories retrieved via os.walk() and run the function for each one.
import os
def run(file_directory, output_directory=None):
    """Process every CSV file in *file_directory* and save each result separately.

    Args:
        file_directory: Folder containing the ``;``-delimited input CSV files.
        output_directory: Folder for the results; defaults to an ``output``
            sub-folder of *file_directory*. Created if missing. Each result
            keeps the name of its input file.
    """
    # Imported locally because this snippet only imports ``os``.
    import pandas as pd

    if output_directory is None:
        output_directory = os.path.join(file_directory, 'output')
    os.makedirs(output_directory, exist_ok=True)
    for name in sorted(os.listdir(file_directory)):
        # os.listdir returns bare names; join them with the folder so the
        # function does not depend on the current working directory.
        path = os.path.join(file_directory, name)
        if not (name.lower().endswith('.csv') and os.path.isfile(path)):
            continue
        df = pd.read_csv(path, delimiter=";")
        # etc.
        # One output per input; a fixed 'output.csv' would be overwritten each time.
        df.to_csv(os.path.join(output_directory, name))
If you need to create a new directory, you can use os.mkdir(newpath)
Can you still advise on how to parameterize the function?
I am new to Python and am trying to write code that adds a column to multiple .xlsx files and saves these files under their original names in a new folder.
I have started with the code below, but I am missing the part that opens all the files and saves them to my DestPath. I would be grateful if anyone has a solution for this:
from os import listdir, path
import pandas as pd
import xlrd
SourcePath = 'C:\' #Source Path
DestPath = 'C:\' #Destination Path
# Listing up all .xlsx files from Source
def find_xlsx_filenames(path_to_dir, suffix=".xlsx"):
    """Return the names of the entries in *path_to_dir* that end with *suffix*.

    The comparison ignores case, so ``REPORT.XLSX`` is found as well.

    Args:
        path_to_dir: Directory to list (not searched recursively).
        suffix: File-name ending to keep.

    Returns:
        list[str]: Matching entry names (without the directory part).
    """
    wanted = suffix.lower()
    return [filename for filename in listdir(path_to_dir) if filename.lower().endswith(wanted)]
filenames = find_xlsx_filenames(SourcePath)
fname = path.join(SourcePath, filenames[0]) # Tar første fil i mappa.
outname = path.join(outputdata, filenames[0])
for i in range(len(filenames)):
fname = path.join(SourcePath, filenames[i])
df = pd.read_excel(fname) #Read Excel file as a DataFrame
df['new_col'] = 'Sort Data' #Adding a new column named <Sort Data>
#To save it back as Excel
df.to_excel(DestPath, outname) #Write DateFrame back as Excel file
Thanks in Advance
check if this works
import os
import pandas as pd
path = 'C:/'  # source folder, searched recursively
dest = 'C:/'  # destination folder; point it elsewhere to keep the originals untouched
dest_abs = os.path.abspath(dest)
for roots, dirs, files in os.walk(path):
    if dest_abs != os.path.abspath(path):
        # Do not descend into the destination, or the files just written
        # would be processed again.
        dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(roots, d)) != dest_abs]
    # Skip "~$..." lock files that Excel creates while a workbook is open.
    xlsfile = [name for name in files if name.endswith('.xlsx') and not name.startswith('~$')]
    if not xlsfile:
        continue
    # Recreate the sub-folder of the source below the destination.
    out_dir = os.path.normpath(os.path.join(dest, os.path.relpath(roots, path)))
    os.makedirs(out_dir, exist_ok=True)
    for xlsf in xlsfile:
        df = pd.read_excel(os.path.join(roots, xlsf))
        df['Sort Data'] = ' '
        df.to_excel(os.path.join(out_dir, xlsf), index=False)