Combine multiple .csv files with python from different directory paths

Combine multiple .csv files with python from different directory paths - python

I am trying to combine multiple .csv files into one .csv file using the dataframe in pandas. the tricky part about this is, i need to grab multiple files from multiple days. Please let me know if this does not make sense. As it currently stands i cannot figure out how to loop through the directory. Could you offer some assistance?
import csv
import pandas as pd
import datetime as dt
import glob, os
startDate = 20160613
endDate = 20160614
dateRange = endDate - startDate
dateRange = dateRange + 1
todaysDateFilePath = startDate
for x in xrange(dateRange):
print startDate
startDate = startDate + 1
filePath = os.path.join(r"\\export\path", startDate, "preprocessed")
os.chdir(filePath)
interesting_files = glob.glob("trade" + "*.csv")
print interesting_files
df_list = []
for filename in sorted(interesting_files):
df_list.append(pd.read_csv(filename))
full_df = pd.concat(df_list)
saveFilepath = r"U:\Chris\Test_Daily_Fails"
fileList = []
full_df.to_csv(saveFilepath + '\\Files_For_IN' + "_0613_" + ".csv", index = False)

IIUC you can create list all_files and in loop append output from glob to all_files:
all_files = []
for x in xrange(dateRange):
print startDate
startDate = startDate + 1
filePath = os.path.join(r"\\export\path", startDate, "preprocessed")
os.chdir(filePath)
all_files = all_files + glob.glob("trade" + "*.csv")
print interesting_files
Also you need first append all values to df_list and then only once concat (I indented code for concat):
df_list = []
for filename in sorted(interesting_files):
df_list.append(pd.read_csv(filename))
full_df = pd.concat(df_list)

Related

how do i select last raw in CSV files and import in to Excel?

How i can select last raw in text files with for?
this my first idea code :
import glob
import pandas as pd
path = input("Insert location:")
file_list = glob.glob(path + "/*.txt")
txt_list = []
for file in file_list:
txt_list.append(pd.read_csv(file))
for file in file_list:
txt_list[-7::3]
excl_merged = pd.concat(txt_list, ignore_index=True)
excl_merged.to_excel('Total.xlsx', index=False) ]

Your code is incorrect. Here is a version that should work:
import glob
import pandas as pd
path = input("Insert location:")
file_list = glob.glob(path + "/*.txt")
df_list = []
for file in file_list:
df = pd.read_csv(file)
df_list.append(df.tail(3)) # last 3 rows from each file dataframe
excl_merged = pd.concat(df_list, ignore_index=True)
excl_merged.to_excel('Total.xlsx', index=False)
Explaination: tail() method takes the last several rows (provided as an argument) from a dataframe.

How to call multiple columns from multiple csv file in python?

I have 100 csv file. I want to print particular columns from all the csv file with the file name. Here in this code I can print all of the csv file.
path = r'F:\11 semister\TPC_MEMBER'
all_files = glob.glob(path + "/*.csv")
dataStorage = {}
for filename in all_files:
name = os.path.basename(filename).split(".csv")[0]
dataStorage[name] = pd.read_csv(filename)
print(name)
dataStorage

May be you want this.
import pandas as pd
import numpy as np
import glob
path = r'folderpath' #provide your folder path where your csv files are stored.
all_csv= glob.glob(path + "/*.csv")
li = []
for filename in all_csv:
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
data_frame = pd.concat(li, axis=0, ignore_index=True)
data_frame['columnname'] # enter the name of your dataframe's column.
print(data_frame)

Group values and remove duplicates of groups based on a column in Pandas

I have a datafile which is the result of combining several sources that contain name information. Each name have a unique ID (Column ID).
Sorting the ID by column, I would like to remove the second/third source finding in the column Source.
My output today:
(all the red rows are "duplicates" since we already got them from the first source (blue rows))
What I would like to achieve:
How can I achieve this result?
Is there a way to iterate row by row, where I remove duplicate of ID already when I iterate in the function "for file in files:" part of the code?
Or is it easier to do it in the "df_merged" before I output the dataframe to an an excel file?.
Code:
import pandas as pd
import os
from datetime import datetime
from shutil import copyfile
from functools import reduce
import numpy as np
#Path
base_path = "G:/Till/"
# Def
def get_files(folder, filetype):
list_files = []
directory = os.fsencode(folder)
for file in os.listdir(directory):
filename = os.fsdecode(file)
if filename.endswith("." + filetype.strip().lower()):
list_files.append(filename)
return list_files
# export files
df_result_e = pd.DataFrame()
files = get_files(base_path + "datasource/" + "export","xlsx")
df_append_e = pd.DataFrame()
for file in files:
df_temp = pd.read_excel(base_path + "datasource/" + "export/" + file, "Results", dtype=str, index=False)
df_temp["Source"] = file
df_append_e = pd.concat([df_append_e, df_temp])
df_result_e = pd.concat([df_result_e, df_append_e])
print(df_result_e)
# match files
df_result_m = pd.DataFrame()
files = get_files(base_path + "datasource/" + "match","xlsx")
df_append_m = pd.DataFrame()
for file in files:
df_temp = pd.read_excel(base_path + "datasource/" + "match/" + file, "Page 1", dtype=str, index=False)
df_append_m = pd.concat([df_append_m, df_temp])
df_result_m = pd.concat([df_result_m, df_append_m])
df_result_m = df_result_m[['ID_Our','Name_Our','Ext ID']]
df_result_m.rename(columns={'ID_Our' : 'ID', 'Name_Our' : 'Name' , 'Ext ID' : 'Match ID'}, inplace=True)
df_result_m.dropna(subset=["Match ID"], inplace=True) # Drop all NA
data_frames = [df_result_e, df_result_m]
# Join files
df_merged = reduce(lambda left,right: pd.merge(left, right, on=["Match ID"], how='outer'), data_frames)
#Output of files
df_merged.to_excel(base_path + "Total datasource Export/" + datetime.now().strftime("%Y-%m-%d_%H%M") + ".xlsx", index=False)

For remove them you can try transform with factorize
newdf=df[df.groupby('ID')['Source'].transform(lambda x : x.factorize()[0])==0]

Automating through multiple path locations and exporting file names

I have written a script which works but is not very elegant. It merges csv files, outputs a new file, filters that file to the required conditions, then outputs the filtered file, which is the file I want. I then repeat the process for every month.
Rather than altering this code to process every month (I have 5 more years worth of data to go), I would like to automate the path directory part and export csv file names that change from one month (and year) to the next.
See snippet of Jan and Feb below:
import os
import glob
import pandas as pd
import shutil
path = r"C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx01"
os.chdir(path)
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
combined_csv.to_csv("201401.csv", index=False, encoding='utf-8-sig')
grab1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx01\201401.csv'
move1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-01.csv'
shutil.move(grab1,move1)
fd = pd.read_csv(r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-01.csv')
df = pd.DataFrame(fd)
irishsea = df[(df.lat_bin >= 5300) & (df.lat_bin <= 5500) & (df.lon_bin >= -650) & (df.lon_bin <= -250)]
irishsea.to_csv("2014-01_irishsea.csv", index=False, encoding='utf-8-sig')
grab2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx01\2014-01_irishsea.csv'
move2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-01-IrishSea.csv'
shutil.move(grab2,move2)
I then repeat it for Feb data but have to update the path locations.
#process feb data
path = r"C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx02"
os.chdir(path)
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
combined_csv.to_csv("201402.csv", index=False, encoding='utf-8-sig')
grab1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx02\201402.csv'
move1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-02.csv'
shutil.move(grab1,move1)
fd = pd.read_csv(r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-02.csv')
df = pd.DataFrame(fd)
irishsea = df[(df.lat_bin >= 5300) & (df.lat_bin <= 5500) & (df.lon_bin >= -650) & (df.lon_bin <= -250)]
irishsea.to_csv("2014-02_irishsea.csv", index=False, encoding='utf-8-sig')
grab2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx02\2014-02_irishsea.csv'
move2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-02-IrishSea.csv'
shutil.move(grab2,move2)

You can do something like the following. Keep in mind that the second number of range (the stop value) needs to be one value higher than you intend.
for year in range(2014, 2020):
for month in range(1, 13):
if month < 10:
month_as_string = "0" + str(month)
else:
month_as_string = str(month)
date = "%s\%s-%s" % (year, year, month_as_string)
pathname = 'YOUR\FILEPATH\HERE' + date + 'irishsea.csv'
You can learn more about string formatting here https://www.learnpython.org/en/String_Formatting

How do I apply this code to multiple csv?

could anyone advise me how to apply this code to several csv in one folder? Then, save the modified csv to another folder and each separately? In short, I need to automate it.
I need to automatically load the csv file, execute the code, save the newly modified csv file, and then repeat it to the next csv file in the folder.
import pandas as pd
import datetime as dt
import numpy as np
from numpy import nan as Nan
path = "C://Users//Zemi4//Desktop//csv//A-001.csv"
df = pd.read_csv(path,delimiter=";")
df['ta'] = pd.to_numeric(df['ta'])
df['tw'] = pd.to_numeric(df['tw'])
df["time_str"] = [dt.datetime.strptime(d, "%d.%m.%Y %H:%M:%S") for d in df["time"]]
df["time_str"] = [d.date() for d in df["time_str"]]
df["time_str"] = pd.to_datetime(df["time_str"])
df["time_zaokrouhleny"]=df["time_str"]
def analyza(pozadovane_data):
new_list = []
new_df = pd.DataFrame(new_list)
new_df=df.loc[df["time_str"] == pozadovane_data,["ta","tw", "zone", "time_zaokrouhleny"]]
counter = new_df.ta.count()
if counter < 24:
for i in range(counter,24):
new_df.loc[i] = [Nan for n in range(4)]
new_df["ta"]= new_df.ta.fillna(0)
new_df["tw"] = new_df.tw.fillna(0)
new_df["zone"] = new_df.zone.fillna(0)
new_df["time_zaokrouhleny"]=new_df.time_zaokrouhleny.fillna(new_df.time_zaokrouhleny.min())
elif counter > 24:
counter_list = list(range(24,counter))
new_df = new_df.drop(new_df.index[counter_list])
new_df["time_oprava"] = [dt.datetime.combine(d.date(),dt.time(1,0)) for d in new_df["time_zaokrouhleny"]]
s = 0
cas_list = []
for d in new_df["time_oprava"]:
d =d + dt.timedelta(hours=s)
#print(d)
#print(s)
cas_list.append(d)
s = s + 1
se = pd.Series(cas_list)
new_df['time_oprava'] = se.values
new_df['Validace'] = (new_df['ta'] != 0) & (new_df['tw'] != 0)
new_df['Rozdil'] = new_df['ta'] - new_df['tw']
new_df.rename(columns={"ta": "Skutecna teplota", "tw": "Pozadovana teplota", "time_oprava": "Cas", "zone": "Mistnost"}, inplace = True)
new_df.index = new_df['Cas']
return new_df
start = dt.datetime(2010,10,6)
end = dt.datetime(2010,12,27)
date_range = []
date_range = [start + dt.timedelta(days=x) for x in range(0,(end-start).days)]
new_list = []
vysledek_df =pd.DataFrame(new_list)
for d in date_range:
pom = analyza(d)
vysledek_df = vysledek_df.append(pom,ignore_index=True)
vysledek_df.pop('time_zaokrouhleny')
vysledek_df.to_csv('C://Users//Zemi4//Desktop//zpr//A-001.csv', encoding='utf-8', index=False)
The code itself works correctly. Thank you for your advice.

Simplest way is to use glob. Just give the folder_path and output_path as per your requirements and use the sample code below. I commented the code to help you understand the code.
import os
import glob
folder_path = 'path/to/folder/' # path to folder containing .csv files
output_path = 'path/to/output/folder/' # path to output folder
for file in glob.glob(folder_path + '*.csv'): # only loads .csv files from the folder
df = pd.read_csv(file, delimiter=";") # read .csv file
# Do something
df.to_csv(output_path + 'modified_' + str(os.path.basename(file)), encoding='utf-8', index=False) # saves modified .csv file to output_path

You want to use os.listdir() to find the contents of the directory, then parameterize the file path in a new function. You can then loop over a list of directories retrieved via os.walk() and run the function for each one.
import os
def run(file_directory):
filelist = os.listdir(file_directory)
for path in filelist:
df = pd.read_csv(path,delimiter=";")
# etc.
df.to_csv(os.path.join(file_directory, 'output.csv'))
If you need to create a new directory, you can use os.mkdir(newpath)

Can you still advise on how to parameterize the function?

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Combine multiple .csv files with python from different directory paths - python

Related

how do i select last raw in CSV files and import in to Excel?

How to call multiple columns from multiple csv file in python?

Group values and remove duplicates of groups based on a column in Pandas

Automating through multiple path locations and exporting file names

How do I apply this code to multiple csv?

Categories

Resources