Combine Dataframes resulting from a for loop

Combine Dataframes resulting from a for loop - python

I need a little help in appending the data thats getting generated out of the for loop below. Currenlty, im writing it to a dataframe in line "df = pd.DataFrame(li_row, columns=col_names)"
But when I have multiple files which starts from PAJ, I need the resulted Dataframe to be appended to one Dataframe.
Also, the below is a bits and pieces we gathered and amended to suit our need. please excuse me in case you feel its a mess. :)
import xmlschema
import os
import xml.etree.ElementTree as ET
import pandas as pd
dirpath = "C:\\Users\\xxxxx\\PycharmProjects\\pythonProject\\xmls"
filenames = os.listdir("C:\\Users\\xxxxx\\PycharmProjects\\pythonProject\\xmls")
# print(filenames)
for eachfile in filenames:
fname = eachfile[0:3]
print(dirpath+'\\'+eachfile)
if fname == 'PAJ':
xmlschema.validate(dirpath+'\\'+eachfile, 'PAJ.xsd')
tree = ET.parse(eachfile)
root = tree.getroot()
# Get AlertID from header
cols = {}
for header in root.findall(".//header/alertId"):
cols[header.tag] = header.text
# print(cols)
# get detailhr to be used for column header names
col_names = []
for DtHeader in root.findall(".//detailHdr/c"):
col_names.append(DtHeader.text)
# print(col_names)
# Get row and c
li_row = []
size = 0
for Data in root.findall(".//report/data"):
for child in Data:
# print(child.tag,child.text,len(Data))
li_row.append([])
for grandchild in child:
# print(grandchild.tag, grandchild.text,len(child))
li_row[size].append(grandchild.text)
size += 1
# print(li_row)
# create a dataframe with the col_names and row with c and alertid added at the end
df = pd.DataFrame(li_row, columns=col_names)
df['alertId'] = cols['alertId']
print(df)
elif fname == 'PIE':
fileContent = ''
with open(dirpath + '\\' + eachfile) as filehandle:
fileContent = filehandle.read()
modFileContent = fileContent.replace("UTF-16", "UTF-8")
xmlschema.validate(modFileContent, 'PIE.xsd')

So if i were to change your current solution as little as possible I create a list of paj_data_frames and concatenate them once the script was done. Look at pd.concat documentation https://pandas.pydata.org/docs/user_guide/merging.html
paj_data_frames = []
for eachfile in filenames:
....
if fname == 'PAJ':
df = pd.DataFrame(li_row, columns=col_names)
df['alertId'] = cols['alertId']
paj_data_frames.append(df)
....
final_df = pd.concat(paj_data_frames)

Related

Efficient way to combine multiple csv

I have over 100K CSV (total file size north of 150 GB) which I need to join. All have standard column names although the sequence of columns may not match and some csv have a few columns missing.
Now I just created a dataframe and kept concating the datframe from each csv in each iteration to have a standard dataframe containing all columns which I eventually intended to save as csv
I tried making a dataframe with 1000 sample csv and noticed as the dataframe size increased, the number of iteration dropped down from 10 to 1.5 per second which probably means that it would follow a similar trend if I got all-in with 100k csv thus taking days if not months to combine them.
Is there a better way of combining huge number of csv files?
Here is my code
df_t1 = pd.DataFrame()
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False,low_memory=False)
df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])
if thisCSV.endswith('type1.csv'):
df_t1 = pd.concat([df_t1,df], axis=0, ignore_index=True)
df_t1.to_csv(outpath + "df_t1.csv", index = None, header=True, encoding='utf-8')
print("df_t1.csv generated")

Possible improvement
Method 1: Using Pandas
#df_t1 = pd.DataFrame()
df_t1_lst = []
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
if thisCSV.endswith('type1.csv'):
df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False,low_memory=False)
#df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))]) --unnecessary to loop use next line instead
df["File Name"] = thisCSV # places thisCSV in every row
#df_t1 = pd.concat([df_t1,df], axis=0, ignore_index=True) # concat slow, append to list instead
df_t1_lst.append(df)
df_t1 = pd.concat(df_t1_lst, ignore_index=True) # Form dataframe from list (faster than pd.concat in loop)
df_t1.to_csv(outpath + "df_t1.csv", index = None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Method 1a
Using Pandas to continuously append to CSV output file
import os
import pandas as pd
def str_to_bytes(s):
' String to byte array '
result = bytearray()
result.extend(map(ord, s))
return result
def good_file(file_path):
""" Check if file exists and is not empty"""
# Check if file exist and it is empty
return os.path.exists(file_path) and os.stat(file_path).st_size > 0
SEPARATOR = ',' # Separator used by CSV file
write_header = True
pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]
pathxl = r"C:\\Users\\darryl\\OneDrive\\Python"
outpath = pathxl + r"\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]
output_file = outpath + "df_t1.csv"
with open(output_file, "w") as ofile:
pass # create empty output file
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
input_file = pathxl + "\\" + thisCSV
if thisCSV.endswith('type1.csv') and good_file(input_file):
df = pd.read_csv(input_file)
if df.shape[0] > 0:
df['File Name'] = thisCSV # Add filename
df = df.sort_index(axis = 1) # sort based upon colunn in ascending order
# Append to output file
df.to_csv(output_file, mode='a',
index = False,
header= write_header)
write_header = False # Only write header once
del df
Method 2: Binary Files
Reading/Writing binary and using memory-map should be faster.
from tqdm import tqdm
import os
import mmap
def str_to_bytes(s):
' String to byte array '
result = bytearray()
result.extend(map(ord, s))
return result
def good_file(file_path):
""" Check if file exists and is not empty"""
# Check if file exist and it is empty
return os.path.exists(file_path) and os.stat(file_path).st_size > 0
SEPARATOR = ',' # Separator used by CSV file
header = None
pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]
with open(outpath + "df_t1.csv", "wb") as ofile:
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
input_file = pathxl + "\\" + thisCSV
if thisCSV.endswith('type1.csv') and good_file(input_file):
with open(input_file, "rb") as ifile:
print('file ', thisCSV)
# memory-map the file, size 0 means whole file
with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
text_iter = iter(mmap_obj.read().split(b'\n'))
if header is None:
header = next(text_iter)
header = header.rstrip() + str_to_bytes(SEPARATOR + "File Name\n")
ofile.write(header) # write header
else:
next(text_iter) # ignore header row
# write data to output file
file_value = str_to_bytes(SEPARATOR + f"{thisCSV}\n")
for line in text_iter:
if line.strip(): # skip blank lines
ofile.write(line.rstrip() + file_value)

Write output text data into a CSV or excel file as a table

I have a programme that outputs results into a .csv file.
However, the results are structured as text one below the other.
1:
However, I would need this to be in this format-
Snippet of my code-
PathDicom = "./Images/cases/TCGA-G3-A3CK/01-03-2005-CT CHEST ABDOMEN PELVIS ENHANCED-BODY-48980/"
ListFold = []; # Create an empty list for folder names
# Make a list of series names (i.e. Folder names) - ListFold
for dirName, subdirList, fileList in os.walk(PathDicom):
for filename in subdirList:
ListFold.append(os.path.join(dirName,filename))
lstFilesDCM = [] # create an empty list
with open ('results_NOISE_duke_new.csv','a+') as f:
for dirName, subdirList, fileList in os.walk(ListFold[0]):
CTSeriesPath = ListFold[0]
print("Exam_name:", PathDicom, file=f)
IQ = CTPatientImageNoise(CTSeriesPath)
ct_series_noise = {}
ct_series_noise['Noise'] = IQ.forDatabase['AverageGlobalNoiseIndex']
print("Series_name:",CTSeriesPath, file=f) # This is the series name
print("Series_Noise_value:", ct_series_noise['Noise'], file=f)
for filename in fileList:
lstFilesDCM.append(os.path.join(dirName,filename))
RefDs = pydicom.dcmread(lstFilesDCM[0])
#print("Exam_name:", PathDicom, file=f)
print("Manufacturer:", RefDs.Manufacturer, file=f)
print("iMAGE tYPE:", RefDs.ImageType, file=f)
print("Slice Thickness:", RefDs.SliceThickness, file=f)
print("Filter Type:", RefDs.FilterType, file=f)
#print("Convolution Kernel:", RefDs.ConvolutionKernel, file=f)
print("AccessionNumber:", RefDs.AccessionNumber, file=f)
print("StudyDescription:", RefDs.StudyDescription, file=f)
f.write('\n')
break #
UPDATE -
I have edited as Alfonso suggested in his answer - use pandas -got the answer!
import matplotlib.pyplot as plt
import numpy as np
import sys
import pydicom # Importing DICOM package
import csv
import os,string
import pandas as pd
from pyctpatientimagenoise import CTPatientImageNoise
PathDicom = "F:/PythonExample/Images/testduke/TCGA-DD-A11C/05-27-1999-Abdomen120LiverBiPhase Adult-61415"
Examname = []
ImageType=[]
Manufacturer = []
Series_name = []
Series_Noise =[]
Slice_thickness = []
Filter_type = []
Accessnum = []
StudyDesc = []
CTSeriesPath = []
# This is a list of all the series (organised as Folders) in the exam
ListFold = []; # Create an empty list for folder names
# Make a list of series names (i.e. Folder names) - ListFold
for dirName, subdirList, fileList in os.walk(PathDicom):
for filename in subdirList:
ListFold.append(os.path.join(dirName,filename))
lstFilesDCM = [] # create an empty list
for dirName, subdirList, fileList in os.walk(ListFold[0]):
CTSeriesPath = ListFold[0]
Examname.append(PathDicom)
IQ = CTPatientImageNoise(CTSeriesPath)
ct_series_noise = {}
ct_series_noise['Noise'] = IQ.forDatabase['AverageGlobalNoiseIndex']
Series_name.append(CTSeriesPath) # This is the series name
Series_Noise.append(ct_series_noise['Noise'])
for filename in fileList:
lstFilesDCM.append(os.path.join(dirName,filename))
# Get 1ST SLICE INFO
RefDs = pydicom.dcmread(lstFilesDCM[0])
#print("Exam_name:", PathDicom, file=f)
Manufacturer.append(RefDs.Manufacturer)
ImageType.append(RefDs.ImageType)
Slice_thickness.append(RefDs.SliceThickness)
Filter_type.append(RefDs.FilterType)
#print("Convolution Kernel:", RefDs.ConvolutionKernel, file=f)
Accessnum.append(RefDs.AccessionNumber)
StudyDesc.append(RefDs.StudyDescription)
#print("Irradiation Event UID:", RefDs.IrradiationEventUID, file=f)
break
for dirName, subdirList, fileList in os.walk(ListFold[1]):
CTSeriesPath = ListFold[1]
Examname.append(PathDicom)
IQ = CTPatientImageNoise(CTSeriesPath)
ct_series_noise = {}
ct_series_noise['Noise'] = IQ.forDatabase['AverageGlobalNoiseIndex']
Series_name.append(CTSeriesPath) # This is the series name
Series_Noise.append(ct_series_noise['Noise'])
for filename in fileList:
lstFilesDCM.append(os.path.join(dirName,filename))
# Get 1ST SLICE INFO
RefDs = pydicom.dcmread(lstFilesDCM[1])
#print("Exam_name:", PathDicom, file=f)
Manufacturer.append(RefDs.Manufacturer)
ImageType.append(RefDs.ImageType)
Slice_thickness.append(RefDs.SliceThickness)
Filter_type.append(RefDs.FilterType)
#print("Convolution Kernel:", RefDs.ConvolutionKernel, file=f)
Accessnum.append(RefDs.AccessionNumber)
StudyDesc.append(RefDs.StudyDescription)
#print("Irradiation Event UID:", RefDs.IrradiationEventUID, file=f)
break
df = pd.DataFrame(list(zip(Examname, Manufacturer, ImageType, Series_name,Series_Noise,Slice_thickness,Filter_type,Accessnum,StudyDesc )), columns =["Exam","Manufacturer", "iMAGE tYPE", "Series", "Noise", "Slice Thickness", "Filter Type", "Accession Num", "Study Desc"])
df.to_csv("F:/PythonExample/testdel.csv")
I have edited my original code to include the solution that worked best with least amount of re-coding.

How about using pandas to make a dataframe and then saving it to csv?
Without a minimal reproducible example is difficult to give a full answer but I will try.
First you create a list for each desired column
Manufacturer_list = []
image_list = []
...
and then, in the for loop, replace the print statements by append
Manufacturer_list.append(RefDs.Manufacturer)
image_list.append(RefDs.ImageType)
...
Finally you create a dataframe with pandas. More info here
# import pandas as pd
import pandas as pd
df = pd.DataFrame(list(zip(Manufacturer_list, image_list, ...)),
columns =["Manufacturer", "iMAGE tYPE", ...])
and save it to csv. Info here
df.to_csv("path/to/file.csv")

We can't reproduce your results. Reduce your code to a minimal example or use a debugger to trace where your logic is wrong. Here's an example based on your update that works:
import pandas as pd
Examname = 'Exam1 Exam2 Exam3'.split()
ImageType= 'ImageType1 ImageType2 Imagetype3'.split()
Manufacturer = 'Mfg1 Mfg2 Mfg3'.split()
Series_name = 'Series1 Series2 Series3'.split()
Series_Noise = 'SN1 SN2 SN3'.split()
Slice_thickness = 'thick1 thick2 thick3'.split()
Filter_type = 'ft1 ft2 ft3'.split()
Accessnum = 'anum1 anum2 anum3'.split()
StudyDesc = 'desc1 desc2 desc3'.split()
CTSeriesPath = 'path1 path2 path3'.split()
df = pd.DataFrame(list(zip(Examname, Manufacturer, ImageType, Series_name,Series_Noise,Slice_thickness,Filter_type,Accessnum,StudyDesc )), columns =["Exam","Manufacturer", "iMAGE tYPE", "Series", "Noise", "Slice Thickness", "Filter Type", "Accession Num", "Study Desc"])
df.to_csv('example.csv',index=False)
Also see: https://ericlippert.com/2014/03/05/how-to-debug-small-programs/

How do I apply this code to multiple csv?

could anyone advise me how to apply this code to several csv in one folder? Then, save the modified csv to another folder and each separately? In short, I need to automate it.
I need to automatically load the csv file, execute the code, save the newly modified csv file, and then repeat it to the next csv file in the folder.
import pandas as pd
import datetime as dt
import numpy as np
from numpy import nan as Nan
path = "C://Users//Zemi4//Desktop//csv//A-001.csv"
df = pd.read_csv(path,delimiter=";")
df['ta'] = pd.to_numeric(df['ta'])
df['tw'] = pd.to_numeric(df['tw'])
df["time_str"] = [dt.datetime.strptime(d, "%d.%m.%Y %H:%M:%S") for d in df["time"]]
df["time_str"] = [d.date() for d in df["time_str"]]
df["time_str"] = pd.to_datetime(df["time_str"])
df["time_zaokrouhleny"]=df["time_str"]
def analyza(pozadovane_data):
new_list = []
new_df = pd.DataFrame(new_list)
new_df=df.loc[df["time_str"] == pozadovane_data,["ta","tw", "zone", "time_zaokrouhleny"]]
counter = new_df.ta.count()
if counter < 24:
for i in range(counter,24):
new_df.loc[i] = [Nan for n in range(4)]
new_df["ta"]= new_df.ta.fillna(0)
new_df["tw"] = new_df.tw.fillna(0)
new_df["zone"] = new_df.zone.fillna(0)
new_df["time_zaokrouhleny"]=new_df.time_zaokrouhleny.fillna(new_df.time_zaokrouhleny.min())
elif counter > 24:
counter_list = list(range(24,counter))
new_df = new_df.drop(new_df.index[counter_list])
new_df["time_oprava"] = [dt.datetime.combine(d.date(),dt.time(1,0)) for d in new_df["time_zaokrouhleny"]]
s = 0
cas_list = []
for d in new_df["time_oprava"]:
d =d + dt.timedelta(hours=s)
#print(d)
#print(s)
cas_list.append(d)
s = s + 1
se = pd.Series(cas_list)
new_df['time_oprava'] = se.values
new_df['Validace'] = (new_df['ta'] != 0) & (new_df['tw'] != 0)
new_df['Rozdil'] = new_df['ta'] - new_df['tw']
new_df.rename(columns={"ta": "Skutecna teplota", "tw": "Pozadovana teplota", "time_oprava": "Cas", "zone": "Mistnost"}, inplace = True)
new_df.index = new_df['Cas']
return new_df
start = dt.datetime(2010,10,6)
end = dt.datetime(2010,12,27)
date_range = []
date_range = [start + dt.timedelta(days=x) for x in range(0,(end-start).days)]
new_list = []
vysledek_df =pd.DataFrame(new_list)
for d in date_range:
pom = analyza(d)
vysledek_df = vysledek_df.append(pom,ignore_index=True)
vysledek_df.pop('time_zaokrouhleny')
vysledek_df.to_csv('C://Users//Zemi4//Desktop//zpr//A-001.csv', encoding='utf-8', index=False)
The code itself works correctly. Thank you for your advice.

Simplest way is to use glob. Just give the folder_path and output_path as per your requirements and use the sample code below. I commented the code to help you understand the code.
import os
import glob
folder_path = 'path/to/folder/' # path to folder containing .csv files
output_path = 'path/to/output/folder/' # path to output folder
for file in glob.glob(folder_path + '*.csv'): # only loads .csv files from the folder
df = pd.read_csv(file, delimiter=";") # read .csv file
# Do something
df.to_csv(output_path + 'modified_' + str(os.path.basename(file)), encoding='utf-8', index=False) # saves modified .csv file to output_path

You want to use os.listdir() to find the contents of the directory, then parameterize the file path in a new function. You can then loop over a list of directories retrieved via os.walk() and run the function for each one.
import os
def run(file_directory):
filelist = os.listdir(file_directory)
for path in filelist:
df = pd.read_csv(path,delimiter=";")
# etc.
df.to_csv(os.path.join(file_directory, 'output.csv'))
If you need to create a new directory, you can use os.mkdir(newpath)

Can you still advise on how to parameterize the function?

Only portions of data are being written to a cvs file, rest is missing

I am parsing through many xml files and putting certain information into a csv file. Because my xml files are named: "1.xml", "2.xml", etc... I am using a for loop to cycle through my different Xml file titles. However, based on the range that I use on my for loop, my csv file contains different data. For example, when my for loop range is 1:200 my csv file includes info from my xml files 1 to 199. However, when I change my range to 1:300, my csv file only contains info for my xml files 217 to 249. The info actually stored on my csv file changes based on what I put in as my range for my for loop. Has anyone else had this error and do you have any solutions?
My code is below:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path
# open a file for writing
data_labels = open('DataLabels.csv', 'w', newline='')
missing_files = open('MissingFiles.csv', 'w', newline = '')
# create the csv writer object
csvwriter = csv.writer(data_labels)
csvwriter2 = csv.writer(missing_files)
data_head = []
data = []
missingfiles = 0
missfiles = []
MediaId = "Media Id"
#data_head.append (MediaId)
Family = "Family"
#data_head.append (Family)
Species = "Species"
#data_head.append (Species)
Genus = "Genus"
Content = "Content"
ClassId = "ClassId"
#data_head.append (Genus)
data_head.append(MediaId)
# Family = member.find('Family').tag
data_head.append(Content)
data_head.append(ClassId)
data_head.append(Family)
# Species = member.find('Species').tag
data_head.append(Species)
# Genus = member.find('Genus').tag
data_head.append(Genus)
csvwriter.writerow(data_head)
for i in range (1, 190):
#print (i)
data = []
inputfilename = str(i)+ ".xml"
my_file = Path(inputfilename)
if my_file.is_file():
data_labels = open('DataLabels.csv', 'w', newline='')
tree = ET.parse(inputfilename)
root = tree.getroot()
MediaId = root [2].text
Content = root[4].text
ClassId = root[5].text
Family = root[6].text
Species = root[7].text
Genus = root[8].text
#print (vote)
#count = 0
#for Image in root.find('MediaId'):
#print (child.tag, child.attrib)
#name = child.find('MediaId').text
# print (Image.find ('MediaId').text)
##csvwriter.writerow (data_head)
#data = []
#if count == 0:
# print ("count is zero i'm in loop")
# MediaId = member.find('MediaId').tag
# count = count + 1
#else:
#MediaId = root.findall('MediaId').text
data.append(MediaId)
data.append (Content)
data.append (ClassId)
#Family = member.find('Family').text
data.append(Family)
#Species = member.find('Species').text
data.append(Species)
#Genus = member.find('Genus').text
data.append(Genus)
csvwriter.writerow(data)
data_labels.close()
#print (data)
else:
missingfiles = missingfiles +1
missfiles = []
missfiles.append(inputfilename)
csvwriter2.writerow(missfiles)
print ("missing", missingfiles, "files")
data_labels.close()
missing_files.close()
print ("done")

Open the csv in append mode ,else you are just overwriting the same file.

I think you need to divide your script in small readable functions.
First, you can create a function to parse a XML file:
import xml.etree.ElementTree as ET
def parse_xml_file(xml_path):
""" Parse an XML file and return the data. """
# type: (str) -> list
tree = ET.parse(xml_path)
root = tree.getroot()
return [
root[2].text,
root[4].text,
root[5].text,
root[6].text,
root[7].text,
root[8].text]
This function parse a XML file and return one record containing a list of values.
Then, you can create a function to iterate a list of XML files (existing files) dans populate the CSV file:
import csv
import io
import os
def populate_data_labels(xml_path_list, work_dir="."):
header = ["Media Id", "Family", "Species", "Genus", "Content", "ClassId"]
with io.open(os.path.join(work_dir, 'DataLabels.csv'), 'w') as fd:
writer = csv.writer(fd)
writer.writerow(header)
for xml_path in xml_path_list:
writer.writerow(parse_xml_file(xml_path))
This function use parse_xml_file() to extract each record.
You can create a function to log the missing files. You can use CSV format (or a simple text file):
def populate_missing_files(missing_files, work_dir="."):
header = ["Filename"]
with io.open(os.path.join(work_dir, 'MissingFiles.csv'), 'w') as fd:
writer = csv.writer(fd)
writer.writerow(header)
for xml_path in missing_files:
writer.writerow([os.path.basename(xml_path)])
Finally, you can write a function which search the XML files and call the previous functions:
def parse_work_dir(work_dir="."):
all_files = [os.path.join(work_dir, "{0}.xml".format(idx))
for idx in range(1, 190)]
existing_files = (path for path in all_files if os.path.exists(path))
populate_data_labels(existing_files, work_dir)
missing_files = (path for path in all_files if not os.path.exists(path))
populate_missing_files(missing_files, work_dir)
Usage:
parse_work_dir("/path/to/your/working/dir")

Python - Batch combine Multiple large CSV, filter data, skip header, appending vertically into a single CSV

** Note i have modified the code below original to show a code that works for what i need
Good afternoon all
So many questions around csv data combining but as yet i haven't found anything to help me with my code requirements.
I have large fixed header CSV's that:
1) are produced over a 12hr period. i need to look up a weeks worth of csv's to merge
2) filter the individual CSV's on 2 columns information (to many rows otherwise)
3) append vertically into a single csv 'master sheet' with the naming convention 'date of the last shift'
** Files are coming out as individual CSV's. I need them to append into one
** FYI - Data set after code (there are 16 columns of data i just cut out for this purpose)
Below is what i have so far. apologies for the mess!
import os, csv
import pandas as pd
import io
import glob
from datetime import date
import time
import collections
# Process data and filter #
def ProcessData( data ):
processedData = []
for row in data:
if row[ 15 ] == ( 'OPERATING' ):
outputRow = row[ 0:3 ] + row[ 15:17 ]
processedData.append( outputRow )
return processedData
# Process and write #
def ProcessAndWrite( data, filename ):
processedData = ProcessData( data )
name, ext = os.path.splitext( filename )
outputfilename = name + '_week_combined.csv'
print "writing data to " + str( outputfilename )
with open(outputfilename, 'wb') as csvfile:
writer = csv.writer(csvfile)
for row in processedData:
writer.writerow(row)
# select the correct weeks worth of files #
def filedate( data, datetime ):
root = 'E:\Rs\\'
date_outputfilename_list = []
for file in date_outputfilename_list:
folder, file_name = os.path.split(file[1])
file_date = time.strftime("%y-%m-%d", file[0])
date_name_list.append((file_date, file_name))
date_count_dict = {}
date_name_dict = {}
for date, name in date_name_list:
date_count_dict=collections.defaultdict( int )
date_name_dict.setdefault(date, []).append(name)
import pprint
print("Files with the same date:")
pprint.pprint(date_name_dict)
print('-'*60)
print("Same dates count:")
pprint.pprint(date_count_dict)
# Function #
if __name__ == "__main__":
import sys
path = r'E:\Rs'
filenames = glob.glob(os.path.join(path, '*.csv'))
filenames.sort()
data = []
for filename in filenames:
with open(filename, 'r') as csvfile:
reader = csv.reader(csvfile, delimiter = ',')
header = []
for headerCount in range( 2 ):
header.append(next(reader))
data.extend( [ row for row in reader ] )
if( filedate ):
ProcessAndWrite( data, filename )
data = [ProcessData]
if ( len( data ) > 0 ):
ProcessAndWrite( data, filename )
Data set:
position_x, position_y, position_z, start_time, opreason, stage,
header 2, header 2, header 2, header 2, header 2, header 2
649794, 4764274, 1147, 2/11/2016 00:00, OPERATING, sound,
Amended Script that works for my purpose
import os, csv # Import csv library
import io
import glob
import datetime
import time
import collections
def ProcessData( data ): # Function definition: filter data
processedData = [] # empty process data list
for row in data:
if (row[ 15 ] == 'OPERATING' and row[ 6 ] == 'truck'): # Filter explination
n1=datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S') # Strip date from timedate for duration calc
n2=datetime.datetime.strptime(row[4], '%Y-%m-%d %H:%M:%S') # Strip date from timedate for duration calc
diff = n2 - n1 # duration calc
outputRow = row[ 0:3 ] + row[ 3:5 ] + [diff.total_seconds()]
processedData.append( outputRow ) # process the last of the list information from the csv and append new file
return processedData # Final Processed data
def ProcessAndWrite( data, filename ): # Function Definition: Write data
processedData = ProcessData( data )
name, ext = os.path.splitext( filename ) # Split the file name from the original to define the output as weeks mastersheet
outputfilename = name + '_week_combined.csv'
print "writing data to " + str( outputfilename ) # Screen output describing file to look for
with open(outputfilename, 'wb') as csvfile: # 'wb' is write binary file
writer = csv.writer(csvfile) # Next line is a hack to put headers in the csv
writer.writerow(['position_x','position_y','position_z','start_time','end_time','model','number','speed','o','stage','duration', 'cummulative_duration'])
for row in processedData:
writer.writerow(row)
if __name__ == "__main__": # Run script directly through python (not imported)
import sys
path = r'E:\\' # Set correct folder location for file merge
filenames = glob.glob(os.path.join(path, '*.csv')) # Select correct files for merge
filenames.sort() # Sort the folder so that the files are in date order to make sure you dont crash the script later
data = [] # Blank data list
def dateFromFilename( name ): # Function to select the correct files from truck speed folder
path,filename = os.path.split(name)
splitName = filename.split('_')
dateStr = splitName[0]
date = datetime.datetime.strptime(dateStr,'%Y-%m-%d') # Split file name date and words
return date # Need to put this is so it returns an actual value!
firstFileDate = None
lastFilename = None
for filename in filenames: # Select file
currentFileDate = dateFromFilename( filename )
if firstFileDate:
diff = currentFileDate - firstFileDate
# somehow convert this to days
if ( diff.days >= 1 ): # Selct the previous 24hrs worth of data
ProcessAndWrite( data, lastFilename ) # Call function to write data
data = []
firstFileDate = currentFileDate
lastFilename = filename
with open(filename, 'r') as csvfile: # For new CSV files
reader = csv.reader(csvfile, delimiter = ',') # read the csv
header = [] # Blank header list (do this to skip the header rows for merge)
for headerCount in range( 3 ): # Start reading from line 3
header.append(next(reader))
data.extend( [ row for row in reader ] ) # extend is to continue the data stacking with the next csv data
if ( len( data ) > 0 ): # If the list of data has data then continue to process and write
ProcessAndWrite( data, filename )

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Combine Dataframes resulting from a for loop - python

Related

Efficient way to combine multiple csv

Write output text data into a CSV or excel file as a table

How do I apply this code to multiple csv?

Only portions of data are being written to a cvs file, rest is missing

Python - Batch combine Multiple large CSV, filter data, skip header, appending vertically into a single CSV

Categories

Resources