There are a few csv files in different folders and sub-folders. I need to separate each csv file into incoming and outgoing traffic.
If Source == ac:37:43:9b:92:24 && Receiver address ==
8c:15:c7:3a:d0:1a, then those rows need to be written to .out.csv
files.
If Transmitter address == 8c:15:c7:3a:d0:1a && Destination ==
ac:37:43:9b:92:24, then those rows need to be written to .in.csv
files.
The output files (the files separated into incoming and outgoing) have to get the same name as the input files (e.g. if the input file is aaa.csv then the output files will be aaa.in.csv and aaa.out.csv).
And the output files need to be written into the same folders and sub-folders as the input files.
I tried the code below, but it is not working.
I am new to programming, so I am not sure whether this code is correct or wrong. Any help is greatly appreciated. Thanks.
import csv
import os
import subprocess

startdir = '.'
outdir = '.'
suffix = '.csv'

def decode_to_file(cmd, in_file, new_suffix):
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    fileName = outdir + '/' + in_file[len(startdir):-len(suffix)] + new_suffix
    os.makedirs(os.path.dirname(fileName), exist_ok=True)
    csv_writer = csv.writer(open(fileName, 'w'))
    for line_bytes in proc.stdout:
        line_str = line_bytes.decode('utf-8')
        csv_writer.writerow(line_str.strip().split(','))

for root, dirs, files in os.walk(startdir):
    for name in files:
        if not name.endswith(suffix):
            continue
        in_file = os.path.join(root, name)
        decode_to_file(
            cmd=[if source== ac:37:43:9b:92:24 && Receiver address== 8c:15:c7:3a:d0:1a],
            in_file=in_file,
            new_suffix='.out.csv'
        )
        decode_to_file(
            cmd=[if Transmitter address == 8c:15:c7:3a:d0:1a && Destination== ac:37:43:9b:92:24],
            in_file=in_file,
            new_suffix='.in.csv'
        )
You could make use of Python's CSV library to process the rows and glob.glob could be used to walk over the files. os.path.splitext() can be used to help with changing the file extension. For example:
import csv
import glob
import os

for filename in glob.glob('**/*.csv', recursive=True):
    basename, extension = os.path.splitext(filename)
    print(f"Processing - {filename}")

    with open(filename, encoding='utf-8') as f_input, \
            open(basename + '.in.csv', 'w', newline='', encoding='utf-8') as f_in, \
            open(basename + '.out.csv', 'w', newline='', encoding='utf-8') as f_out:

        csv_input = csv.reader(f_input)
        csv_in = csv.writer(f_in)
        csv_out = csv.writer(f_out)

        for row in csv_input:
            if row[3] == 'ac:37:43:9b:92:24' and row[4] == '8c:15:c7:3a:d0:1a':
                csv_out.writerow(row)
            if row[5] == '8c:15:c7:3a:d0:1a' and row[6] == 'ac:37:43:9b:92:24':
                csv_in.writerow(row)
This assumes that your CSV files are in a standard format, e.g. aaa,bbb,ccc,ddd. csv.reader() reads each line of the file and automatically converts it into a list of values split on the commas, so the first value in each row is row[0]. The indices row[3] through row[6] above are guesses; adjust them to wherever the address columns actually sit in your files.
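If your files have a header row, csv.DictReader lets you match on column names instead of positional indices. A minimal sketch, assuming (hypothetically) that the header names the columns Source, Receiver address, Transmitter address and Destination:

import csv
import glob
import os

for filename in glob.glob('**/*.csv', recursive=True):
    basename, _ = os.path.splitext(filename)
    with open(filename, encoding='utf-8', newline='') as f_input, \
            open(basename + '.in.csv', 'w', newline='', encoding='utf-8') as f_in, \
            open(basename + '.out.csv', 'w', newline='', encoding='utf-8') as f_out:
        reader = csv.DictReader(f_input)
        # Reuse the input header so the output files keep the same columns
        in_writer = csv.DictWriter(f_in, fieldnames=reader.fieldnames)
        out_writer = csv.DictWriter(f_out, fieldnames=reader.fieldnames)
        in_writer.writeheader()
        out_writer.writeheader()
        for row in reader:
            # Column names are assumptions - adjust to your actual header
            if row.get('Source') == 'ac:37:43:9b:92:24' and row.get('Receiver address') == '8c:15:c7:3a:d0:1a':
                out_writer.writerow(row)
            if row.get('Transmitter address') == '8c:15:c7:3a:d0:1a' and row.get('Destination') == 'ac:37:43:9b:92:24':
                in_writer.writerow(row)

Note that, like the listing above, a re-run will also match the generated .in.csv and .out.csv files, since they too end in .csv.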
Related
I have multiple csv files in a folder with the same data structure,
0.00;1.05;10.5
0.01;2.05;15.5
0.02;3.05;20.5
...
I want to merge all the csv files into one summary file and add a column with the file name to each line, depending on the original data source.
0.00;1.05;10.5;csv1.csv
0.01;2.05;15.5;csv1.csv
0.02;3.05;20.5;csv1.csv
0.00;5.05;0.05;csv2.csv
0.01;6.05;1.05;csv2.csv
0.02;7.05;2.05;csv2.csv
...
I managed to merge the files, but can't find a way to add the file names.
files = []
for file in os.listdir(folder):
    if file.endswith('.csv'):
        files.append(file)

with open('results.csv', 'w', newline='') as fw:
    cw = csv.writer(fw)
    for file in files:
        with open(file, newline='') as f:
            cr = csv.reader(islice(f, 13, None))
            cw.writerows(cr)
I don't want to use pandas concat due to RAM limitations.
Thank you.
You don't need to parse the input csv files; just append a delimiter and then the current file name to each line. You can use the fileinput module:
import fileinput
from pathlib import Path

folder = '.'  # set accordingly; assumes the current directory
path = Path(folder)

with fileinput.input(files=path.glob('*.csv')) as f, open('results.csv', 'w') as outfile:
    for line in f:
        print(';'.join([line.rstrip('\n'), Path(fileinput.filename()).name]), file=outfile)
Regarding your code, you can fix it like this:
import os
import csv

folder = '.'

files = []
for file in os.listdir(folder):
    if file.endswith('.csv'):
        files.append(file)

with open('results.csv', 'w', newline='') as fw:
    cw = csv.writer(fw, delimiter=';')
    for file in files:
        with open(file, newline='') as f:
            for row in csv.reader(f, delimiter=';'):
                row.append(file)
                cw.writerow(row)
Here the delimiter argument is set to semi-colon because the default delimiter is a comma and your files use ;. That fixes the parsing of the input csv files and makes the output file use ; as well. Each input file is then processed row by row, the filename is appended to the row list, and the new row is written to the output CSV file.
You can use os and pandas:
import os
import pandas as pd

basedir = <path of your base folder>

all_dfs = []
for filename in filter(lambda f: os.path.splitext(f)[1] == '.csv', next(os.walk(basedir))[2]):
    curr_df = pd.read_csv(os.path.join(basedir, filename), sep=';', header=None)
    curr_df['filename'] = filename
    all_dfs.append(curr_df)

pd.concat(all_dfs, axis=0).to_csv('merged_cvs.csv', sep=';', header=False, index=False)
Or if you prefer in only one line:
pd.concat([pd.concat((df, pd.DataFrame([f for _ in range(len(df))])), axis=1) for f, df in
           ((filename, pd.read_csv(os.path.join(basedir, filename), sep=';', header=None))
            for filename in filter(lambda f: os.path.splitext(f)[1] == '.csv', next(os.walk(basedir))[2]))
          ]).to_csv('merged_cvs.csv', sep=';', header=False, index=False)
files = []
for file in os.listdir(folder):
    if file.endswith('.csv'):
        files.append(file)

with open('results.csv', 'w', newline='') as fw:
    cw = csv.writer(fw)
    for file in files:
        with open(file, newline='') as f:
            fw.write(f"{file}\n")  # just write the filename before the content :)
            cr = csv.reader(islice(f, 13, None))
            cw.writerows(cr)
I have the following txt files (10000s of them) in multiple directories, e.g.:
BaseDirectory\04_April\2019-04-14\UniqeDirectoryName1 (username)\345308457384745637.txt
BaseDirectory\04_April\2019-04-14\UniqeDirectoryName2 (username)\657453456456546543.txt
BaseDirectory\04_April\2019-04-14\UniqeDirectoryName3 (username)\234545743564356774.txt
BaseDirectory\05_May\2019-05-14\UniqeDirectoryName1 (username)\266434564564563565.txt
BaseDirectory\05_May\2019-05-14\UniqeDirectoryName2 (username)\934573845739632048.txt
BaseDirectory\05_May\2019-05-14\UniqeDirectoryName3 (username)\634534534535654501.txt
So, in other words, each date folder contains multiple directories that in turn contain text files.
import os
import re
import csv

for path, subdirs, files in os.walk("E:\\BaseDir\\"):
    for name in files:
        file_fullinfo = os.path.join(path, name)
        path, filename = os.path.split(file_fullinfo)
        NoExtension = os.path.splitext(file_fullinfo)[0]
        file_noext = str(NoExtension)
        file_splitinfo = re.split('\\\\', file_noext, 0)
        file_month = file_splitinfo[2]
        file_date = file_splitinfo[3]
        file_folder = re.sub(r'\([^)]*\)', '', file_splitinfo[4])
        file_name = file_splitinfo[5]
        file_category = file_folder
My script generates the following:
['E:', 'BaseDirectory', '04_April', '2019-04-09', 'UniqeDirectoryName', '345308457384745637.txt', 'UniqeDirectoryName']
So far so good; writing this to a generic CSV file is also straightforward, but I want to create a new CSV file based on the changing date, like this:
E:\BaseDir\2019-04-09.csv
file_folder, file_name, file_category
'UniqeDirectoryName', '543968732948754398','UniqeDirectoryName'
'UniqeDirectoryName', '345308457384745637','UniqeDirectoryName'
'UniqeDirectoryName', '324089734983987439','UniqeDirectoryName'
E:\BaseDir\2019-05-14.csv
file_folder, file_name, file_category
'UniqeDirectoryName', '543968732948754398','UniqeDirectoryName'
'UniqeDirectoryName', '345308457384745637','UniqeDirectoryName'
'UniqeDirectoryName', '324089734983987439','UniqeDirectoryName'
How can I accomplish this? I can't quite wrap my head around it; the struggle of being a Python noob is real.. :)
If you can live without the first line as a header row it can be achieved quite simply.
output_file_path = 'D:/output_files/' + file_date + '.csv'
with open(file=output_file_path, mode='a') as csv_file:  # open a csv file to write to in append mode
    csv_file.write("my data\n")
If you absolutely must have the header, you can test whether the file exists first and write the header row only if it doesn't:
import os.path

output_file_path = 'D:/output_files/' + file_date + '.csv'

if not os.path.exists(output_file_path):  # write the header row only if the file doesn't exist yet
    with open(file=output_file_path, mode='a') as csv_file:
        csv_file.write("my header row\n")

with open(file=output_file_path, mode='a') as csv_file:  # open the csv file in append mode
    csv_file.write("my data\n")
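Putting that together with the os.walk loop from the question, a rough sketch (the D:/output_files/ target, the fixed path depth, and the column layout are assumptions carried over from the snippets above):

import csv
import os
import os.path
import re

for path, subdirs, files in os.walk("E:\\BaseDir\\"):
    for name in files:
        # Assumes every txt file sits at the depth shown in the question,
        # i.e. E:\BaseDir\<month>\<date>\<folder (user)>\<name>.txt
        file_noext = os.path.splitext(os.path.join(path, name))[0]
        file_splitinfo = re.split('\\\\', file_noext)
        file_date = file_splitinfo[3]
        file_folder = re.sub(r'\([^)]*\)', '', file_splitinfo[4])
        file_name = file_splitinfo[5]

        output_file_path = 'D:/output_files/' + file_date + '.csv'
        new_file = not os.path.exists(output_file_path)
        with open(output_file_path, mode='a', newline='') as csv_file:
            writer = csv.writer(csv_file)
            if new_file:
                writer.writerow(['file_folder', 'file_name', 'file_category'])
            writer.writerow([file_folder, file_name, file_folder])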
I have a csv with two columns, Directory and File Name. Each row in the csv shows which directory each file belongs in, like so:
Directory, File Name
DIR18, IMG_42.png
DIR12, IMG_16.png
DIR4, IMG_65.png
So far I have written code that grabs each directory and file name from the csv, and then lists all the files at the source location, like so:
movePng.py
import shutil
import os
import csv
from collections import defaultdict

columns = defaultdict(list)  # each value in each column is appended to a list

with open('/User/Results.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        for (k, v) in row.items():
            columns[k].append(v)

source = '/User/PNGItems'
files = os.listdir(source)

for f in files:
    pngName = f[:-4]
    for filename in columns['File Name']:
        fileName = filename[:-4]
        if pngName == fileName:
            # GET THIS POSITION IN columns['File Name'] for columns['Directory']
            shutil.move(f, source + '/' + DIRECTORY)
How do I get the index into columns['File Name'] and grab the corresponding directory out of columns['Directory']?
You should read the assignments into a dictionary and then query that:
import csv

folder_assignment_file = "folders.csv"
file_folder = dict()
with open(folder_assignment_file, "r") as fh:
    reader = csv.reader(fh)
    for folder, filename in reader:
        file_folder[filename] = folder
And then get the target folder like so: DIRECTORY = file_folder[fileName].
Some other hints:
filename and fileName are not good variable names; this will only lead to hard-to-find bugs, because Python is case sensitive
use os.path.splitext to split the extension off the filename
if not all your files are in one folder, the glob module and os.walk might come in handy (see the sketch below)
Edit:
Creating the dict can be made even nicer like so:
with open(folder_assignment_file, "r") as fh:
    reader = csv.reader(fh)
    file_folders = {filename: folder for folder, filename in reader}
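To illustrate the last two hints, here is a minimal sketch of the whole move using os.path.splitext and os.walk. The folders.csv name, the header row, and the source path are assumptions carried over from the question; adjust them to your setup:

import csv
import os
import shutil

source = '/User/PNGItems'  # assumed source folder from the question

# Build a filename -> folder lookup, skipping the header row
with open('folders.csv', newline='') as fh:
    reader = csv.reader(fh)
    next(reader)  # skip the "Directory, File Name" header
    file_folder = {filename.strip(): folder.strip() for folder, filename in reader}

for root, dirs, files in os.walk(source):
    for name in files:
        stem, ext = os.path.splitext(name)
        if ext.lower() != '.png':
            continue
        target_dir = file_folder.get(name)
        if not target_dir:
            continue  # no assignment for this file
        dest_dir = os.path.join(source, target_dir)
        if os.path.abspath(root) == os.path.abspath(dest_dir):
            continue  # already where it belongs
        os.makedirs(dest_dir, exist_ok=True)
        shutil.move(os.path.join(root, name), os.path.join(dest_dir, name))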
To solve this I used @Peter Wood's suggestion and it worked beautifully. I also had to modify the shutil.move call.
Here is the code below
for f in files:
    pngName = f[:-4]
    for filename, directory in zip(columns['File Name'], columns['Directory']):
        fileName = filename[:-4]
        if pngName == fileName:
            directoryName = directory[1:]
            shutil.move(os.path.join(source, f), source + '/' + directoryName)
I have a text file that doesn't have a standard delimiter. I need to be able to check whether the current line is equal to a certain phrase and, if it is, the code should use a certain delimiter until another phrase is found. The delimiters used are ',', '-', ':' and '='.
Please help me out :)
This is what my code is at the moment
import csv
import glob
import os

directory = raw_input("INPUT Folder for Log Dump Files:")
output = raw_input("OUTPUT Folder for .csv files:")
txt_files = os.path.join(directory, '*.txt')

for txt_file in glob.glob(txt_files):
    with open(txt_file, "rb") as input_file:
        in_txt = csv.reader(input_file, delimiter=':')
        filename = os.path.splitext(os.path.basename(txt_file))[0] + '.csv'
        with open(os.path.join(output, filename), 'wb') as output_file:
            out_csv = csv.writer(output_file)
            out_csv.writerows(in_txt)
I cannot speak to the time efficiency of this method, but it might just get what you want done. The basic idea is to create a list to contain the lines of each text file, and then output the list to your new csv file. You save a 'delimiter' variable and then change it by checking each line as you go through the text files.
For example:
I created two text files on my Desktop. They read as follows:
delimiter_test_1.txt
test=delimiter=here
does-it-work
I'm:Not:Sure
delimiter_test_2.txt
This:File:Uses:Colons
Pretty:Much:The:Whole:Time
does-it-work
If-Written-Correctly-yes
I then ran this script on them:
import csv
import glob
import os

directory = raw_input("INPUT Folder for Log Dump Files:")
output = raw_input("OUTPUT Folder for .csv files:")
txt_files = os.path.join(directory, '*.txt')

delimiter = ':'
for txt_file in glob.glob(txt_files):
    SavingList = []
    with open(txt_file, 'r') as text:
        for line in text:
            if line == 'test=delimiter=here\n':
                delimiter = '='
            elif line == 'does-it-work\n':
                delimiter = '-'
            elif line == "I'm:Not:Sure":
                delimiter = ':'
            SavingList.append(line.split(delimiter))

    with open('%s.csv' % os.path.join(output, txt_file.split('.')[0]), 'wb') as output_file:
        writer = csv.writer(output_file)
        for m in xrange(len(SavingList)):
            writer.writerow(SavingList[m])
And got two csv files with the text split based on the desired delimiter. Depending on how many different lines you have for changing the delimiter you could set up a dictionary of said lines. Then your check becomes:
if line in my_dictionary.keys():
    delimiter = my_dictionary[line]
for example.
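A self-contained Python 3 sketch of that dictionary approach (the folder names and the marker lines are assumptions carried over from the example files above; note that a marker on a file's last line may arrive without a trailing newline):

import csv
import glob
import os

# marker line -> delimiter to switch to (markers assumed from the example files)
delimiter_markers = {
    'test=delimiter=here\n': '=',
    'does-it-work\n': '-',
    "I'm:Not:Sure\n": ':',
}

directory = '.'  # assumed input folder
output = '.'     # assumed output folder

for txt_file in glob.glob(os.path.join(directory, '*.txt')):
    delimiter = ':'
    rows = []
    with open(txt_file, 'r') as text:
        for line in text:
            if line in delimiter_markers:
                delimiter = delimiter_markers[line]
            rows.append(line.rstrip('\n').split(delimiter))

    csv_name = os.path.splitext(os.path.basename(txt_file))[0] + '.csv'
    with open(os.path.join(output, csv_name), 'w', newline='') as output_file:
        csv.writer(output_file).writerows(rows)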
I got help the last time I asked a question on this site regarding batch processing csv files within a folder using glob.glob() with Python. I am trying to use it this time to transpose all csv files within a folder. The script below only processes the last file and stops. What am I doing wrong?
import csv
import os
import glob

directory = raw_input("INPUT Folder")
output = raw_input("OUTPUT Folder:")
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'
with open(os.path.join(output, filename), 'wb') as output_file:
    writer = csv.writer(output_file)
    for i in range(len(max(cols, key=len))):
        writer.writerow([(c[i] if i < len(c) else '') for c in cols])
You need to indent the "output" portion of the code so that it runs once for each iteration of the for in_file loop:
import csv
import os
import glob

directory = raw_input("INPUT Folder")
output = raw_input("OUTPUT Folder:")
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

    # "outdent" this code so it only needs to run once for each in_file
    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'

    # Indent this to the same level as the rest of the "for in_file" loop!
    with open(os.path.join(output, filename), 'wb') as output_file:
        writer = csv.writer(output_file)
        for i in range(len(max(cols, key=len))):
            writer.writerow([(c[i] if i < len(c) else '') for c in cols])
In your version that code only runs once, after the for in_file loop has completed, and therefore only outputs cols data left over from the final iteration of that loop.
I have also "outdented" the filename = ... statement to the for in_file level, as this only needs to be done once for each in_file, not once for each row of each in_file.
You can get a lot of mileage with data manipulation using pandas:
import os
import pandas as pd

for filename in os.listdir('.'):
    # We save an augmented filename later,
    # so using splitext is useful for more
    # than just checking the extension.
    prefix, ext = os.path.splitext(filename)
    if ext.lower() != '.csv':
        continue

    # Load the data into a dataframe
    # (pd.read_csv replaces the long-deprecated pd.DataFrame.from_csv)
    df = pd.read_csv(filename, header=None, index_col=None)

    # Transpose is easy, but you could do TONS
    # of data processing here. pandas is awesome.
    df_transposed = df.T

    # Save to a new file with an augmented name
    df_transposed.to_csv(prefix + '_T' + ext, header=True, index=False)
The os.walk version is not much different, if you need to dig into subfolders as well.
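A sketch of that os.walk variant, under the same assumptions as the listing above; transposed copies are written next to their source files:

import os
import pandas as pd

for root, dirs, files in os.walk('.'):
    for filename in files:
        prefix, ext = os.path.splitext(filename)
        if ext.lower() != '.csv' or prefix.endswith('_T'):
            continue  # skip non-csv files and already-transposed output
        in_path = os.path.join(root, filename)
        df = pd.read_csv(in_path, header=None, index_col=None)
        df.T.to_csv(os.path.join(root, prefix + '_T' + ext), header=True, index=False)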
Here is a working version. I had to google for an hour, but it works and was tested on Python 3.3:
import csv
import os
import glob

directory = r'C:\Python33\csv'
output = r'C:\Python33\csv2'
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file, newline='') as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

    # "outdent" this code so it only needs to run once for each in_file
    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'

    # Indent this to the same level as the rest of the "for in_file" loop!
    with open(os.path.join(output, filename), 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        for i in range(len(max(cols, key=len))):
            writer.writerow([(c[i] if i < len(c) else '') for c in cols])
in_files will only return a single result in that format. Try returning a list:
in_files = [f for f in os.listdir(directory) if f.endswith('.csv')]