Process multiple files in Python with one script - python

I'm having some trouble with my code. I want to do the following: I have about 30 csv files which I need to process and I want the result in one single csv file. So far I have the following code (sample):
import pandas as pd
import csv as csv

df = pd.read_csv("file.csv", delimiter=",", skiprows=1)
datamean10 = df[61:240].mean()
datamean15 = df[241:420].mean()
list10 = []
list15 = []
list10.append(datamean10.clip(0))
list15.append(datamean15.clip(0))

csvfile = "C:/Users/bla/bla/list10.csv"
with open(csvfile, 'w') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(list10)

csvfile = "C:/Users/bla/bla/list15.csv"
with open(csvfile, 'w') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(list15)
So this code does what I want for a single file. I wrote another script to go through all the files:
import sys, os

for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
    for file in files:
        if file.endswith('.csv'):
            os.system('averagelist.py {}'.format(root + '\\' + file))
Needless to say, this approach starts each run with blank lists, while I want to append the results to the same lists, so that the resulting files end up with one row of average values for each file that is processed. Can somebody tell me how to do this? Thank you very much in advance.

This should be a working combination.
import pandas as pd
import csv as csv

df = pd.read_csv("file.csv", delimiter=",", skiprows=1)
datamean10 = df[61:240].mean()
datamean15 = df[241:420].mean()
list10 = []
list15 = []
list10.append(datamean10.clip(0))
list15.append(datamean15.clip(0))

csvfile = "C:/Users/bla/bla/list10.csv"
with open(csvfile, 'w') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(list10)

import sys, os
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
    for file in files:
        if file.endswith('.csv'):
            #csvfile = "C:/Users/bla/bla/list15.csv"
            with open(file, 'w') as output:
                writer = csv.writer(output, lineterminator='\n')
                writer.writerows(list15)

Just open your output files in append mode inside the inner script:
with open(csvfile, 'a') as output:
and truncate them only in the master script:
with open("C:/Users/bla/bla/list10.csv", 'w') as output:
pass
with open("C:/Users/bla/bla/list15.csv", 'w') as output:
pass
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
...
But using os.system to call a Python script from another Python script is bad. It would be better to write the working part of the inner script in a function, and then use it from the outer script after importing it:
Inner script (averagelist.py)
import pandas as pd
import csv as csv

def average(file):
    df = pd.read_csv(file, delimiter=",", skiprows=1)
    ...
    with open(csvfile, 'a') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(list15)
Outer script:
import os
import averagelist

with open("C:/Users/bla/bla/list10.csv", 'w') as output:
    pass
with open("C:/Users/bla/bla/list15.csv", 'w') as output:
    pass
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
    for file in files:
        if file.endswith('.csv'):
            averagelist.average(root + '\\' + file)

Related

Python - Read files from folder and Write CSV file in format

import glob
import os
import csv
from collections import OrderedDict

# Remove output file if it already exists. Resolves the append issue
file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
if os.path.isfile(file_path):
    os.remove(file_path)

list_of_files = glob.glob('C:\\Users\\Desktop\\Cobol\\*.CBL')  # Input files in Folder
Fields = ['Program Name', 'LinesofCode']  # to be displayed in output CSV file

# opening output csv file to write (Fields)
file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
with open(file_path, 'a') as csvfile1:
    csvwriter = csv.writer(csvfile1)
    csvwriter.writerow(Fields)
    csvfile1.close()

def process_files_loc(list_of_files):
    for fileName in list_of_files:
        with open(fileName) as i:
            count = sum(1 for line in i)
            my_dict = {i: count}  # input filename and its lines of code
        ordered_dict = OrderedDict()  # using OrderedDict
        print(ordered_dict)
        # creating ordered dict from dict
        ordered_dict = OrderedDict(my_dict)
        print(ordered_dict)
        # writing records of Program name and LinesofCode to output csv file
        file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
        with open(file_path, 'a') as csvfile2:
            csvwriter = csv.writer(csvfile2)
            csvwriter.writerows(ordered_dict)
            csvfile2.close()

process_files_loc(list_of_files)
Output in Terminal (Error):
PS C:\Users\Python-1> & C:/Users/AppData/Local/Programs/Python/Python310/python.exe c:/Users/Python-1/one.py
OrderedDict()
OrderedDict([(<_io.TextIOWrapper name='C:\\Users\\Desktop\\Cobol\\ABCDEFGH.CBL' mode='r' encoding='cp1252'>, 191)])
OrderedDict()
OrderedDict([(<_io.TextIOWrapper name='C:\\Users\\Desktop\\Cobol\\IJKLMNOP.CBL' mode='r' encoding='cp1252'>, 195)])
Actual output file in the folder:
C:\Users\Desktop\Cobol\Outputs
Name         Date Modified    Type             Size
LOC3X.csv    9/15/2022 time   Comma Separated  1KB
Problem: the script executed, read the 2 CBL files in the folder, and created 1 CSV file in the output folder. The output CSV file should have:
Program Name    LinesofCode
ABCDEFGH.CBL    191
IJKLMNOP.CBL    195
However, the only line actually present in the CSV file is:
Program Name    LinesofCode
Try something like this:
import glob
import csv
import os

def process_files_loc(files):
    res = []
    for file in files:
        with open(file) as f:
            line_count = len([line.strip("\n") for line in f if line != "\n"])
            res.append([os.path.basename(f.name), line_count])
    return res

if __name__ == '__main__':
    with open('C:\\Users\\Main\\Desktop\\test\\test.csv', 'w', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(['Program Name', 'LinesofCode'])
        csvwriter.writerows(process_files_loc(glob.glob('C:\\Users\\Main\\Desktop\\test\\*.PY')))
Result: (screenshot of the resulting CSV)
Regards,

Python iterate folder of csv and convert to json

I am an amateur at Python, but I have the task of converting a folder of CSVs to JSON files. I have this script working with a specified CSV file, but I have no idea how to make the script iterate through a folder of CSVs and convert all of those CSVs to JSON. The original script:
import csv
import json
import pandas as pd

file = '/users/krzysztofpaszta/CSVtoGD/build-a-bridge.csv'
json_file = '/users/krzysztofpaszta/CSVtoGD/build-a-bridge.json'

# Read the CSV file
def read_CSV(file, json_file):
    csv_rows = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        field = reader.fieldnames
        for row in reader:
            csv_rows.extend([{field[i]: row[field[i]] for i in range(len(field))}])
        convert_write_json(csv_rows, json_file)

# Convert CSV to JSON
def convert_write_json(data, json_file):
    with open(json_file, "w") as f:
        f.write(json.dumps(data, sort_keys=False, indent=4, separators=(',', ': ')))
        f.write(json.dumps(data))

read_CSV(file, json_file)
Can someone give me a hint?
You can use os functions, particularly os.listdir() to iterate over files in the directory, and safely generate new names with os.path.splitext():
import os

DIRECTORY = "/path/to/dir"

for f in os.listdir(os.fsencode(DIRECTORY)):
    fn = os.fsdecode(f)
    pre, ext = os.path.splitext(fn)
    if ext == ".csv":
        read_CSV(fn, pre + '.json')
A similar approach with pathlib would be:
from pathlib import Path

DIRECTORY = "/path/to/dir"

files = Path(DIRECTORY).glob('*.csv')   # to process files only in this dir
files = Path(DIRECTORY).rglob('*.csv')  # to process files in sub-directories recursively

for f in files:
    read_CSV(f, str(f.with_suffix('.json')))  # use .with_suffix() for safe name generation
You can list the csv files in a folder using pathlib:
from pathlib import Path

csv_files = Path().glob('*.csv')
Then loop over the files:
for csv_file in csv_files:
    csv_path = str(csv_file.absolute())
    json_path = csv_path.replace('.csv', '.json')
    read_CSV(csv_path, json_path)

Merge csv files, add original file name to each row in output file

I have multiple csv files in a folder with the same data structure,
0.00;1.05;10.5
0.01;2.05;15.5
0.02;3.05;20.5
...
I want to merge all the csv files into 1 summary file and add a column with the file name to each line, depending on the original data source.
0.00;1.05;10.5;csv1.csv
0.01;2.05;15.5;csv1.csv
0.02;3.05;20.5;csv1.csv
0.00;5.05;0.05;csv2.csv
0.01;6.05;1.05;csv2.csv
0.02;7.05;2.05;csv2.csv
...
I managed to merge the files, but can't find a way to add the file names.
files = []
for file in os.listdir(folder):
    if file.endswith('.csv'):
        files.append(file)

with open('results.csv', 'w', newline='') as fw:
    cw = csv.writer(fw)
    for file in files:
        with open(file, newline='') as f:
            cr = csv.reader(islice(f, 13, None))
            cw.writerows(cr)
I don't want to use pandas concat due to RAM limitations.
Thank you.
You don't need to parse the input csv files, just append a delimiter and then the current file name to each line. You can use the fileinput module:
import fileinput
from pathlib import Path

folder = '.'  # set accordingly, assume current directory
path = Path(folder)

with fileinput.input(files=path.glob('*.csv')) as f, open('results.csv', 'w') as outfile:
    for line in f:
        print(';'.join([line.rstrip('\n'), Path(fileinput.filename()).name]), file=outfile)
Regarding your code, you can fix it like this:
import os
import csv

folder = '.'

files = []
for file in os.listdir(folder):
    if file.endswith('.csv'):
        files.append(file)

with open('results.csv', 'w', newline='') as fw:
    cw = csv.writer(fw, delimiter=';')
    for file in files:
        with open(file, newline='') as f:
            for row in csv.reader(f, delimiter=';'):
                row.append(file)
                cw.writerow(row)
Here the delimiter argument is set to semi-colon because the default delimiter is a comma and your files use ;. That fixes the parsing of the input csv files, and uses ; for the output file as well. Each input file is then processed row by row, with the filename appended to each row list. Finally, the new row is written to the output CSV file.
You can use os and pandas:
import os
import pandas as pd

basedir = <path of your base folder>

all_dfs = []
for filename in filter(lambda f: os.path.splitext(f)[1] == '.csv', next(os.walk(basedir))[2]):
    curr_df = pd.read_csv(os.path.join(basedir, filename), sep=';', header=None)
    curr_df['filename'] = filename
    all_dfs.append(curr_df)

pd.concat(all_dfs, axis=0).to_csv('merged_cvs.csv', sep=';', header=False, index=False)
Or, if you prefer, in only one line:
pd.concat([pd.concat((df, pd.DataFrame([f for _ in range(len(df))])), axis=1) for f, df in
           ((filename, pd.read_csv(os.path.join(basedir, filename), sep=';', header=None))
            for filename in filter(lambda f: os.path.splitext(f)[1] == '.csv', next(os.walk(basedir))[2]))
           ]).to_csv('merged_cvs.csv', sep=';', header=False, index=False)
files = []
for file in os.listdir(folder):
    if file.endswith('.csv'):
        files.append(file)

with open('results.csv', 'w', newline='') as fw:
    cw = csv.writer(fw)
    for file in files:
        with open(file, newline='') as f:
            fw.write(f"{file}\n")  # just write the filename before the content :)
            cr = csv.reader(islice(f, 13, None))
            cw.writerows(cr)

Transposing all csv files within a folder

I got help the last time I asked a question on this site regarding batch processing csv files within a folder using glob.glob() with Python. I am trying to use it this time to transpose all csv files within a folder. The script below only processes the last file and stops. What am I doing wrong?
import csv
import os
import glob

directory = raw_input("INPUT Folder")
output = raw_input("OUTPUT Folder:")
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'
with open(os.path.join(output, filename), 'wb') as output_file:
    writer = csv.writer(output_file)
    for i in range(len(max(cols, key=len))):
        writer.writerow([(c[i] if i < len(c) else '') for c in cols])
You need to indent the "output" portion of the code so that it runs once for each iteration of the for in_file loop:
import csv
import os
import glob

directory = raw_input("INPUT Folder")
output = raw_input("OUTPUT Folder:")
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

    # "outdent" this code so it only needs to run once for each in_file
    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'

    # Indent this to the same level as the rest of the "for in_file" loop!
    with open(os.path.join(output, filename), 'wb') as output_file:
        writer = csv.writer(output_file)
        for i in range(len(max(cols, key=len))):
            writer.writerow([(c[i] if i < len(c) else '') for c in cols])
In your version that code only runs once, after the for in_file loop has completed, and therefore only outputs cols data left over from the final iteration of that loop.
I have also "outdented" the filename = ... statement to the for in_file level, as this only needs to be done once for each in_file, not once for each row of each in_file.
You can get a lot of mileage with data manipulation using pandas:
import os
import pandas as pd

for filename in os.listdir('.'):
    # We save an augmented filename later,
    # so using splitext is useful for more
    # than just checking the extension.
    prefix, ext = os.path.splitext(filename)
    if ext.lower() != '.csv':
        continue

    # Load the data into a dataframe
    df = pd.DataFrame.from_csv(filename,
                               header=None,
                               index_col=None,
                               parse_dates=False)

    # Transpose is easy, but you could do TONS
    # of data processing here. pandas is awesome.
    df_transposed = df.T

    # Save to a new file with an augmented name
    df_transposed.to_csv(prefix + '_T' + ext, header=True, index=False)
The os.walk version is not much different, if you need to dig into subfolders as well.
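For reference, a minimal sketch of that os.walk variant might look like the following; the root folder '.' and the '_T' output suffix are just illustrative, and pd.read_csv stands in for the older from_csv call used above:
import os
import pandas as pd

# Walk the tree rooted at '.' and transpose every CSV found,
# writing each result next to its source file with a '_T' suffix.
for root, dirs, files in os.walk('.'):
    for filename in files:
        prefix, ext = os.path.splitext(filename)
        if ext.lower() != '.csv':
            continue
        in_path = os.path.join(root, filename)
        df = pd.read_csv(in_path, header=None)
        df.T.to_csv(os.path.join(root, prefix + '_T' + ext),
                    header=False, index=False)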
Here is a working one. I had to google for an hour, but it works and was tested on Python 3.3:
import csv
import os
import glob

directory = 'C:\Python33\csv'
output = 'C:\Python33\csv2'
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

    # "outdent" this code so it only needs to run once for each in_file
    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'

    # Indent this to the same level as the rest of the "for in_file" loop!
    with open(os.path.join(output, filename), 'w') as output_file:
        writer = csv.writer(output_file)
        for i in range(len(max(cols, key=len))):
            writer.writerow([(c[i] if i < len(c) else '') for c in cols])
in_files will only return a single result in that format. Try returning a list:
in_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

Read CSV from within Zip File

I have a directory of zip files (approximately 10,000 small files), and within each is a CSV file I am trying to read and split into a number of different CSV files.
I managed to write the code, shown below, to split the CSV files from a directory of CSVs: it reads the first attribute of each row and, depending on what it is, writes the row to the relevant CSV.
import csv
import os
import sys
import re
import glob

reader = csv.reader(open("C:/Projects/test.csv", "rb"), delimiter=',', quotechar='"')
write10 = csv.writer(open('ouput10.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
write15 = csv.writer(open('ouput15.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

headings10 = ["RECORD_IDENTIFIER", "CUSTODIAN_NAME", "LOCAL_CUSTODIAN_NAME", "PROCESS_DATE", "VOLUME_NUMBER", "ENTRY_DATE", "TIME_STAMP", "VERSION", "FILE_TYPE"]
write10.writerow(headings10)

headings15 = ["RECORD_IDENTIFIER", "CHANGE_TYPE", "PRO_ORDER", "USRN", "STREET_DESCRIPTION", "LOCALITY_NAME", "TOWN_NAME", "ADMINSTRATIVE_AREA", "LANGUAGE"]
write15.writerow(headings15)

for row in reader:
    type = row[0]
    if "10" in type:
        write10.writerow(row)
    elif "15" in type:
        write15.writerow(row)
So I am now trying to read the Zip files rather than wasting time extracting them first.
This is what I have so far, after following as many tutorials as I could find:
import glob
import os
import csv
import zipfile
import StringIO

for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv = '.'.join([dataFile, 'csv'])
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv))
    reader = csv.reader(data)
    for row in reader:
        print row
However an error gets thrown:
AttributeError: 'str' object has no attribute 'reader'
Hopefully someone can show me how to change my working CSV reading code so that it reads from the Zip files instead.
Much appreciated,
Tim
Simple fix. You're overriding the csv module with your local csv variable. Just change the name of that variable:
import glob
import os
import csv
import zipfile
import StringIO

for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv_file = '.'.join([dataFile, 'csv'])  # all fixed
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv_file))  # don't forget this line!
    reader = csv.reader(data)
    for row in reader:
        print row
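If anyone lands here on Python 3 (where the StringIO module and print row no longer work), a roughly equivalent sketch, reusing the same placeholder path and assuming each zip contains a same-named CSV encoded as UTF-8, could open the archive member as a text stream directly:
import csv
import glob
import io
import os
import zipfile

for name in glob.glob('C:/Projects/abase/*.zip'):
    # Assume each archive holds a CSV named after the zip itself
    csv_name = os.path.splitext(os.path.basename(name))[0] + '.csv'
    with zipfile.ZipFile(name) as zfile:
        with zfile.open(csv_name) as member:
            # Wrap the binary member so csv.reader receives text lines
            reader = csv.reader(io.TextIOWrapper(member, encoding='utf-8'))
            for row in reader:
                print(row)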
