I have 118 CSVs, and I need to go into each CSV and change F1, F2, F3 and so on to 0.
For example, in csv1, F1 = 0; in csv2, F2 = 0; in csv3, F3 = 0; and so on.
The CSV has headers:
I am assuming all of your CSV files have the same format, and that you are trying to set column F to be 0 for all of them.
You can use Python's csv library to help you, as follows:
import csv
import glob

for filename in glob.glob('*.csv'):
    print(f'Processing: {filename}')
    with open(filename) as f_input:
        csv_input = csv.reader(f_input)
        header = next(csv_input)
        rows = [[*row[:5], '0'] for row in csv_input]
    with open(filename, 'w', newline='') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow(header)
        csv_output.writerows(rows)
This reads all the .csv files from a given folder and changes the Mult Col 2 value (the sixth column) to 0. It does this for every row but leaves the header unchanged.
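If the column you need is not always the sixth one, a small variation of the same idea (just a sketch, assuming the header cell is literally named 'Mult Col 2') is to look the column index up from the header row instead of hard-coding position 5:

import csv
import glob

for filename in glob.glob('*.csv'):
    with open(filename, newline='') as f_input:
        rows = list(csv.reader(f_input))
    header, body = rows[0], rows[1:]
    col = header.index('Mult Col 2')  # assumed header name; adjust to match your file
    for row in body:
        row[col] = '0'
    with open(filename, 'w', newline='') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow(header)
        csv_output.writerows(body)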
Thank you all. I made my own solution; it is a lot less classy than the ones posted here, but I automated it all the way from needing x number of files to amending the col/row.
#==============================================================================
# Import the necessary packages
import os
#import glob
import shutil
import pathlib
import pandas as pd
#import numpy as np
#==============================================================================
InputPath = 'F:\\cells\\bc_dbase\\bc_dbase1.csv'
OutputPath = 'F:\\cells\\bc_dbase'
str1 = 'Name '
str2 = 'Mult Col 2'
NoStart = 1
NoEnd = 119
#==============================================================================
# Create complete path of folders
def CreatePath(FullPath, File=False):
    Parts = pathlib.Path(FullPath).parts
    for [n1, Folder] in enumerate(Parts):
        if File == True and n1 == len(Parts) - 1 and "." in Parts[n1]:
            continue
        elif n1 == 0:
            FolderPath = Parts[n1]
        else:
            FolderPath = os.path.join(FolderPath, Folder)
        if os.path.exists(FolderPath) == False:
            os.mkdir(FolderPath)
#==============================================================================
# Delete folder
def DeleteFolder(FullPath):
    FullPath = pathlib.Path(FullPath)
    try:
        shutil.rmtree(FullPath)
    except:
        pass
#==============================================================================
CreatePath(OutputPath, File=False)
[FolderPath, File] = os.path.split(InputPath)
[FileName, FileExt] = os.path.splitext(os.path.basename(InputPath))
ReversedFileName = FileName[::-1]
AdjFileName = FileName
for n1 in reversed(range(len(AdjFileName))):
    char = FileName[n1]
    if char.isdigit():
        AdjFileName = AdjFileName[:n1] + AdjFileName[(n1+1):]
    else:
        break
Data1 = pd.read_csv(InputPath)
Data2 = pd.DataFrame.copy(Data1)
NameCols = Data1.columns
if str2 in NameCols:
    Data2.loc[:, str2] = 1
for n1 in range(NoStart, NoEnd+1):
    NewFile = AdjFileName + str(n1) + FileExt
    NewFilePath = os.path.join(OutputPath, NewFile)
    Data3 = pd.DataFrame.copy(Data2)
    index = Data3[Data3[str1] == n1].index[0]
    Data3.loc[index, str2] = 0
    Data3.to_csv(NewFilePath, index=False)
    print('[INFO] Storing file:', NewFilePath)
#==============================================================================
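For anyone skimming, here is a more compact sketch of the same idea (one template CSV in, one numbered CSV out per row, zeroing Mult Col 2 for the matching Name). It assumes the same paths and column names as the script above, including the trailing space in 'Name ':

import pandas as pd

template = pd.read_csv('F:\\cells\\bc_dbase\\bc_dbase1.csv')
template['Mult Col 2'] = 1                        # reset the column everywhere
for n in range(1, 120):                           # files 1..119, as above
    out = template.copy()
    out.loc[out['Name '] == n, 'Mult Col 2'] = 0  # zero only the matching row
    out.to_csv('F:\\cells\\bc_dbase\\bc_dbase{}.csv'.format(n), index=False)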
Mr. Evans has pretty neat code using Python's csv library, so I will expand on it a bit to answer your specific question.
import csv
import glob

file_count = 0
for filename in glob.glob('*.csv'):
    file_count += 1
    print(f'Processing: {filename}')
    with open(filename) as f_input:
        csv_input = csv.reader(f_input)
        header = next(csv_input)
        line_count = 0
        rows = []
        for row in csv_input:
            line_count += 1
            if line_count == file_count:
                rows.append([*row[:5], '0'])
            else:
                rows.append([*row[:6]])
    with open(filename, 'w', newline='') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow(header)
        csv_output.writerows(rows)
Note: the code will run for all the .csv files in the working directory and will process the files in alphabetical order.
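One caveat: alphabetical order puts csv10.csv before csv2.csv, and the row-per-file logic above depends on the files being visited in their numeric order. A small sketch (assuming each filename contains a single run of digits) that sorts them numerically instead:

import glob
import re

def numeric_key(filename):
    # Sort by the first run of digits in the name; names without digits sort first.
    match = re.search(r'\d+', filename)
    return int(match.group()) if match else -1

for filename in sorted(glob.glob('*.csv'), key=numeric_key):
    print(f'Processing: {filename}')
    # ...same per-file logic as above...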
Related
I have a directory containing about 1700 pickle files; every file holds all the Twitter posts of one user. I want to convert it into a folder of CSV files, where every CSV file name is the name of the pickle file and each row contains one tweet of the user...
After that, I want just the top 20 CSVs with more samples than the others... how can I do that?
# khabarlist = open_file_linebyline(pkl_path)

def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        # if len(file.name.split()) > 1:
        #     continue
        # if file.split('.')[-1] != "pkl":
        with open(file, 'r', encoding='utf8') as f:
            items = [i.strip() for i in f.read().split(",")]
            my_dict[file.replace(".pkl", "")] = items
            df = pd.DataFrame(my_dict)
            df.to_excel(file.replace(".pkl", "") + "xlsx")

open_dir_in_dict("Raw/")
I wrote the sample code for it and it did not work...
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        if len(file.name.split()) > 1:
            continue
        if file.split('.')[-1] != "pkl":
            with open(file, 'r', encoding='utf-8', errors='replace') as f:
                print(f.readlines())
                items = [i.strip() for i in f.read().split(",")]  # encode('utf-8').strip()
                my_dict[file.replace(".pkl", "")] = items
                df = pd.DataFrame(my_dict)
                df.to_excel(file.replace(".pkl", "") + "xlsx")

# open_dir_in_dict("Raw/")
and a better answer...
import os
import pandas as pd
import regex as re

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"
for path in os.listdir(data_path):
    my_tweets = []
    df = pd.read_pickle(data_path + path)
    for tweet in df.tweet:
        url = re.findall(r"http\S+", tweet)
        if url == []:
            my_tweets.append(tweet)
    new_df = pd.DataFrame({"tweets": my_tweets, "author": path.replace(".pkl", "")})  # path[:-4]
    new_df.to_csv("/content/drive/My Drive/twint/final.csv", index=False, mode="a")
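For the "top 20 CSVs with the most samples" part of the question, one possible sketch (assuming the per-user CSVs have been written to a folder, one tweet per row; the folder name here is hypothetical) is to count rows per file and keep the 20 largest:

import os
import pandas as pd

csv_dir = "output_csvs"  # hypothetical folder holding the per-user CSV files
counts = {name: len(pd.read_csv(os.path.join(csv_dir, name)))
          for name in os.listdir(csv_dir) if name.endswith(".csv")}
top20 = sorted(counts, key=counts.get, reverse=True)[:20]
print(top20)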
I'm using Python 3, and I am having trouble appending lines from multiple CSV files into multiple rows of the Master_Total.csv file. I suspect that it is due to not having a "pre-existing" blank row at the end of each csv file. If this is true, how do I add a new blank row to each TOTAL.csv file?
TOTAL.csv file:
GND, 0.1V, 1.0V, REFKelvin,
0.000000, 0.100436, 1.003407, 150318.406250,
[no empty row]
enviro.csv file:
temp [C], pressure [kPa], humidity [%RH]
23.870001, 85.903000, 33.75244
[empty row]
When I run my script I get this:
Master_Total.csv
GND, 0.1V, 1.0V, REFKelvin,
0.000000, 0.100436, 1.003407, 150318.4062500.000000, 0.100764, 1.005011, 100.3399580.000019, 0.100252, 1.002642, 100.214996...
Master_enviro.csv
temp [C], pressure [kPa], humidity [%RH]
23.870001, 85.903000, 33.752441
23.760000, 85.914001, 32.997131
24.040001, 85.879997, 33.134460
...
Code:
import shutil, glob, csv, os, sys

path = r'directory'
Enviro_Files = glob.glob(path + "**/*enviro.csv")
Total_Files = glob.glob(path + "**/*TOTAL.csv")

with open('directory_desktop/Master_enviro.csv', 'wb') as outfile1:
    for i, filename1 in enumerate(Enviro_Files):
        with open(filename1, 'rb') as inputfile1:
            if i != 0:
                inputfile1.readline()
            shutil.copyfileobj(inputfile1, outfile1)
            print(filename1 + " has been imported.")

with open('directory_desktop/Master_TOTAL.csv', 'wb') as outfile2:
    for h, filename2 in enumerate(Total_Files):
        with open(filename2, 'rb') as inputfile2:
            if h != 0:
                inputfile2.readline()
            shutil.copyfileobj(inputfile2, outfile2)
            print(filename2 + " has been imported.")
If you make use of Python's csv library, you can easily test whether a given row has any values in it; that way it does not matter whether or not there are empty lines, as they will simply be skipped over when writing the master files:
import csv
import glob

def merge_csvs(target_filename, csv_list):
    with open(target_filename, 'w', newline='') as f_master_target:
        csv_master_target = csv.writer(f_master_target)
        write_header = True
        for csv_filename in csv_list:
            with open(csv_filename, 'r', newline='') as f_single:
                csv_single = csv.reader(f_single)
                header = next(csv_single)
                if write_header:
                    csv_master_target.writerow(header)
                    write_header = False
                for row in csv_single:
                    if row:
                        csv_master_target.writerow(row)

path = 'directory'
Enviro_Files = glob.glob(path + "**/*enviro.csv")
Total_Files = glob.glob(path + "**/*TOTAL.csv")

merge_csvs('Master_enviro.csv', Enviro_Files)
merge_csvs('Master_TOTAL.csv', Total_Files)
I'm new to MPI and I'm trying to parallelize the following code with mpi4py. It's a code that reads roughly 100 CSV files and rearranges them.
The problem comes after the parallelization: how to join the arrays 'data' and 'mapData' in order.
The original code, without parallelization:
import csv
import sys
import numpy as np
import StringIO
import re

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

#def sprintf(buf, fmt, *args):
#    buf.write(fmt % args)

from os import listdir
from os.path import isfile, join
import os, glob

folder = os.getcwd() + '/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

filelist = [[]] * len(files_in_dir)
for file in xrange(len(files_in_dir)):
    fileName = folder + files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)
        for row in reader:
            #next(reader)
            filelist[file].append(row)  # save each row
    f.close()

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index

data = [[]] * len(filelist[0])  # len(filelist[0]) = column length
mapData = [[]] * len(filelist[0])
for i in xrange(len(data)):
    data[i] = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData[i] = filelist[0][i][4:7]

with open('mapdata.csv', 'wb') as mapdatares:
    writer = csv.writer(mapdatares)
    writer.writerows(mapData)
And this is what I have managed to do; my problem is when I call 'comm.allreduce'.
import csv
import sys
import numpy as np
import StringIO
import re
from mpi4py import MPI
from mpi4py.MPI import ANY_SOURCE

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

from os import listdir
from os.path import isfile, join
import os, glob
import math

folder = os.getcwd() + '/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

# count the number of csv files
print "number of files is: ", len(files_in_dir)

# divide the files among the processors
filesrank = len(files_in_dir) / size
intero = math.trunc(filesrank)
fremain = len(files_in_dir) % size
if fremain > rank:
    sizeproc = intero + 1
else:
    sizeproc = intero

# extremes of the intervals
a = rank * sizeproc
b = a + sizeproc

# parallelize
filelist = [[]] * len(files_in_dir[a:b])
for file in xrange(len(files_in_dir[a:b])):
    fileName = folder + files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)  # read the csv file, skipping the header
        for row in reader:
            #next(reader)
            filelist[file].append(row)  # save each row
    f.close()

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index

data = [[]] * len(filelist[0])  # len(filelist[0]) = column length
mapData = [[]] * len(filelist[0])
for i in xrange(len(data)):
    data = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData = filelist[0][i][4:7]

totmapData = []
totData = []
comm.allreduce(mapData, totmapData, op=MPI.SUM)
comm.allreduce(data, totData, op=MPI.SUM)
Any suggestions? Thanks.
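A hedged note on the comm.allreduce calls above, based on how mpi4py's lower-case, pickle-based API generally behaves: allreduce takes the send object and returns the reduced result rather than filling in a second output argument, and with op=MPI.SUM plain Python lists are concatenated. If the per-rank pieces need to stay in rank order, allgather is usually the more natural fit. A minimal sketch, assuming data and mapData are ordinary Python lists on every rank:

from mpi4py import MPI

comm = MPI.COMM_WORLD

# allreduce returns the result; with MPI.SUM, lists from all ranks are concatenated
totmapData = comm.allreduce(mapData, op=MPI.SUM)
totData = comm.allreduce(data, op=MPI.SUM)

# Alternative that keeps the pieces explicitly ordered by rank
pieces = comm.allgather(data)                      # one list per rank, in rank order
totData_ordered = [row for piece in pieces for row in piece]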
As a new Python programmer, I'm having trouble figuring out how to accomplish the following.
Given this input data from a csv file:
Sku Image_Name
B001 a.jpg
B002 a.jpg
B001 b.jpg
B002 c.jpg
B003 x.jpg
Multiple Skus might have the same image name. When that occurs, I want to rename the image in the Image_Name column by appending "_" plus the Sku value to the image name in that same row, as shown.
So the desired output data would be:
Sku Image_Name
B001 a_B001.jpg
B002 a_B002.jpg
B001 b.jpg
B002 c.jpg
B003 x.jpg
After that it should rename the images in the image folder according to the Image_Name column.
This is all I have so far:
import csv

# open and store the csv file
with open('D:\\test.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
OK, you've got quite a ways to go, but this should give you some hints on how to proceed (assuming a reasonable number of files):
import csv
from os.path import splitext

with open("/tmp/test.csv", 'rb') as csvfile:
    itemList = []
    renamedList = []
    keyList = []
    spamreader = csv.reader(csvfile, delimiter=",")
    for row in spamreader:
        keyList.append(row[0])
        itemList.append(row[1])
        renamedList.append(row[1])

toBeChanged = [itemNum for itemNum, item in enumerate(itemList)
               if itemList.count(item) > 1]
for itemNum in toBeChanged:
    name, ext = splitext(itemList[itemNum])
    renamedList[itemNum] = '{}_{}{}'.format(name, keyList[itemNum], ext)

# At this point we have your desired info and can print it just like you
# have above
print("Sku\tImage_Name")
for row in zip(keyList, renamedList):
    print(row[0] + '\t' + row[1])

# Duplicating / renaming files is next. This isn't the only way
# to do it (or the most efficient), but it's an easy way to understand.
# The idea is to first make copies of all needed files...
from shutil import copyfile
changedNames = []
for itemNum in toBeChanged:
    copyfile(itemList[itemNum], renamedList[itemNum])
    changedNames.append(itemList[itemNum])

# ...and then delete the originals. The set is used to eliminate
# duplicates.
from os import remove
for item in set(changedNames):
    remove(item)
There are lots of ways you can improve this code. The intent here was to make it more understandable. Understand it first, improve it second.
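As one example of such an improvement (a sketch, not a drop-in rewrite): itemList.count(item) inside the list comprehension rescans the whole list for every row, so for a large CSV it is cheaper to build a collections.Counter once and look the counts up:

from collections import Counter

name_counts = Counter(itemList)
toBeChanged = [itemNum for itemNum, item in enumerate(itemList)
               if name_counts[item] > 1]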
import csv
import os
from os.path import splitext  # splits name & extension from a file
import shutil  # making a duplicate copy of a file
from os import rename

# open and read the csv
with open('test.csv') as csvfile:
    # create lists for sku, old image name and new image name
    itemList = []
    renamedList = []
    keyList = []
    spamreader = csv.reader(csvfile, delimiter=",")
    # process one row at a time
    for row in spamreader:
        keyList.append(row[0])       # sku
        itemList.append(row[1])      # old image name
        renamedList.append(row[1])   # new image name

# process only skus sharing the same image
toBeChanged = [itemNum for itemNum, item in enumerate(itemList)
               if itemList.count(item) > 1]

for itemNum in toBeChanged:
    name, ext = splitext(itemList[itemNum])  # split image name & extension, e.g. "a.jpg" -> "a" & ".jpg"
    oldFileName = name + ext
    print("oldFileName = " + oldFileName)    # oldFileName = a.jpg
    newFileName = '{}_{}{}'.format(name, keyList[itemNum], ext)
    print("newFileName = " + newFileName)    # newFileName = a_B001.jpg & a_B002.jpg
    # check if the image file exists
    if os.path.isfile(oldFileName):
        shutil.copy2(oldFileName, newFileName)  # create a duplicate image file
        renamedList[itemNum] = '{}_{}{}'.format(name, keyList[itemNum], ext)  # a_B001.jpg
        # os.remove(oldFileName)

# write the final output to a new csv
with open('newCsv.csv', 'w') as mycsv:
    csvWriter = csv.writer(mycsv, delimiter=",")
    for row in zip(keyList, renamedList):
        print(row[0] + '\t' + row[1])
        csvWriter.writerow([row[0], row[1]])
I got help the last time I asked a question on this site regarding batch processing csv files within a folder using glob.glob() with Python. I am trying to use it this time to transpose all csv files within a folder. The script below only processes the last file and stops. What am I doing wrong?
import csv
import os
import glob

directory = raw_input("INPUT Folder")
output = raw_input("OUTPUT Folder:")
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)
            filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'

with open(os.path.join(output, filename), 'wb') as output_file:
    writer = csv.writer(output_file)
    for i in range(len(max(cols, key=len))):
        writer.writerow([(c[i] if i < len(c) else '') for c in cols])
You need to indent the "output" portion of the code so that it runs once for each iteration of the for in_file loop:
import csv
import os
import glob

directory = raw_input("INPUT Folder")
output = raw_input("OUTPUT Folder:")
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)
    # "outdent" this code so it only needs to run once for each in_file
    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'
    # Indent this to the same level as the rest of the "for in_file" loop!
    with open(os.path.join(output, filename), 'wb') as output_file:
        writer = csv.writer(output_file)
        for i in range(len(max(cols, key=len))):
            writer.writerow([(c[i] if i < len(c) else '') for c in cols])
In your version that code only runs once, after the for in_file loop has completed, and therefore only outputs cols data left over from the final iteration of that loop.
I have also "outdented" the filename = ... statement to the for in_file level, as this only needs to be done once for each in_file, not once for each row of each in_file.
You can get a lot of mileage with data manipulation using pandas:
import os
import pandas as pd

for filename in os.listdir('.'):
    # We save an augmented filename later,
    # so using splitext is useful for more
    # than just checking the extension.
    prefix, ext = os.path.splitext(filename)
    if ext.lower() != '.csv':
        continue

    # Load the data into a dataframe
    df = pd.DataFrame.from_csv(filename,
                               header=None,
                               index_col=None,
                               parse_dates=False)

    # Transpose is easy, but you could do TONS
    # of data processing here. pandas is awesome.
    df_transposed = df.T

    # Save to a new file with an augmented name
    df_transposed.to_csv(prefix + '_T' + ext, header=True, index=False)
The os.walk version is not much different, if you need to dig into subfolders as well.
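For example, a minimal sketch of the os.walk variant, assuming you still want the transposed copy written next to each original file (pd.read_csv is used here in place of the older from_csv):

import os
import pandas as pd

for dirpath, dirnames, filenames in os.walk('.'):
    for filename in filenames:
        prefix, ext = os.path.splitext(filename)
        if ext.lower() != '.csv':
            continue
        full_path = os.path.join(dirpath, filename)
        df = pd.read_csv(full_path, header=None)
        # Transpose and save alongside the original with an augmented name
        df.T.to_csv(os.path.join(dirpath, prefix + '_T' + ext),
                    header=False, index=False)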
Here is a working one. I had to Google for an hour, but it works, tested on Python 3.3:
import csv
import os
import glob

directory = 'C:\Python33\csv'
output = 'C:\Python33\csv2'
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)
    # "outdent" this code so it only needs to run once for each in_file
    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'
    # Indent this to the same level as the rest of the "for in_file" loop!
    with open(os.path.join(output, filename), 'w') as output_file:
        writer = csv.writer(output_file)
        for i in range(len(max(cols, key=len))):
            writer.writerow([(c[i] if i < len(c) else '') for c in cols])
in_files will only return a single result in that format. Try returning a list:
in_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
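If you go that route, keep in mind that os.listdir returns bare file names, so (a small sketch, reusing the directory variable from the question) you would join each name back onto the directory before opening it:

import os

in_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
for name in in_files:
    in_file = os.path.join(directory, name)
    with open(in_file) as input_file:
        ...  # same transpose logic as above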