I have a csv file of about 5000 rows in python i want to split it into five files.
I wrote a code for it but it is not working
import codecs
import csv
NO_OF_LINES_PER_FILE = 1000
def again(count_file_header,count):
f3 = open('write_'+count_file_header+'.csv', 'at')
with open('import_1458922827.csv', 'rb') as csvfile:
candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
co = 0
for row in candidate_info_reader:
co = co + 1
count = count + 1
if count <= count:
pass
elif count >= NO_OF_LINES_PER_FILE:
count_file_header = count + NO_OF_LINES_PER_FILE
again(count_file_header,count)
else:
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
writer.writerow(row)
def read_write():
f3 = open('write_'+NO_OF_LINES_PER_FILE+'.csv', 'at')
with open('import_1458922827.csv', 'rb') as csvfile:
candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
count = 0
for row in candidate_info_reader:
count = count + 1
if count >= NO_OF_LINES_PER_FILE:
count_file_header = count + NO_OF_LINES_PER_FILE
again(count_file_header,count)
else:
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
writer.writerow(row)
read_write()
The above code creates many fileswith empty content.
How to split one files into five csv files?
In Python
Use readlines() and writelines() to do that, here is an example:
>>> csvfile = open('import_1458922827.csv', 'r').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
... if i % 1000 == 0:
... open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+1000])
... filename += 1
the output file names will be numbered 1.csv, 2.csv, ... etc.
From terminal
FYI, you can do this from the command line using split as follows:
$ split -l 1000 import_1458922827.csv
I suggest you not inventing a wheel. There is existing solution. Source here
import os
def split(filehandler, delimiter=',', row_limit=1000,
output_name_template='output_%s.csv', output_path='.', keep_headers=True):
import csv
reader = csv.reader(filehandler, delimiter=delimiter)
current_piece = 1
current_out_path = os.path.join(
output_path,
output_name_template % current_piece
)
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
current_limit = row_limit
if keep_headers:
headers = reader.next()
current_out_writer.writerow(headers)
for i, row in enumerate(reader):
if i + 1 > current_limit:
current_piece += 1
current_limit = row_limit * current_piece
current_out_path = os.path.join(
output_path,
output_name_template % current_piece
)
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
if keep_headers:
current_out_writer.writerow(headers)
current_out_writer.writerow(row)
Use it like:
split(open('/your/pat/input.csv', 'r'));
A python3-friendly solution:
def split_csv(source_filepath, dest_folder, split_file_prefix,
records_per_file):
"""
Split a source csv into multiple csvs of equal numbers of records,
except the last file.
Includes the initial header row in each split file.
Split files follow a zero-index sequential naming convention like so:
`{split_file_prefix}_0.csv`
"""
if records_per_file <= 0:
raise Exception('records_per_file must be > 0')
with open(source_filepath, 'r') as source:
reader = csv.reader(source)
headers = next(reader)
file_idx = 0
records_exist = True
while records_exist:
i = 0
target_filename = f'{split_file_prefix}_{file_idx}.csv'
target_filepath = os.path.join(dest_folder, target_filename)
with open(target_filepath, 'w') as target:
writer = csv.writer(target)
while i < records_per_file:
if i == 0:
writer.writerow(headers)
try:
writer.writerow(next(reader))
i += 1
except StopIteration:
records_exist = False
break
if i == 0:
# we only wrote the header, so delete that file
os.remove(target_filepath)
file_idx += 1
I have modified the accepted answer a little bit to make it simpler
Edited: Added the import statement, modified the print statement for printing the exception. #Alex F code snippet was written for python2, for python3 you also need to use header_row = rows.__next__() instead header_row = rows.next(). Thanks for pointing out.
import os
import csv
def split_csv_into_chunks(file_location, out_dir, file_size=2):
count = 0
current_piece = 1
# file_to_split_name.csv
file_name = file_location.split("/")[-1].split(".")[0]
split_file_name_template = file_name + "__%s.csv"
splited_files_path = []
if not os.path.exists(out_dir):
os.makedirs(out_dir)
try:
with open(file_location, "rb") as csv_file:
rows = csv.reader(csv_file, delimiter=",")
headers_row = rows.next()
for row in rows:
if count % file_size == 0:
current_out_path = os.path.join(out_dir,
split_file_name_template%str(current_piece))
current_out_writer = None
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=",")
current_out_writer.writerow(headers_row)
splited_files_path.append(current_out_path)
current_piece += 1
current_out_writer.writerow(row)
count += 1
return True, splited_files_path
except Exception as e:
print("Exception occurred as {}".format(e))
return False, splited_files_path
#Ryan, Python3 code worked for me. I used newline='' as below to avoid the blank line issue:
with open(target_filepath, 'w', newline='') as target:
Another pandas solution (each 1000 rows), similar to Aziz Alto solution:
suffix = 1
for i in range(len(df)):
if i % 1000 == 0:
df[i:i+1000].to_csv(f"processed/{filename}_{suffix}.csv", sep ='|', index=False, index_label=False)
suffix += 1
where df is the csv loaded as pandas.DataFrame; filename is the original filename, the pipe is a separator; index and index_label false is to skip the autoincremented index columns
A simple Python 3 solution with Pandas that doesn't cut off the last batch
def to_csv_batch(src_csv, dst_dir, size=30000, index=False):
import pandas as pd
import math
# Read source csv
df = pd.read_csv(src_csv)
# Initial values
low = 0
high = size
# Loop through batches
for i in range(math.ceil(len(df) / size)):
fname = dst_dir+'/Batch_' + str(i+1) + '.csv'
df[low:high].to_csv(fname, index=index)
# Update selection
low = high
if (high + size < len(df)):
high = high + size
else:
high = len(df)
Usage example
to_csv_batch('Batch_All.csv', 'Batches')
if count <= count:
pass
This condition is always true so you pass everytime
Otherwise you can look at this post: Splitting a CSV file into equal parts?
I suggest you leverage the possibilities offered by pandas. Here are functions you could use to do that :
def csv_count_rows(file):
"""
Counts the number of rows in a file.
:param file: path to the file.
:return: number of lines in the designated file.
"""
with open(file) as f:
nb_lines = sum(1 for line in f)
return nb_lines
def split_csv(file, sep=",", output_path=".", nrows=None, chunksize=None, low_memory=True, usecols=None):
"""
Split a csv into several files.
:param file: path to the original csv.
:param sep: View pandas.read_csv doc.
:param output_path: path in which to output the resulting parts of the splitting.
:param nrows: Number of rows to split the original csv by, also view pandas.read_csv doc.
:param chunksize: View pandas.read_csv doc.
:param low_memory: View pandas.read_csv doc.
:param usecols: View pandas.read_csv doc.
"""
nb_of_rows = csv_count_rows(file)
# Parsing file elements : Path, name, extension, etc...
# file_path = "/".join(file.split("/")[0:-1])
file_name = file.split("/")[-1]
# file_ext = file_name.split(".")[-1]
file_name_trunk = file_name.split(".")[0]
split_files_name_trunk = file_name_trunk + "_part_"
# Number of chunks to partition the original file into
nb_of_chunks = math.ceil(nb_of_rows / nrows)
if nrows:
log_debug_process_start = f"The file '{file_name}' contains {nb_of_rows} ROWS. " \
f"\nIt will be split into {nb_of_chunks} chunks of a max number of rows : {nrows}." \
f"\nThe resulting files will be output in '{output_path}' as '{split_files_name_trunk}0 to {nb_of_chunks - 1}'"
logging.debug(log_debug_process_start)
for i in range(nb_of_chunks):
# Number of rows to skip is determined by (the number of the chunk being processed) multiplied by (the nrows parameter).
rows_to_skip = range(1, i * nrows) if i else None
output_file = f"{output_path}/{split_files_name_trunk}{i}.csv"
log_debug_chunk_processing = f"Processing chunk {i} of the file '{file_name}'"
logging.debug(log_debug_chunk_processing)
# Fetching the original csv file and handling it with skiprows and nrows to process its data
df_chunk = pd.read_csv(filepath_or_buffer=file, sep=sep, nrows=nrows, skiprows=rows_to_skip,
chunksize=chunksize, low_memory=low_memory, usecols=usecols)
df_chunk.to_csv(path_or_buf=output_file, sep=sep)
log_info_file_output = f"Chunk {i} of file '{file_name}' created in '{output_file}'"
logging.info(log_info_file_output)
And then in your main or jupyter notebook you put :
# This is how you initiate logging in the most basic way.
logging.basicConfig(level=logging.DEBUG)
file = {#Path to your file}
split_csv(file,sep=";" ,output_path={#Path where you'd like to output it},nrows = 4000000, low_memory = False)
P.S.1 : I put nrows = 4000000 because when it's a personal preference. You can change that number if you wish.
P.S.2 : I used the logging library to display messages. When would apply such a function on big files that exist on a remote server, you really want to avoid 'simple printing' and incorporate logging capabilities. You can replace logging.info or logging.debug with print
P.S.3 : Of course, you need to replace the {# Blablabla} parts of the code with your own parameters.
A simpler script works for me.
import pandas as pd
path = "path to file" # path to file
df = pd.read_csv(path) # reading file
low = 0 # Initial Lower Limit
high = 1000 # Initial Higher Limit
while(high < len(df)):
df_new = df[low:high] # subsetting DataFrame based on index
low = high #changing lower limit
high = high + 1000 # givig uper limit with increment of 1000
df_new.to_csv("Path to output file") # output file
import pandas as pd
df = pd.read_csv('input.csv')
file_len = len(df)
filename = 'output'
n = 1
for i in range(file_len):
if i % 10 == 0:
sf = (df[i:i+10])
sf.to_csv(f'{filename}_{n}.csv', index=False)
n += 1
Building upon the top voted answer, here is a python solution that also includes the headers in each file.
file = open('file.csv', 'r')
header = file.readline()
csvfile = file.readlines()
filename = 1
batch_size = 1000
for i in range(len(csvfile)):
if i % batch_size == 0:
open(str(filename) + '.csv', 'w+').writelines(header)
open(str(filename) + '.csv', 'a+').writelines(csvfile[i:i+batch_size])
filename += 1
This will output the same file names as 1.csv, 2.csv, ... etc.
The following is a very simple solution, that does not loop over all rows, but only on the chunks - imagine if you have millions of rows.
chunk_size = 100_000
for i in range(len(df) // chunk_size + 1):
df[i*chunk_size:(i+1)*chunk_size].to_csv(f"output_{i:02d}.csv",
sep=";", index=False)
You define the chunk size and if the total number of rows is not an integer multiple of the chunk size, the last chunk will contain the rest.
Using f"output_{i:02d}.csv" the suffix will be formatted with two digits and a leading zero
If you want to have a header only for the first chunk (and no header for the other chunks), then you can use a boolean over the suffix index at i == 0, that is:
for i in range(len(df) // chunk_size + 1):
df[i*chunk_size:(i+1)*chunk_size].to_csv(f"output_{i:02d}.csv",
sep=";", index=False, header=(i == 0))
Related
I have a following text-file products.txt:
Product;Amount;Price
Apple;3;10.00
Banana;1;5.00
Lemon;2;3.00
Orange;4;20.00
Apple;4;8.00
I want read this file and make a new text-file newfile.txt, which contains value of each row (Amount X Price):
30.00
5.00
6.00
80.00
32.00
Finally, I want to find the total sum of newfile.txt (which is 30+5+6+80+32 = 153)
Note, the price of same product can vary and we are not interested total sum of each product.
I started with creating class.
class DATA:
product= ""
amount= 0
price= 0
def read (name):
list = []
file= open(name, 'r', encoding="UTF-8")
file.readline()
while (True):
row= file.readline()
if(rivi == ''):
break
columns= row[:-1].split(';')
info= DATA()
info.amount= int(columns[1])
info.price= int(columns[2])
info.total = info.amount * info.price
file.append(info)
tiedosto.close()
return list
This should work:
def read(name):
total = 0
ori = open(name, 'r', encoding="UTF-8")
row = ori.readline()
dest = open("newfile.txt", 'w', encoding="UTF-8")
row = ori.readline()
while (row != ""):
row = row[:-1].split(';')
res = int(row[1]) * float(row[2])
total += res
dest.write(str(res) + "\n")
row = ori.readline()
ori.close()
dest.close()
print(total)
read("products.txt")
A possibility would be to use csv from the standard library.
import csv
# fix files' paths
path1 = # file to read
path2 = # file to write
# read data and perform computations
rows_tot = []
with open(path1, 'r', newline='', encoding="utf-8") as fd:
reader = csv.DictReader(fd, delimiter=";")
for row in reader:
rows_tot.append(float(row['Amount']) * float(row['Price']))
# total sum
print("Total sum:", int(sum(rows_tot)))
# save to file the new data
with open(path2, 'w', newline='') as fd:
fieldnames = ("AmountXPrice",)
writer = csv.DictWriter(fd, fieldnames=fieldnames)
writer.writeheader()
for value in rows_tot:
writer.writerow({fieldnames[0]: f"{value:.2f}"})
Remark: it is not clear from the question the type of the various data, in case just change int with float or the other way around.
import csv
divisor = 1000000
outfileno = 1
outfile = None
with open('o77069882.txt', 'r') as infile:
infile_iter = csv.reader(infile)
header = next(infile_iter)
for index, row in enumerate(infile_iter):
if index % divisor == 0:
if outfile is not None:
outfile.close()
outfilename = 'big-{}.csv'.format(outfileno)
outfile = open(outfilename, 'w')
outfileno += 1
writer = csv.writer(outfile)
writer.writerow(header)
writer.writerow(row)
# Don't forget to close the last file
if outfile is not None:
outfile.close()
I am using this above code to divide my file into chunks of size 100000, it does the job but each row in the first file is getting enclosed by quotes( " ") as for example "abc, dfg, ghj, kil" . The second and third file created are not having this problem, can anyone help me modify my code to get rid of the above error.
I am attempting to read data from a CSV file and load it into a DynamoDB table. The issue is that description is written in sentences and have commas. How do I read the columns with a comma delimiter, but ignore the commas within the cells?
Currently, I am using this code to read the CSV file and write to the DB:
def import_csv_to_dynamodb(table_name, csv_file_name, col_names, column_types):
'''
Import a CSV file to a DynamoDB table
'''
dynamodb_conn = boto.connect_dynamodb(aws_access_key_id=MY_ACCESS_KEY_ID,
aws_secret_access_key=MY_SECRET_ACCESS_KEY)
dynamodb_table = dynamodb_conn.get_table(table_name)
BATCH_COUNT = 2 # 25 is the maximum batch size for Amazon DynamoDB
items = []
count = 0
csv_file = open(csv_file_name, 'r', encoding="utf-8-sig")
for cur_line in csv_file:
count += 1
cur_line = cur_line.strip().split(',')
row = {}
for col_number, col_name in enumerate(col_names):
row[col_name] = column_types[col_number](cur_line[col_number])
item = dynamodb_table.new_item(
attrs=row
)
items.append(item)
if count % BATCH_COUNT == 0:
print
'batch write start ... ',
do_batch_write(items, table_name, dynamodb_table, dynamodb_conn)
items = []
print
'batch done! (row number: ' + str(count) + ')'
# flush remaining items, if any
if len(items) > 0:
do_batch_write(items, table_name, dynamodb_table, dynamodb_conn)
csv_file.close()
The Python built-in csv library is very good. The documentation really needs no extra explanation:
https://docs.python.org/3/library/csv.html
I have a folder with a set of text documents. I want to split each document to two or three documents, each one should be 45-70kb.
How сan I do it? I tried:
def split_file(filename, pattern, size):
with open(filename, 'rb') as f:
for index, line in enumerate(f, start=1):
with open(pattern.format(index), 'wb') as out:
n=0
for line in chain([line], f):
out.write(line)
n += len(line)
if n >= 450000 and n <=700000:
break
if __name__ == '__main__':
split_file('folderadress', 'part_{0:03d}.txt', 20000)
but it seems to me it's completely wrong.
This uses a different approach to yours. I have set the maximum size for each file to be 1000 bytes for testing purposes:
import glob
import os
dname = './gash' # directory name
unit_size = 1000 # maximum file size
for fname in glob.iglob("%s/*" % dname):
with open(fname, 'rb') as fo:
data = True
n = 1
while data:
# read returns "" (False) on EOF
data = fo.read(unit_size)
if data:
sub_fname = fname + str(n)
with open(sub_fname, 'wb') as out:
out.write(data)
n += 1
What this might do is to split a line between files, however you do not state if this could be an issue or not.
I have an application that works. But in the interest of attempting to understand functions and python better. I am trying to split it out into various functions.
I"m stuck on the file_IO function. I'm sure the reason it does not work is because the main part of the application does not understand reader or writer. To better explain. Here is a full copy of the application.
Also I'm curious about using csv.DictReader and csv.DictWriter. Do either provide any advantages/disadvantages to the current code?
I suppose another way of doing this is via classes which honestly I would like to know how to do it that way as well.
#!/usr/bin/python
""" Description This script will take a csv file and parse it looking for specific criteria.
A new file is then created based of the original file name containing only the desired parsed criteria.
"""
import csv
import re
import sys
searched = ['aircheck', 'linkrunner at', 'onetouch at']
def find_group(row):
"""Return the group index of a row
0 if the row contains searched[0]
1 if the row contains searched[1]
etc
-1 if not found
"""
for col in row:
col = col.lower()
for j, s in enumerate(searched):
if s in col:
return j
return -1
#Prompt for File Name
def file_IO():
print "Please Enter a File Name, (Without .csv extension): ",
base_Name = raw_input()
print "You entered: ",base_Name
in_Name = base_Name + ".csv"
out_Name = base_Name + ".parsed.csv"
print "Input File: ", in_Name
print "OutPut Files: ", out_Name
#Opens Input file for read and output file to write.
in_File = open(in_Name, "rU")
reader = csv.reader(in_File)
out_File = open(out_Name, "wb")
writer = csv.writer(out_File, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
return (reader, writer)
file_IO()
# Read header
header = reader.next()
stored = []
writer.writerow([header[0], header[3]])
for i, row in enumerate(reader):
g = find_group(row)
if g >= 0:
stored.append((g, i, row))
stored.sort()
for g, i, row in stored:
writer.writerow([row[0], row[3]])
# Closing Input and Output files.
in_File.close()
out_File.close()
If I were you, I'd only separate find_group.
import csv
def find_group(row):
GROUPS = ['aircheck', 'linkrunner at', 'onetouch at']
for idx, group in enumerate(GROUPS):
if group in map(str.lower, row):
return idx
return -1
def get_filenames():
# this might be the only other thing you'd want to factor
# into a function, and frankly I don't really like getting
# user input this way anyway....
basename = raw_input("Enter a base filename (no extension): ")
infilename = basename + ".csv"
outfilename = basename + ".parsed.csv"
return infilename, outfilename
# notice that I don't open the files yet -- let main handle that
infilename, outfilename = get_filenames()
with open(infilename, 'rU') as inf, open(outfilename, 'wb') as outf:
reader = csv.reader(inf)
writer = csv.writer(outf, delimiter=',',
quotechar='"', quoting=csv.QUOTE_ALL)
header = next(reader)
writer.writerow([[header[0], header[3]])
stored = sorted([(find_group(row),idx,row) for idx,row in
enumerate(reader)) if find_group(row) >= 0])
for _, _, row in stored:
writer.writerow([row[0], row[3]])