Python waiting for operation to complete before continuing

I'm writing data to a CSV file, then once that's done, I copy the file to another directory.
This is all in a loop, so when the second iteration starts it reads the data from the file that was copied.
The problem is that the file is still being copied while the second iteration starts and this causes obvious issues.
How would I wait for the whole function in the loop to complete before the second iteration starts? It should work for any number of iterations.
for rule in substring_rules:
    substring(rule)
the function:
def substring(rule, remove_rows=[]):
    writer = csv.writer(open("%s%s" % (DZINE_DIR, f), "wb"))
    from_column = rule.from_column
    to_column = rule.to_column

    reader = csv.reader(open("%s%s" % (OUTPUT_DIR, f)))
    headers = reader.next()
    index = 0
    from_column_index = None
    for head in headers:
        if head == from_column:
            from_column_index = index
        index += 1
    if to_column not in headers:
        headers.append(to_column)
    writer.writerow(headers)

    row_index = 0
    for row in reader:
        if rule.get_rule_type_display() == "substring":
            try:
                string = rule.string.split(",")
                new_value = string[0] + row[from_column_index] + string[1]
                if from_column == to_column:
                    row[from_column_index] = new_value
                else:
                    row.append(new_value)
            except Exception, e:
                print e
        if row_index not in remove_rows:
            writer.writerow(row)
        row_index += 1

    shutil.copyfile("%s%s" % (DZINE_DIR, f), "%s%s" % (OUTPUT_DIR, f))

The problem is that you are not flushing the file behind writer to disk before copying it.
(That only happens automatically when the file object is garbage-collected.)
instead of
writer = csv.writer(open("%s%s" % (DZINE_DIR, f), "wb"))
...
shutil.copyfile("%s%s" % (DZINE_DIR,f), "%s%s" % (OUTPUT_DIR, f))
you should write
wf = open("%s%s" % (DZINE_DIR, f), "wb")
writer = csv.writer(wf)
...
wf.close()
shutil.copyfile("%s%s" % (DZINE_DIR,f), "%s%s" % (OUTPUT_DIR, f))
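Equivalently, and arguably safer, you can let a with block do the closing for you. This is only a sketch built on the same DZINE_DIR, OUTPUT_DIR and f names from the question:

with open("%s%s" % (DZINE_DIR, f), "wb") as wf:
    writer = csv.writer(wf)
    ...
    # every writer.writerow(...) call happens inside this block
# the file is closed (and therefore flushed) here, before the copy starts
shutil.copyfile("%s%s" % (DZINE_DIR, f), "%s%s" % (OUTPUT_DIR, f))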

Related

How to split a large file into chunks using python

import csv

divisor = 1000000
outfileno = 1
outfile = None

with open('o77069882.txt', 'r') as infile:
    infile_iter = csv.reader(infile)
    header = next(infile_iter)
    for index, row in enumerate(infile_iter):
        if index % divisor == 0:
            if outfile is not None:
                outfile.close()
            outfilename = 'big-{}.csv'.format(outfileno)
            outfile = open(outfilename, 'w')
            outfileno += 1
            writer = csv.writer(outfile)
            writer.writerow(header)
        writer.writerow(row)

# Don't forget to close the last file
if outfile is not None:
    outfile.close()
I am using the above code to divide my file into chunks of size 100000. It does the job, but each row in the first file is enclosed in quotes (" "), for example "abc, dfg, ghj, kil". The second and third files created do not have this problem. Can anyone help me modify my code to get rid of this?

Split a large csv file through python

I have a csv file with 5 million rows.
I want to split the file into a number of rows specified by the user.
I have developed the following code, but it is taking too much time to execute. Can anyone help me optimize it?
import csv

print "Please delete the previous created files. If any."
filepath = raw_input("Enter the File path: ")
line_count = 0
filenum = 1
try:
    in_file = raw_input("Enter Input File name: ")
    if in_file[-4:] == ".csv":
        split_size = int(raw_input("Enter size: "))
        print "Split Size ---", split_size
        print in_file, " will split into", split_size, "rows per file named as OutPut-file_*.csv (* = 1,2,3 and so on)"
        with open(in_file, 'r') as file1:
            row_count = 0
            reader = csv.reader(file1)
            for line in file1:
                #print line
                with open(filepath + "\\OutPut-file_" + str(filenum) + ".csv", "a") as out_file:
                    if row_count < split_size:
                        out_file.write(line)
                        row_count = row_count + 1
                    else:
                        filenum = filenum + 1
                        row_count = 0
                line_count = line_count + 1
        print "Total Files Written --", filenum
    else:
        print "Please enter the Name of the file correctly."
except IOError as e:
    print "Oops..! Please Enter correct file path values", e
except ValueError:
    print "Oops..! Please Enter correct values"
I have also tried it without "with open".
Oops! You are re-opening the output file on every row, and that is an expensive operation... Your code could become:
...
with open(in_file, 'r') as file1:
    row_count = 0
    #reader = csv.reader(file1)  # unused here
    out_file = open(filepath + "\\OutPut-file_" + str(filenum) + ".csv", "a")
    for line in file1:
        #print line
        if row_count >= split_size:
            out_file.close()
            filenum = filenum + 1
            out_file = open(filepath + "\\OutPut-file_" + str(filenum) + ".csv", "a")
            row_count = 0
        out_file.write(line)
        row_count = row_count + 1
        line_count = line_count + 1
...
Ideally, you should even initialize out_file = None before the try block and ensure a clean close in the except blocks with if out_file is not None: out_file.close()
Remark: this code only splits by line count (as yours did). That means it will give wrong output if the csv file contains newlines inside quoted fields...
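If quoted newlines are a realistic input, a record-based split with the csv module side-steps the problem. This is only a Python 3 sketch, with made-up file names, not a drop-in replacement for the code above:

import csv

# Sketch only: splits by csv *records* rather than physical lines, so
# quoted fields containing newlines stay intact.
def split_csv_records(in_path, split_size, out_template="OutPut-file_{}.csv"):
    with open(in_path, newline="") as src:
        reader = csv.reader(src)
        filenum, row_count, out_file, writer = 1, 0, None, None
        for record in reader:
            if out_file is None or row_count >= split_size:
                if out_file is not None:
                    out_file.close()
                out_file = open(out_template.format(filenum), "w", newline="")
                writer = csv.writer(out_file)
                filenum += 1
                row_count = 0
            writer.writerow(record)
            row_count += 1
        if out_file is not None:
            out_file.close()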
You can definitely use the multiprocessing module of python.
This is the result I have achieved when I have a csv file which had 1,000,000 lines in it.
import time
from multiprocessing import Pool

def saving_csv_normally(start):
    out_file = open('out_normally/' + str(start/batch_size) + '.csv', 'w')
    for i in range(start, start+batch_size):
        out_file.write(arr[i])
    out_file.close()

def saving_csv_multi(start):
    out_file = open('out_multi/' + str(start/batch_size) + '.csv', 'w')
    for i in range(start, start+batch_size):
        out_file.write(arr[i])
    out_file.close()

def saving_csv_multi_async(start):
    out_file = open('out_multi_async/' + str(start/batch_size) + '.csv', 'w')
    for i in range(start, start+batch_size):
        out_file.write(arr[i])
    out_file.close()

with open('files/test.csv') as file:
    arr = file.readlines()

print "length of file : ", len(arr)

batch_size = 100  # split in number of rows

start = time.time()
for i in range(0, len(arr), batch_size):
    saving_csv_normally(i)
print "time taken normally : ", time.time()-start

#multiprocessing
p = Pool()
start = time.time()
p.map(saving_csv_multi, range(0, len(arr), batch_size), chunksize=len(arr)/4)  # chunksize you can define as much as you want
print "time taken for multiprocessing : ", time.time()-start

# it does the same thing asynchronously
start = time.time()
for i in p.imap_unordered(saving_csv_multi_async, range(0, len(arr), batch_size), chunksize=len(arr)/4):
    continue
print "time taken for multiprocessing async : ", time.time()-start
output shows time taken by each :
length of file : 1000000
time taken normally : 0.733881950378
time taken for multiprocessing : 0.508712053299
time taken for multiprocessing async : 0.471592903137
I have defined three separate functions because functions passed to p.map can only take one parameter, and since I am storing the csv files in three different folders, I wrote one function per folder.
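As a sketch of an alternative (not part of the original answer), functools.partial lets a single worker take the output folder as an extra argument while still satisfying Pool.map's one-argument calling convention. It assumes arr and batch_size are defined at module level, as in the code above:

from functools import partial
from multiprocessing import Pool

def save_chunk(start, out_dir):
    # write one batch_size-sized slice of arr under out_dir
    with open('{}/{}.csv'.format(out_dir, start // batch_size), 'w') as out_file:
        out_file.writelines(arr[start:start + batch_size])

p = Pool()
p.map(partial(save_chunk, out_dir='out_multi'),
      range(0, len(arr), batch_size))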

Splitting one csv into multiple files

I have a csv file of about 5000 rows in Python, and I want to split it into five files.
I wrote code for it, but it is not working:
import codecs
import csv

NO_OF_LINES_PER_FILE = 1000

def again(count_file_header, count):
    f3 = open('write_' + count_file_header + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        co = 0
        for row in candidate_info_reader:
            co = co + 1
            count = count + 1
            if count <= count:
                pass
            elif count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)

def read_write():
    f3 = open('write_' + NO_OF_LINES_PER_FILE + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        count = 0
        for row in candidate_info_reader:
            count = count + 1
            if count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)

read_write()
The above code creates many files with empty content.
How do I split one file into five csv files?
In Python
Use readlines() and writelines() to do that, here is an example:
>>> csvfile = open('import_1458922827.csv', 'r').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
...     if i % 1000 == 0:
...         open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+1000])
...         filename += 1
the output file names will be numbered 1.csv, 2.csv, ... etc.
From terminal
FYI, you can do this from the command line using split as follows:
$ split -l 1000 import_1458922827.csv
I suggest not reinventing the wheel; there is an existing solution. Source here
import os

def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(
        output_path,
        output_name_template % current_piece
    )
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(
                output_path,
                output_name_template % current_piece
            )
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)
Use it like:
split(open('/your/pat/input.csv', 'r'));
A python3-friendly solution:
import csv
import os

def split_csv(source_filepath, dest_folder, split_file_prefix,
              records_per_file):
    """
    Split a source csv into multiple csvs of equal numbers of records,
    except the last file.

    Includes the initial header row in each split file.

    Split files follow a zero-index sequential naming convention like so:
        `{split_file_prefix}_0.csv`
    """
    if records_per_file <= 0:
        raise Exception('records_per_file must be > 0')

    with open(source_filepath, 'r') as source:
        reader = csv.reader(source)
        headers = next(reader)

        file_idx = 0
        records_exist = True

        while records_exist:
            i = 0
            target_filename = f'{split_file_prefix}_{file_idx}.csv'
            target_filepath = os.path.join(dest_folder, target_filename)

            with open(target_filepath, 'w') as target:
                writer = csv.writer(target)

                while i < records_per_file:
                    if i == 0:
                        writer.writerow(headers)

                    try:
                        writer.writerow(next(reader))
                        i += 1
                    except StopIteration:
                        records_exist = False
                        break

            if i == 0:
                # we only wrote the header, so delete that file
                os.remove(target_filepath)

            file_idx += 1
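A hypothetical call (the paths are made up) would produce input_part_0.csv, input_part_1.csv, ... inside the destination folder:

split_csv('data/input.csv', 'data/splits', 'input_part', records_per_file=100000)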
I have modified the accepted answer a little bit to make it simpler.
Edited: added the import statements and modified the print statement for printing the exception. @Alex F's code snippet was written for Python 2; for Python 3 you also need to use header_row = rows.__next__() instead of header_row = rows.next(). Thanks for pointing that out.
import os
import csv

def split_csv_into_chunks(file_location, out_dir, file_size=2):
    count = 0
    current_piece = 1

    # file_to_split_name.csv
    file_name = file_location.split("/")[-1].split(".")[0]
    split_file_name_template = file_name + "__%s.csv"
    splited_files_path = []

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    try:
        with open(file_location, "rb") as csv_file:
            rows = csv.reader(csv_file, delimiter=",")
            headers_row = rows.next()
            for row in rows:
                if count % file_size == 0:
                    current_out_path = os.path.join(out_dir,
                                                    split_file_name_template % str(current_piece))
                    current_out_writer = None

                    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=",")
                    current_out_writer.writerow(headers_row)
                    splited_files_path.append(current_out_path)
                    current_piece += 1

                current_out_writer.writerow(row)
                count += 1
        return True, splited_files_path
    except Exception as e:
        print("Exception occurred as {}".format(e))
        return False, splited_files_path
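A hypothetical call (paths made up) returns a success flag plus the list of chunk paths that were written:

ok, chunk_paths = split_csv_into_chunks("data/huge.csv", "data/chunks", file_size=1000)
if ok:
    print("wrote {} chunk files".format(len(chunk_paths)))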
@Ryan, the Python 3 code worked for me. I used newline='' as below to avoid the blank-line issue:
with open(target_filepath, 'w', newline='') as target:
Another pandas solution (each 1000 rows), similar to Aziz Alto's solution:
suffix = 1
for i in range(len(df)):
    if i % 1000 == 0:
        df[i:i+1000].to_csv(f"processed/{filename}_{suffix}.csv", sep='|', index=False, index_label=False)
        suffix += 1
where df is the csv loaded as a pandas.DataFrame, filename is the original file name, the pipe is the separator, and index=False / index_label=False skip the auto-incremented index column.
A simple Python 3 solution with Pandas that doesn't cut off the last batch
def to_csv_batch(src_csv, dst_dir, size=30000, index=False):
    import pandas as pd
    import math

    # Read source csv
    df = pd.read_csv(src_csv)

    # Initial values
    low = 0
    high = size

    # Loop through batches
    for i in range(math.ceil(len(df) / size)):
        fname = dst_dir + '/Batch_' + str(i+1) + '.csv'
        df[low:high].to_csv(fname, index=index)

        # Update selection
        low = high
        if (high + size < len(df)):
            high = high + size
        else:
            high = len(df)
Usage example
to_csv_batch('Batch_All.csv', 'Batches')
if count <= count:
    pass
This condition is always true, so you pass every time.
Otherwise you can look at this post: Splitting a CSV file into equal parts?
I suggest you leverage the possibilities offered by pandas. Here are functions you could use to do that:
import math
import logging
import pandas as pd

def csv_count_rows(file):
    """
    Counts the number of rows in a file.

    :param file: path to the file.
    :return: number of lines in the designated file.
    """
    with open(file) as f:
        nb_lines = sum(1 for line in f)
    return nb_lines

def split_csv(file, sep=",", output_path=".", nrows=None, chunksize=None, low_memory=True, usecols=None):
    """
    Split a csv into several files.

    :param file: path to the original csv.
    :param sep: View pandas.read_csv doc.
    :param output_path: path in which to output the resulting parts of the splitting.
    :param nrows: Number of rows to split the original csv by, also view pandas.read_csv doc.
    :param chunksize: View pandas.read_csv doc.
    :param low_memory: View pandas.read_csv doc.
    :param usecols: View pandas.read_csv doc.
    """
    nb_of_rows = csv_count_rows(file)

    # Parsing file elements: path, name, extension, etc...
    # file_path = "/".join(file.split("/")[0:-1])
    file_name = file.split("/")[-1]
    # file_ext = file_name.split(".")[-1]
    file_name_trunk = file_name.split(".")[0]
    split_files_name_trunk = file_name_trunk + "_part_"

    # Number of chunks to partition the original file into
    nb_of_chunks = math.ceil(nb_of_rows / nrows)
    if nrows:
        log_debug_process_start = f"The file '{file_name}' contains {nb_of_rows} ROWS. " \
                                  f"\nIt will be split into {nb_of_chunks} chunks of a max number of rows : {nrows}." \
                                  f"\nThe resulting files will be output in '{output_path}' as '{split_files_name_trunk}0 to {nb_of_chunks - 1}'"
        logging.debug(log_debug_process_start)

    for i in range(nb_of_chunks):
        # Number of rows to skip is determined by (the number of the chunk being processed) multiplied by (the nrows parameter).
        rows_to_skip = range(1, i * nrows) if i else None
        output_file = f"{output_path}/{split_files_name_trunk}{i}.csv"

        log_debug_chunk_processing = f"Processing chunk {i} of the file '{file_name}'"
        logging.debug(log_debug_chunk_processing)

        # Fetching the original csv file and handling it with skiprows and nrows to process its data
        df_chunk = pd.read_csv(filepath_or_buffer=file, sep=sep, nrows=nrows, skiprows=rows_to_skip,
                               chunksize=chunksize, low_memory=low_memory, usecols=usecols)
        df_chunk.to_csv(path_or_buf=output_file, sep=sep)

        log_info_file_output = f"Chunk {i} of file '{file_name}' created in '{output_file}'"
        logging.info(log_info_file_output)
And then in your main or jupyter notebook you put :
# This is how you initiate logging in the most basic way.
logging.basicConfig(level=logging.DEBUG)
file = {#Path to your file}
split_csv(file,sep=";" ,output_path={#Path where you'd like to output it},nrows = 4000000, low_memory = False)
P.S.1: I put nrows = 4000000 because it's a personal preference. You can change that number if you wish.
P.S.2: I used the logging library to display messages. When you apply such a function to big files that sit on a remote server, you really want to avoid 'simple printing' and incorporate logging capabilities. You can replace logging.info or logging.debug with print.
P.S.3 : Of course, you need to replace the {# Blablabla} parts of the code with your own parameters.
A simpler script works for me.
import pandas as pd

path = "path to file"  # path to file
df = pd.read_csv(path)  # reading file

low = 0      # Initial lower limit
high = 1000  # Initial higher limit

while(high < len(df)):
    df_new = df[low:high]  # subsetting DataFrame based on index
    low = high             # changing lower limit
    high = high + 1000     # raising upper limit in increments of 1000
    df_new.to_csv("Path to output file")  # output file
import pandas as pd

df = pd.read_csv('input.csv')
file_len = len(df)
filename = 'output'
n = 1

for i in range(file_len):
    if i % 10 == 0:
        sf = (df[i:i+10])
        sf.to_csv(f'{filename}_{n}.csv', index=False)
        n += 1
Building upon the top voted answer, here is a python solution that also includes the headers in each file.
file = open('file.csv', 'r')
header = file.readline()
csvfile = file.readlines()
filename = 1
batch_size = 1000

for i in range(len(csvfile)):
    if i % batch_size == 0:
        open(str(filename) + '.csv', 'w+').writelines(header)
        open(str(filename) + '.csv', 'a+').writelines(csvfile[i:i+batch_size])
        filename += 1
This will output the same file names as 1.csv, 2.csv, ... etc.
The following is a very simple solution, that does not loop over all rows, but only on the chunks - imagine if you have millions of rows.
chunk_size = 100_000
for i in range(len(df) // chunk_size + 1):
    df[i*chunk_size:(i+1)*chunk_size].to_csv(f"output_{i:02d}.csv",
                                             sep=";", index=False)
You define the chunk size and if the total number of rows is not an integer multiple of the chunk size, the last chunk will contain the rest.
Using f"output_{i:02d}.csv" the suffix will be formatted with two digits and a leading zero
If you want to have a header only for the first chunk (and no header for the other chunks), then you can use a boolean over the suffix index at i == 0, that is:
for i in range(len(df) // chunk_size + 1):
    df[i*chunk_size:(i+1)*chunk_size].to_csv(f"output_{i:02d}.csv",
                                             sep=";", index=False, header=(i == 0))

How to split code into smaller functions

I have an application that works, but in the interest of understanding functions and Python better, I am trying to split it out into various functions.
I'm stuck on the file_IO function. I'm sure the reason it does not work is that the main part of the application does not understand reader or writer. To better explain, here is a full copy of the application.
I'm also curious about using csv.DictReader and csv.DictWriter. Do either provide any advantages/disadvantages over the current code?
I suppose another way of doing this is via classes, which honestly I would like to know how to do as well.
#!/usr/bin/python
""" Description: This script will take a csv file and parse it looking for specific criteria.
A new file is then created based off the original file name containing only the desired parsed criteria.
"""

import csv
import re
import sys

searched = ['aircheck', 'linkrunner at', 'onetouch at']

def find_group(row):
    """Return the group index of a row

    0 if the row contains searched[0]
    1 if the row contains searched[1]
    etc
    -1 if not found
    """
    for col in row:
        col = col.lower()
        for j, s in enumerate(searched):
            if s in col:
                return j
    return -1

# Prompt for File Name
def file_IO():
    print "Please Enter a File Name, (Without .csv extension): ",
    base_Name = raw_input()
    print "You entered: ", base_Name
    in_Name = base_Name + ".csv"
    out_Name = base_Name + ".parsed.csv"
    print "Input File: ", in_Name
    print "OutPut Files: ", out_Name

    # Opens Input file for read and output file to write.
    in_File = open(in_Name, "rU")
    reader = csv.reader(in_File)
    out_File = open(out_Name, "wb")
    writer = csv.writer(out_File, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    return (reader, writer)

file_IO()

# Read header
header = reader.next()

stored = []
writer.writerow([header[0], header[3]])

for i, row in enumerate(reader):
    g = find_group(row)
    if g >= 0:
        stored.append((g, i, row))

stored.sort()
for g, i, row in stored:
    writer.writerow([row[0], row[3]])

# Closing Input and Output files.
in_File.close()
out_File.close()
If I were you, I'd only separate find_group.
import csv

def find_group(row):
    GROUPS = ['aircheck', 'linkrunner at', 'onetouch at']
    for idx, group in enumerate(GROUPS):
        if group in map(str.lower, row):
            return idx
    return -1

def get_filenames():
    # this might be the only other thing you'd want to factor
    # into a function, and frankly I don't really like getting
    # user input this way anyway....
    basename = raw_input("Enter a base filename (no extension): ")
    infilename = basename + ".csv"
    outfilename = basename + ".parsed.csv"
    return infilename, outfilename

# notice that I don't open the files yet -- let main handle that

infilename, outfilename = get_filenames()

with open(infilename, 'rU') as inf, open(outfilename, 'wb') as outf:
    reader = csv.reader(inf)
    writer = csv.writer(outf, delimiter=',',
                        quotechar='"', quoting=csv.QUOTE_ALL)

    header = next(reader)
    writer.writerow([header[0], header[3]])

    stored = sorted([(find_group(row), idx, row) for idx, row in
                     enumerate(reader) if find_group(row) >= 0])

    for _, _, row in stored:
        writer.writerow([row[0], row[3]])
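On the DictReader / DictWriter question from the original post: they let you select columns by name instead of by index, at the cost of having to know those names up front. This is a rough sketch only, with made-up column names 'Device' and 'Location' standing in for whatever header[0] and header[3] actually are, and it skips the group-sorting step for brevity:

import csv

with open(infilename, 'rU') as inf, open(outfilename, 'wb') as outf:
    reader = csv.DictReader(inf)
    # extrasaction='ignore' silently drops the columns we are not keeping
    writer = csv.DictWriter(outf, fieldnames=['Device', 'Location'],
                            quoting=csv.QUOTE_ALL, extrasaction='ignore')
    writer.writeheader()
    for row in reader:
        if find_group(row.values()) >= 0:
            writer.writerow(row)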

Unable to write list back to CSV

I am trying to write code that takes in a csv, runs a ping on the value in the first column, and then writes the status to the second column. Everything runs fine until it tries to write out to the csv, at which point I get this error:
line 35, in writer.writerows(columns)
TypeError: 'str' does not support the buffer interface
import os
import csv
from collections import defaultdict

i = 0

#read file
columns = defaultdict(list)
with open('hosts.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        for (k, v) in row.items():
            columns[k].append(v)
    f.close()

print('[DEBUG]', columns['host'])
print('[DEBUG] 1st host is', (columns['host'])[0])
print('[DEBUG]', columns['status'])

#ping hosts
hostname = (columns['host'])[i]
response = os.system("ping -n 1 " + hostname)
print("[DEBUG]", response)
if response == 0:
    print(hostname, 'is up')
    (columns['status'])[i] = 'Up'
    i = i + 1
else:
    print(hostname, 'is down')
    (columns['status'])[i] = 'Down'
    i = i + 1

#write results
with open("hosts.csv", "wb") as f:
    writer = csv.writer(f)
    print("[DEBUG] just before write rows")
    writer.writerows(columns)
    print("[DEBUG] after write rows")
    f.close()
The csv contains the following
host,status,name
8.8.8.8,down,google.com
and should return
host,status,name
8.8.8.8,Up,google.com
I am using Python 3.4
You are reading the CSV in one format and writing it in another: columns is a defaultdict mapping each column name to a list of values, not a list of rows.
Here's a better way to solve this problem while maintaining the original file structure:
import os
import csv

with open('hosts.csv') as f:
    reader = csv.DictReader(f)
    rows = list(reader)

hosts = [row['host'] for row in rows]
statuses = [row['status'] for row in rows]

print('[DEBUG]', hosts)
print('[DEBUG] 1st host is', hosts[0])
print('[DEBUG]', statuses)

for row in rows:
    #ping hosts
    hostname = row['host']
    response = os.system("ping -n 1 " + hostname)
    print("[DEBUG]", response)
    if response == 0:
        print(hostname, 'is up')
        row['status'] = 'Up'
    else:
        print(hostname, 'is down')
        row['status'] = 'Down'

#write results
# text mode with newline='' -- opening the output in binary ("wb") is what
# triggers the buffer-interface error on Python 3
with open("hosts.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, reader.fieldnames)
    # to maintain the same structure from the original file, rewrite header in original position
    writer.writeheader()
    print("[DEBUG] just before write rows")
    writer.writerows(rows)
    print("[DEBUG] after write rows")
Before instantiating csv.DictWriter, you can change the field names that you want in the new file:

newfieldnames = csvreader.fieldnames
lastfield = newfieldnames.pop()  # remove last field
if 'field_name' in newfieldnames:
    newfieldnames.remove('field_name')  # remove by field name

writer = csv.DictWriter(f, newfieldnames)
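One caveat: if the row dictionaries still contain the removed keys, DictWriter.writerow raises a ValueError by default; pass extrasaction='ignore' if you want the extra keys dropped silently:

writer = csv.DictWriter(f, newfieldnames, extrasaction='ignore')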
