Read csv file with empty lines

Read csv file with empty lines - python

Analysis software I'm using outputs many groups of results in 1 csv file and separates the groups with 2 empty lines.
I would like to break the results in groups so that I can then analyse them separately.
I'm sure there is a built-in function in Python (or one of its libraries) that does this. I tried this piece of code that I found somewhere, but it doesn't seem to work.
import csv
# Read the entire file into memory and split the text wherever two newline
# characters are adjacent, i.e. at every single empty line.
# NOTE(review): the file object returned by open() is never closed explicitly.
results = open('03_12_velocity_y.csv').read().split("\n\n")
# Feed first csv.reader
# NOTE(review): results[0] is a single str rather than a list of lines;
# csv.reader iterates whatever it is handed, so it presumably walks this text
# one character at a time -- confirm against the csv documentation.
first_csv = csv.reader(results[0], delimiter=',')
# Feed second csv.reader
second_csv = csv.reader(results[1], delimiter=',')
Update:
The original code actually works, but my python skills are pretty limited and I did not implement it properly.
The .split('\n\n\n') call does work, but csv.reader returns an iterator object: to get the data into a list (or something similar), you have to iterate over all of its rows and append each one to the list.
I then used Pandas to remove the header and convert the values written in scientific notation to float. The code is below. Thanks everyone for the help.
import csv
import pandas as pd
# Open the csv file, read it and split it when it encounters 2 empty lines (\n\n\n).
# The context manager closes the file handle as soon as the text has been read.
with open('03_12_velocity_y.csv') as csv_file:
    results = csv_file.read().split('\n\n\n')

# Define the output - a list of rows seeded with two empty placeholder rows.
# The placeholders are kept on purpose: they occupy positions 0 and 1 of the
# DataFrame built below, so they count towards the 7 rows that are sliced off.
output1 = [[], []]

# Create a csv.reader for the first group. csv.reader expects an iterable of
# lines, which is why the group text is passed through splitlines().
csv_reader1 = csv.reader(results[0].splitlines(), delimiter=',')
# Iterate through the rows of the group and append them to the output list
for row in csv_reader1:
    output1.append(row)

df = pd.DataFrame(output1)
# remove first 7 rows of data (the start position of the slice is always included)
df = df.iloc[7:]
# Convert all data from string to float
df = df.astype(float)

If your row counts are inconsistent across groups, you'll need a little state machine to check when you're between groups and do something with the last group.
#!/usr/bin/env python3
import csv
def write_group(group, i):
    """Write one group of rows to ``group_<i>.csv`` in the current directory.

    Args:
        group: Iterable of rows, each row a sequence of field values.
        i: Number inserted into the output file name.
    """
    # newline="" hands line-ending control to the csv writer.
    with open(f"group_{i}.csv", "w", newline="") as out_f:
        csv.writer(out_f).writerows(group)
# Split input.csv into numbered files. A group ends when a second consecutive
# empty row is read after at least one data row has been collected; a single
# empty row inside a group is ignored.
with open("input.csv", newline="") as f:
    reader = csv.reader(f)
    group_i = 1
    group = []
    last_row = []
    for row in reader:
        if not row:
            # Two empty rows in a row and something collected: group is done.
            if not last_row and group:
                write_group(group, group_i)
                group = []
                group_i += 1
            last_row = row
            continue
        group.append(row)
        last_row = row
    # flush remaining group (the file need not end with two empty rows)
    if group:
        write_group(group, group_i)
I mocked up this sample CSV:
g1r1c1,g1r1c2,g1r1c3
g1r2c1,g1r2c2,g1r2c3
g1r3c1,g1r3c2,g1r3c3


g2r1c1,g2r1c2,g2r1c3
g2r2c1,g2r2c2,g2r2c3


g3r1c1,g3r1c2,g3r1c3
g3r2c1,g3r2c2,g3r2c3
g3r3c1,g3r3c2,g3r3c3
g3r4c1,g3r4c2,g3r4c3
g3r5c1,g3r5c2,g3r5c3
And when I run the program above I get three CSV files:
group_1.csv
g1r1c1,g1r1c2,g1r1c3
g1r2c1,g1r2c2,g1r2c3
g1r3c1,g1r3c2,g1r3c3
group_2.csv
g2r1c1,g2r1c2,g2r1c3
g2r2c1,g2r2c2,g2r2c3
group_3.csv
g3r1c1,g3r1c2,g3r1c3
g3r2c1,g3r2c2,g3r2c3
g3r3c1,g3r3c2,g3r3c3
g3r4c1,g3r4c2,g3r4c3
g3r5c1,g3r5c2,g3r5c3

If your row counts are consistent, you can do this with fairly vanilla Python or using the Pandas library.
Vanilla Python
Define your group size and the size of the break (in "rows") between groups.
Loop over all the rows adding each row to a group accumulator.
When the group accumulator reaches the pre-defined group size, do something with it, reset the accumulator, and then skip break-size rows.
Here, I'm writing each group to its own numbered file:
import csv
group_sz = 5  # number of rows in every group
break_sz = 2  # number of separator rows between two groups


def write_group(group, i):
    """Write one group of rows to ``group_<i>.csv`` in the current directory.

    Args:
        group: Iterable of rows, each row a sequence of field values.
        i: Number inserted into the output file name.
    """
    with open(f"group_{i}.csv", "w", newline="") as f_out:
        csv.writer(f_out).writerows(group)


with open("input.csv", newline="") as f_in:
    reader = csv.reader(f_in)
    group_i = 1
    group = []
    for row in reader:
        group.append(row)
        if len(group) == group_sz:
            write_group(group, group_i)
            group_i += 1
            group = []
            # Skip the separator rows. The default argument makes next()
            # return quietly when the file ends right after the last group.
            for _ in range(break_sz):
                next(reader, None)
    # A final group shorter than group_sz would otherwise be dropped without
    # notice; leftover rows that are all empty are not worth a file.
    if any(group):
        write_group(group, group_i)
group_1.csv
g1r1c1,g1r1c2,g1r1c3
g1r2c1,g1r2c2,g1r2c3
g1r3c1,g1r3c2,g1r3c3
g1r4c1,g1r4c2,g1r4c3
g1r5c1,g1r5c2,g1r5c3
With Pandas
I'm new to Pandas, and learning this as I go, but it looks like Pandas will automatically trim blank rows/records from a chunk of data^1.
With that in mind, all you need to do is specify the size of your group, and tell Pandas to read your CSV file in "iterator mode", where you can ask for a chunk (your group size) of records at a time:
import pandas as pd
group_sz = 5  # number of records requested per chunk

# iterator=True makes read_csv return a reader object instead of a DataFrame;
# used as a context manager, it is closed when the block exits.
with pd.read_csv("input.csv", header=None, iterator=True) as reader:
    i = 1
    while True:
        try:
            df = reader.get_chunk(group_sz)
        except StopIteration:
            # no records left
            break
        df.to_csv(f"group_{i}.csv")
        i += 1
Pandas adds an index ("ID") column and a default header row when it writes out the CSV:
group_1.csv
,0,1,2
0,g1r1c1,g1r1c2,g1r1c3
1,g1r2c1,g1r2c2,g1r2c3
2,g1r3c1,g1r3c2,g1r3c3
3,g1r4c1,g1r4c2,g1r4c3
4,g1r5c1,g1r5c2,g1r5c3

TRY this out with your output:
import pandas as pd
# csv file name to be read in
in_csv = 'input.csv'

# get the number of lines of the csv file to be read;
# the context manager closes the file once the count is done
with open(in_csv) as line_source:
    number_lines = sum(1 for _ in line_source)

# size of rows of data to write to the csv,
# you can change the row size according to your need
rowsize = 500

# start looping through data writing it to a new file for each set
# NOTE(review): the range starts at 1, so the first line of the file is always
# skipped -- presumably it is a header line; confirm for your input.
for i in range(1, number_lines, rowsize):
    df = pd.read_csv(
        in_csv,
        header=None,
        nrows=rowsize,  # number of rows to read at each loop
        skiprows=i,  # skip rows that have been read
    )
    # csv to write data to a new file named after the starting line number:
    # input1.csv, input501.csv, ...
    out_csv = 'input' + str(i) + '.csv'
    df.to_csv(
        out_csv,
        index=False,
        header=False,
        mode='a',  # append data to csv file
    )

I updated the question with the last details that answer my question.

Related

Read CSV file with limit and offset

I am using the following code for reading CSV file to a dictionary.
file_name = path+'/'+file.filename
# Load every row of the file into a list of dicts first ...
with open(file_name, newline='') as csv_file:
    csv_dict = [dict(row) for row in csv.DictReader(csv_file)]
# ... then call the API once per stored row.
for item in csv_dict:
    call_api(item)
This reads the file and calls the function for each row. As the number of rows increases, the number of calls increases as well. It is also not possible to load all the contents into memory and then split them up and call the API from there, because the data is too big. So I would like to read the file using a limit and an offset, as in SQL queries. How can this be done in Python? I don't see any option in the csv documentation to specify the number of rows to read or to skip. If someone can suggest a better approach, that would be fine too.

You can call your api directly just with 1 line in memory:
with open(file_name, newline='') as csv_file:
    for row in csv.DictReader(csv_file):
        call_api(row)  # call api with row-dictionary, don't persist all to memory
You can skip lines by calling next() before the for loop:
with open(file_name, newline='') as csv_file:
    reader = csv.DictReader(csv_file)
    # Skip on the reader, not on the raw file: skipping raw lines would also
    # consume the header line, and DictReader would then take a data line as
    # its field names. The default argument keeps a short file from raising.
    for _ in range(10):  # skip first 10 rows
        next(reader, None)
    for row in reader:
        call_api(row)
You can skip lines in between using continue:
with open(file_name, newline='') as csv_file:
    for i, row in enumerate(csv.DictReader(csv_file)):
        if i % 2 == 0:
            continue  # skip every other row
        call_api(row)
You can simply count parsed lines and break after n lines are done:
n = 0  # number of rows handled so far
with open(file_name, newline='') as csv_file:
    for row in csv.DictReader(csv_file):
        if n == 50:
            # limit reached: stop reading the file
            break
        n += 1
        call_api(row)
You can combine those approaches to skip 100 rows and then take 200, keeping only every 2nd one - this mimics limit and offset, with a modulo on the row number as a hack.
Or you can use something that's great with CSV, like pandas:
Reading a part of csv file
Read random lines from huge CSV file in Python
Read a small random sample from a big CSV file into a Python data frame

A solution could be to use pandas to read the csv:
import pandas as pd
file_name = 'data.csv'
OFFSET = 10  # number of leading lines to skip
LIMIT = 24  # passed to read_csv as nrows
CHSIZE = 6  # passed to read_csv as chunksize
header = list('ABC')

# With chunksize set, read_csv returns a reader that yields DataFrames; the
# with-block closes the underlying file even if the loop stops early.
with pd.read_csv(
    file_name,
    sep=',',
    header=None,
    names=header,  # Header 'A', 'B', 'C'
    usecols=[0, 1, 4],  # Select some columns
    skiprows=lambda idx: idx < OFFSET,  # Skip lines
    chunksize=CHSIZE,  # Chunk reading
    nrows=LIMIT,
) as reader:
    for df_chunk in reader:
        # Each df_chunk is a DataFrame, so
        # an adapted api may be needed to
        # call_api(item)
        for row in df_chunk.itertuples():
            print(row._asdict())

Remove columns + keep certain rows in multiple large .csv files using python

Hello I'm really new here as well as in the world of python.
I have some (~1000) .csv files, including ~ 1800000 rows of information each. The files are in the following form:
5302730,131841,-0.29999999999999999,NULL,2013-12-31 22:00:46.773
5303072,188420,28.199999999999999,NULL,2013-12-31 22:27:46.863
5350066,131841,0.29999999999999999,NULL,2014-01-01 00:37:21.023
5385220,-268368577,4.5,NULL,2014-01-01 03:12:14.163
5305752,-268368587,5.1900000000000004,NULL,2014-01-01 03:11:55.207
So, i would like for all of the files:
(1) to remove the 4th (NULL) column
(2) to keep in every file only certain rows (depending on the value of the first column i.e.5302730, keep only the rows that containing that value)
I don't know if this is even possible, so any answer is appreciated!
Thanks in advance.

Have a look at the csv module
You can use the csv.reader function to get an iterator over the lines, with each line's cells returned as a list.
# The context manager closes the file once all lines have been handled.
with open("filename.csv", newline="") as csv_file:
    for line in csv.reader(csv_file):
        # Remove 4th column, remember python starts counting at 0
        line = line[:3] + line[4:]
        # Blank lines come back as empty lists and have no first column.
        if line and line[0] == "thevalueforthefirstcolumn":
            dosomethingwith(line)

If you wish to do this sort of operation with CSV files more than once and want to use different parameters regarding column to skip, column to use as key and what to filter on, you can use something like this:
import csv
def read_csv(filename, column_to_skip=None, key_column=0, key_filter=None):
    """Read a CSV file, optionally dropping one column and filtering rows.

    Args:
        filename: Path of the CSV file to read.
        column_to_skip: Index of a column to delete from every row, or None
            to keep all columns.
        key_column: Index of the column compared against ``key_filter``. It
            is applied after ``column_to_skip`` has been removed.
        key_filter: When not None, only rows whose key column equals this
            string are kept.

    Returns:
        A list of rows, each row a list of strings.
    """
    data_from_csv = []
    # newline='' is what the csv module expects for files it reads.
    with open(filename, newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Skip data in specific column
            if column_to_skip is not None:
                try:
                    del row[column_to_skip]
                except IndexError:
                    # blank or short row: nothing to delete at that position
                    pass
            # Filter out rows where the key doesn't match
            if key_filter is not None:
                try:
                    key = row[key_column]
                except IndexError:
                    # a row without a key column cannot match the filter
                    continue
                if key_filter != key:
                    continue
            data_from_csv.append(row)
    return data_from_csv
def write_csv(filename, data_to_write):
    """Write rows to a CSV file, replacing any existing file.

    Args:
        filename: Path of the file to create.
        data_to_write: Iterable of rows, each row a sequence of field values.
    """
    # Without newline='' the writer's own line endings get translated again
    # on platforms that use \r\n, which leaves an empty line after every row.
    with open(filename, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(data_to_write)
# Read data.csv with the column at index 3 (the fourth column) removed,
# keeping only the rows whose first column equals '5302730'.
data = read_csv('data.csv', column_to_skip=3, key_filter='5302730')
# Write the remaining rows to data2.csv.
write_csv('data2.csv', data)

Remove unwanted columns from CSV file

I have two lists of CSV files that my program is combining into a single file.
The first group of files has 5 columns of data that I do not want to include in the output. How do I remove those 5 columns, whether I do it row-by-row or all at one time, from the data I have read in using csv.reader?
Here's my function (I would like to keep the function def and structure mostly the same):
def get_data(filename,rowlen,delimit=','):
    """Read a delimited file into a list of rows padded to ``rowlen`` fields.

    Args:
        filename: Path of the file to read.
        rowlen: Minimum number of fields per returned row; shorter rows are
            padded with the string '-999'.
        delimit: Field delimiter, a comma by default.

    Returns:
        A list of rows, each row a list of strings.
    """
    data = []
    # csv.reader needs a text-mode file on Python 3; newline='' is the mode
    # the csv module asks for.
    with open(filename, newline='') as f:
        raw = csv.reader(f, dialect='excel', delimiter=delimit)
        for row in raw:
            # Drop one trailing empty field (a line ending in the delimiter).
            # Blank lines yield an empty row, which has no last field.
            if row and row[-1] == '':
                row.pop()
            # Pad short rows; a non-positive count adds nothing.
            row.extend(['-999'] * (rowlen - len(row)))
            data.append(row)
    return data
I tried doing this:
raw = csv.reader(f, dialect='excel', delimiter=delimit)
if raw != None:
for row in raw:
if rowlen == 13: # This is true only for csv files I want to shorten
row = row[0:8]
rowlen = 8
if row[-1] == '':
But the output file remained the same. Also, I tried commenting out rowlen = 8, but this just filled the columns I don't want with -999.

You need to either replace the row in raw or create a new list that will contain your sliced rows. Here is a correction of part of your code that uses enumerate to keep track of the index of the row to be replaced in raw.
# A csv.reader object cannot be indexed or assigned to, so copy its rows into
# a list before replacing entries by position.
raw = list(raw)
# Decide once, before the loop: resetting rowlen inside the loop would stop
# the slicing after the first row.
shorten = rowlen == 13  # This is true only for csv files I want to shorten
for i, row in enumerate(raw):
    if shorten:
        raw[i] = row[0:8]
if shorten:
    rowlen = 8
Another example where you don't alter raw :
new_container = []
# Decide once, before the loop: resetting rowlen inside the loop would stop
# the slicing after the first row.
shorten = rowlen == 13  # This is true only for csv files I want to shorten
for row in raw:
    if shorten:
        new_container.append(row[0:8])  # we just append your slice to the new_container each iteration
if shorten:
    rowlen = 8

You should check out pandas. It makes working with csv much much better..
from pandas import read_csv
def get_data(filename, rowlen, delimit=','):
    """Copy the first ``rowlen`` columns of a delimited file to output.csv.

    Args:
        filename: Path of the header-less input file.
        rowlen: Number of leading columns to keep.
        delimit: Field delimiter of the input file, a comma by default.

    Returns:
        The DataFrame holding the kept columns.
    """
    df = read_csv(filename, header=None, sep=delimit, usecols=range(rowlen))
    # The input is read without a header, so the column labels are only
    # positions; header=False keeps them out of the output file.
    df.to_csv('output.csv', index=False, header=False)
    return df
# Example call: process input.csv, keeping its first 4 columns.
get_data('input.csv',4)

How to perform a simple calculation in a CSV and append the results to the file

I have a csv which contains 38 columns of data. All I want to find out is how to divide column 11 by column 38 and append the result to the end of each row, leaving out the title row of the csv (row 1).
If I am able to get a snippet of code that can do this, I will be able to manipulate the same code to perform lots of similar functions.
My attempt involved editing some code that was designed for something else.
See below:
from collections import defaultdict
class_col = 11  # 1-based number of the column that is divided
data_col = 38  # 1-based number of the column it is divided by

# Read in the data
with open('test.csv', 'r') as f:
    # the title row is kept apart so that it is not used in the calculation
    header = f.readline().strip().split(',')
    data = [line.strip().split(',') for line in f]

# Append the quotient of the two cells to the end of each row. The column
# numbers are 1-based, list indexes 0-based; the result is stored as text
# because the rows are joined as strings below.
for row in data:
    row.append(str(float(row[class_col - 1]) / float(row[data_col - 1])))

# Write the results to a new csv file
with open('testMODIFIED2.csv', 'w') as nf:
    nf.write('\n'.join(','.join(row) for row in [header] + data))
Any help will be greatly appreciated. Thanks SMNALLY

import csv
# On Python 3 the csv module works on text-mode files opened with newline=''.
with open('test.csv', newline='') as old_csv:
    csv_reader = csv.reader(old_csv)
    with open('testMODIFIED2.csv', 'w', newline='') as new_csv:
        csv_writer = csv.writer(new_csv)
        for i, row in enumerate(csv_reader):
            # row 0 is the title row: it is copied without a calculated value
            if i != 0:
                # columns 11 and 38, counted from 1
                row.append(float(row[10]) / float(row[37]))
            csv_writer.writerow(row)

Use pandas:
import pandas
df = pandas.read_csv('test.csv') #assumes header row exists
# NOTE(review): 'CLASS' and 'DATA' are presumably placeholders for the header
# names of columns 11 and 38 -- replace them with the names in your file.
df['FRACTION'] = 1.0*df['CLASS']/df['DATA'] #by default new columns are appended to the end
# index=False keeps the DataFrame's row index from being written as an extra
# leading column.
df.to_csv('out.csv', index=False)

How to perform multiple changes to a csv file without intervening writes

I want to perform multiple edits to most rows in a csv file without making multiple writes to the output csv file.
I have a csv that I need to convert and clean up into specific format for another program to use. For example, I'd like to:
remove all blank rows
remove all rows where the value of column "B" is not a number
with this new data, create a new column and explode the first part of the values in column B into the new column
Here's an example of the data:
"A","B","C","D","E"
"apple","blah","1","","0.00"
"ape","12_fun","53","25","1.00"
"aloe","15_001","51","28",2.00"
I can figure out the logic behind each process, but what I can't figure out is how to perform each process without reading and writing to a file each time. I'm using the CSV module. Is there a better way to perform these steps at once before writing a final CSV?

I would define a set of tests and a set of processes.
If all tests pass, all processes are applied, and the final result is written to output:
import csv
#
# Row tests
#
def test_notblank(row):
    """Return True when at least one field of *row* is non-empty."""
    return any(len(i) for i in row)


def test_bnumeric(row):
    """Return True when the part of column B before the first ``_`` is numeric.

    Rows that have no column B fail the test.
    """
    return len(row) > 1 and row[1].partition('_')[0].isdigit()


def do_tests(row, tests=(test_notblank, test_bnumeric)):
    """Return True when *row* passes every test, evaluated in order."""
    return all(t(row) for t in tests)


#
# Row processing
#
def process_splitb(row):
    """Split column B at its first ``_``.

    The leading part stays in column B and the remainder is appended as a new
    last column (an empty string when column B contains no ``_``). The row is
    modified in place and returned.
    """
    first, _, rest = row[1].partition('_')
    row[1] = first
    row.append(rest)
    return row


def do_processes(row, processes=(process_splitb,)):
    """Apply every process to *row* in order and return the result."""
    for p in processes:
        row = p(row)
    return row
def main():
    """Copy in.csv to out.csv, keeping and processing only rows that pass the tests.

    The first row is copied unchanged as the header row.
    """
    # On Python 3 the csv module works on text-mode files opened with newline=''.
    with open("in.csv", newline="") as inf, open("out.csv", "w", newline="") as outf:
        incsv = csv.reader(inf)
        outcsv = csv.writer(outf)
        header = next(incsv, None)
        if header is None:
            # empty input file: nothing to copy
            return
        outcsv.writerow(header)  # pass header row
        outcsv.writerows(do_processes(row) for row in incsv if do_tests(row))


if __name__=="__main__":
    main()

Simple for loops.
import csv
# On Python 3 the csv module works on text-mode files opened with newline='';
# the with-blocks close both files.
with open('in.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    header = next(csv_reader)
    header.append('F') #add new column
    records = [header]
    #process records
    for record in csv_reader:
        #skip blank records
        if not record:
            continue
        #make sure column "B" exists and has exactly 2 parts
        try:
            part1, part2 = record[1].split('_')
        except (IndexError, ValueError):
            continue
        #make sure part1 is a digit
        if not part1.isdigit():
            continue
        record[1] = part1 #make column B equal part1
        record.append(part2) #add data for the new column F to record
        records.append(record)

with open('out.csv', 'w', newline='') as new_csv_file:
    csv_writer = csv.writer(new_csv_file, quoting=csv.QUOTE_ALL)
    csv_writer.writerows(records)

Why use the CSV module? A CSV file is made up of text lines (strings), and you can use Python's string methods (split, join, replace, len) to create your result.
line_cols = line.split(',') and back: line = ','.join(line_cols)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Read csv file with empty lines - python

I updated the question with the last details that answer my question.

Related

Read CSV file with limit and offset

Remove columns + keep certain rows in multiple large .csv files using python

Remove unwanted columns from CSV file

How to perform a simple calculation in a CSV and append the results to the file

How to perform multiple changes to a csv file without intervening writes

Categories

Resources