Merge rows on two conditions - Python
I want to merge rows based on a condition: if a row's text is less than 20 characters long, combine it with the previous row. I already got help here merging rows when I had a single column, but my requirements have changed. I now have two columns, and the condition should be checked against the second column: whenever a row's value there is shorter than 20 characters, merge that text into the previous row and remove the row from both columns.
Below is the old code that merged and removed rows when I had only one column. I tried it, but it doesn't give me the result I need. Thank you for any help.
import csv
import pandas as pd

df = pd.read_csv('Test.csv')

with open('Output.csv', mode='w', newline='', encoding='utf-16') as f:
    writer = csv.writer(f, delimiter=' ')
    rows = []
    for i, data in enumerate(df['Sentence']):
        if i + 1 == len(df['Sentence']):
            writer.writerow([data])
        elif len(df['Sentence'][i + 1]) < 20:
            writer.writerow([data + df['Sentence'][i + 1]])
            df.drop(df.index[[i + 1]])  # note: drop() returns a new frame, so this line changes nothing in place
        elif len(df['Sentence'][i + 1]) >= 20:
            writer.writerow([data])
I solved this by making the merged row null and then removing the empty rows from the CSV:
df = pd.read_csv('test.csv', encoding='utf-8')

with open('output.csv', mode='w', newline='', encoding='utf-16') as f:
    writer = csv.writer(f, delimiter=' ')
    rows = []
    for i, data in enumerate(df['Sentence']):
        if i + 1 == len(df['Sentence']):
            writer.writerow([data])
        elif len(df['Sentence'][i + 1]) < 19:
            writer.writerow([data + df['Sentence'][i + 1]])
            df['Sentence'][i + 1] = ''  # blank the merged row; the empty rows are filtered out afterwards
        elif len(df['Sentence'][i + 1]) >= 19:
            writer.writerow([data])
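For the two-column case, a vectorized pandas approach may be easier to reason about than writing row by row. Below is a minimal sketch, not a drop-in for the code above: it assumes the first column is 'Sentence' and calls the second column 'Other', a placeholder name since the question never names it. Each row whose sentence is shorter than 20 characters is merged into the previous row and then dropped from both columns.

import pandas as pd

df = pd.read_csv('Test.csv')  # assumed columns: 'Sentence' and 'Other' (placeholder name)

# A new group starts at every row with at least 20 characters; shorter rows
# fall into the group of the previous long row. (Short rows before the first
# long row form a group of their own.)
is_long = df['Sentence'].str.len() >= 20
group = is_long.cumsum()

merged = df.groupby(group).agg({
    'Sentence': ''.join,  # append the short rows onto the previous sentence
    'Other': 'first',     # keep the second column's value from the surviving row
}).reset_index(drop=True)

merged.to_csv('Output.csv', index=False)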
Related
How to conditionally select a character from two strings, based upon another string
I have 2 files: fileA is composed of 1 row and fileB of 2 rows.

fileA (1 row):

*****s**e**********************************************q*

fileB (2 rows), where row 1 is the subject and row 2 is the query:

AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB

I need to produce an output file where, if the fileA string contains an s or a * at some index, the subject character at the corresponding index position is written to the output file; if there is a q or an e, the query character is written instead.

Output:

AAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABA

my code:

ff = open("filea.txt")
gg = open("fileb.txt")

file_as_list = ff.readline()
file_as_last = gg.readlines()
query = file_as_last[0]
subject = file_as_last[1]

for i in file_as_list:
    z = -1
    while z <= len(file_as_list):
        if i == "*":
            f = open('output.txt', 'a+', encoding='utf-8')
            f.write(subject[z])
            z += 1
        elif i == "s":
            f = open('output.txt', 'a+', encoding='utf-8')
            f.write(subject[z])
            z += 1
        elif i == "e":
            f = open('output.txt', 'a+', encoding='utf-8')
            f.write(query[z])
            z += 1
        elif i == "q":
            f = open('output.txt', 'a+', encoding='utf-8')
            f.write(query[z])
            z += 1
        break

It works more or less, but not properly: the loop only ever takes the first branch, and the output produced is just a copy of the subject.
- with open is used, so all files will be closed automatically
- each string is converted into a list, with .strip() removing \n and \r
- the lists are loaded into a pandas.DataFrame
- pandas.DataFrame.apply with axis=1 performs the row-wise operation
- np.where returns the correct character for each row
- the result is written out to a list, joined into a str, and written to the output.txt file

Code:

import pandas as pd
import numpy as np

with open('fileA.txt', 'r') as filA:
    with open('fileB.txt', 'r') as filB:
        with open('output.txt', 'w', newline='\n') as output:
            fil_a = filA.readline()
            fil_b = filB.readlines()

            sub = [x for x in fil_b[0].strip()]
            que = [x for x in fil_b[1].strip()]
            line = [x for x in fil_a.strip()]

            df = pd.DataFrame({'A': line, 'sub': sub, 'que': que})
            df['out'] = df.apply(lambda x: str(np.where(x[0] in ['*', 's'], x[1], x[2])), axis=1)

            out = df.out.to_list()
            out = ''.join(x for x in out)
            output.write(out)
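Since the task is a position-by-position choice among three equal-length strings, plain Python can also do it with zip and no pandas. A minimal sketch under the same file layout assumed above (fileA holds the pattern, fileB holds subject then query):

with open('fileA.txt') as fa, open('fileB.txt') as fb, \
        open('output.txt', 'w') as out:
    pattern = fa.readline().strip()
    subject, query = (line.strip() for line in fb.readlines()[:2])
    # subject character for '*' or 's', query character for 'e' or 'q'
    out.write(''.join(s if p in '*s' else q
                      for p, s, q in zip(pattern, subject, query)))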
Python CSV writerow to specific column in already opened file
I am struggling with the csv module and the writerow method.

NOTE: This is the code simplified as much as I could. I am asking for understanding and have provided a Minimal, Complete, and Verifiable example as far as I could.

WHAT I'VE GOT:

Three tables in the database:
- MODEL_test - contains the data on which the algorithm will learn
- my_prediction - contains unseen data to which the algorithm will be applied
- OUT_predictions - contains the output of the algorithm's predict method

In the first step, I create a new CSV file and keep it open until the iteration for the current algorithm is finished. Before the training iteration starts, I append rows to the CSV file with the first 7 values from the unseen-data table, so the data won't be multiplied. Then, after each algorithm iteration, I want to append the already opened file with the OUT_predictions values.

CODE:

import csv
import datetime
import sqlite3

def export_to_csv():
    ldb = sqlite3.connect('database.db')
    c = ldb.cursor()
    table_name = 'my_predictions'
    training_size = 3
    now = datetime.datetime.now()
    file_name = str.format('my_predictions {}', now.strftime("%Y-%m-%d %H %M %S"))
    export_columns = ['COLUMN ' + str(n) for n in range(1, 8)] + \
                     ['OUTPUT ' + str(n) for n in range(1, training_size + 1)]

    with open('archived/' + file_name + '.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(export_columns)
        output_writer = csv.DictWriter(csv_file, fieldnames=export_columns)

        for o in range(1, 500):  # <-- write all unseen data from the database to the csv
            c.execute(str.format('SELECT * FROM {} WHERE ID=?', table_name), [o])
            fetch_one = c.fetchone()
            writer.writerow(fetch_one[1:7])

        for t in range(training_size):  # for each iteration, write output to the csv
            # some machine learning training code
            prediction = [0, 0, 1, 1, 0, 1]  # <-- sample output from predictions
            combined_set = list(map(str, prediction))
            ids = 1
            for each in combined_set:
                c.execute(str.format('INSERT INTO OUTPUT_prediction VALUES ({})',
                          ",".join(["?" for _ in range(1, len([ids] + [int(each)]) + 1)])),
                          [ids] + [int(each)])
                ids += 1
            ldb.commit()

            for o in range(1, 500):  # <-- write output from the last prediction iteration to a specific column
                c.execute(str.format('SELECT * FROM {} WHERE ID=?', table_name), [o])
                fetch_output = c.fetchone()
                output_writer.writeheader()
                output_writer.writerow({'OUTPUT ' + str(t + 1): fetch_output[-1]})  # <-- columns remain empty

WHAT IS THE PROBLEM:

When the code finishes and I open the file, I can see that the OUTPUT columns remain empty.

EDIT: I don't want to use pandas and to_csv because they are very slow. Sometimes my unseen data has 1 million lines, and a single iteration takes half an hour using to_csv.
I know what I've done wrong and found a solution for this situation, but I'm not satisfied with it. When I try to add a new column in w mode, new data is always written at the end of the file. When I set csv_file.seek(0), old data is overwritten. I also tried reopening the file in r+ mode with csv_file.seek(0), but got the same outcome.

I will use xlwings for this task, because it gives me more control, but I still don't know how it will affect input speed. My goal is to prepare a summary report with the unseen data, the output for each iteration, and statistical information.

SOLUTION (with r+):

now = datetime.datetime.now()
file_name = str.format('my_predictions {}', now.strftime("%Y-%m-%d %H %M %S"))
export_columns = ['COLUMN ' + str(n) for n in range(1, 8)] + \
                 ['OUTPUT ' + str(n) for n in range(1, training_size + 1)]

with open('archived/' + file_name + '.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(export_columns)

    for o in range(1, 500):
        c.execute(str.format('SELECT * FROM {} WHERE ID=?', table_name), [o])
        fetch_one = c.fetchone()
        writer.writerow(fetch_one[1:7])

for t in range(training_size):
    # some machine learning training code
    prediction = [0, 0, 1, 1, 0, 1]  # <-- sample output from predictions
    combined_set = list(map(str, prediction))

    # ids = 1
    # for each in combined_set:
    #     c.execute(str.format('INSERT INTO OUTPUT_prediction VALUES ({})',
    #               ",".join(["?" for _ in range(1, len([ids] + [int(each)]) + 1)])), [ids] + [int(each)])
    #     ids += 1
    # ldb.commit()

    with open('archived/' + file_name + '.csv', 'r+', newline='') as csv_file:
        writer = csv.writer(csv_file)
        csv_input = csv.reader(csv_file)
        rows = list(csv_input)
        writer.writerow(export_columns)
        for row, o in zip(rows, combined_set):
            row += [o]
            writer.writerow(row)
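If holding everything in memory once is acceptable, another way around the seek problem is to accumulate each iteration's outputs in a list and write the whole file in a single pass at the end, so no column ever has to be patched in afterwards. A minimal sketch with placeholder data and column names, not the database code from above:

import csv

base_rows = [['a', 1], ['b', 2], ['c', 3]]  # the per-record base columns (placeholder data)
outputs = [[0, 1, 0], [1, 1, 0]]            # one list of predictions per training iteration

header = ['COLUMN 1', 'COLUMN 2'] + \
         ['OUTPUT ' + str(i + 1) for i in range(len(outputs))]

with open('my_predictions.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    # zip(*outputs) regroups the per-iteration lists into per-row tuples
    for base, outs in zip(base_rows, zip(*outputs)):
        writer.writerow(base + list(outs))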
File data binding with column names
I have files with hundreds of thousands of rows of data, but they come without any column names. I am trying to go through every file, read it row by row, and store the rows in a list; after that, I want to assign values to columns. But here I am confused about what to do, because there are around 60 values in every row, plus some extra columns with assigned values that should be added to every row.

Code so far:

import re
import glob

filenames = glob.glob("/home/ashfaque/Desktop/filetocsvsample/inputfiles/*.txt")

columns = []
with open("/home/ashfaque/Downloads/coulmn names.txt", encoding="ISO-8859-1") as f:
    file_data = f.read()
    lines = file_data.splitlines()
    for l in lines:
        columns.append(l.rstrip())

total = {}
for name in filenames:
    modified_data = []
    with open(name, encoding="ISO-8859-1") as f:
        file_data = f.read()
        lines = file_data.splitlines()
        for l in lines:
            if len(l) >= 1:
                modified_data.append(re.split(': |,', l))

    rows = []
    i = len(modified_data)
    x = 0
    while i > 60:
        r = lines[x:x + 59]
        x = x + 60
        i = i - 60
        rows.append(r)

    z = len(modified_data)
    while z >= 60:
        z = z - 60

    if z > 1:
        last_columns = modified_data[-z:]
        x = []
        for l in last_columns:
            if len(l) > 1:
                del l[0]
                x.append(l)
            elif len(l) == 1:
                x.append(l)

        for row in rows:
            for vl in x:
                row.append(vl)

    for r in rows:
        for i in range(0, len(r)):
            if len(r) >= 60:
                total.setdefault(columns[i], []).append(r[i])

In another script I have separated both the rows with 60 values and the last 5 to 15 columns that should be added to each row, but again I am confused about how to bind all the data together.

The data should look like this after binding: outputdata.xlsx
Input data file: inputdata.txt

What am I missing here? Any tool?
I believe your issue can be resolved by taking the input file and turning it into a CSV file, which you can then import into whatever program you like. I wrote a small generator that reads a file a line at a time and returns a row after a certain number of lines, in this case 60. In that generator, you can make whatever modifications to the data you need. Then each generated row is written directly to the csv. This should keep the memory requirements for the process pretty low. I didn't understand what you were doing with the regex split, but it would be simple enough to add it to the generator.

import csv

OUTPUT_FILE = "/home/ashfaque/Desktop/File handling/outputfile.csv"
INPUT_FILE = "/home/ashfaque/Desktop/File handling/inputfile.txt"

# This is a generator that will pull only num number of items into
# memory at a time, before it yields the row.
def get_rows(path, num):
    row = []
    with open(path, "r", encoding="ISO-8859-1") as f:
        for n, l in enumerate(f):
            # apply whatever transformations that you need to here.
            row.append(l.rstrip())
            if (n + 1) % num == 0:
                # if rows need padding then do it here.
                yield row
                row = []

with open(OUTPUT_FILE, "w", newline="") as output:  # newline="" avoids blank lines on Windows
    csv_writer = csv.writer(output)
    for r in get_rows(INPUT_FILE, 60):
        csv_writer.writerow(r)
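Following up on the regex split the answer mentions: here is a hypothetical variant of the same generator with the question's re.split(': |,', ...) applied to each line, so that every field becomes its own CSV cell. Whether the fields should be flattened into one row this way is an assumption:

import csv
import re

def get_split_rows(path, num, pattern=': |,'):
    # like get_rows above, but each physical line may contribute several fields
    row = []
    with open(path, "r", encoding="ISO-8859-1") as f:
        for n, line in enumerate(f):
            row.extend(re.split(pattern, line.rstrip()))
            if (n + 1) % num == 0:
                yield row
                row = []

# usage mirrors the loop above:
# for r in get_split_rows(INPUT_FILE, 60):
#     csv_writer.writerow(r)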
Writing data into CSV file
I have code that is basically doing this:

row1 = []
count = 0
writer = csv.writer(myFile)
row = []
for j in range(0, 2):
    for i in range(0, 4):
        row1.append(i + count)
        count = count + 1
    print(row1)
    writer.writerows(row1)
    row1[:] = []

I'm creating some lists and I want to map each value to a column, but this error shows up: iterable expected. How can I do that?
#roganjosh is right: what you need to write one row at a time is writerow:

import csv

myFile = open("aaa.csv", "w", newline="")
row1 = []
count = 0
writer = csv.writer(myFile)
row = []
for j in range(0, 2):
    for i in range(0, 4):
        row1.append(i + count)
        count = count + 1
    print(row1)
    writer.writerow(row1)
    row1[:] = []
myFile.close()  # Don't forget to close your file
You probably need to call the method .writerow() instead of the plural .writerows(), because you write a single line to the file on each call; the plural method writes multiple lines at once. Or you could restructure your code like this to write all the lines at the end:

import csv

row_list = []
for j in range(2):
    row = [j + i for i in range(4)]
    row_list.append(row)

# row_list = [
#     [j + i for i in range(4)]
#     for j in range(2)]

with open('filename.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(row_list)
It's much simpler and easier to manipulate tabular data in pandas -- is there a reason you don't want to use pandas?

import pandas as pd

df = pd.DataFrame()
for i in range(4):
    df[i] = range(i, i + 4)

# Any other data wrangling

df.to_csv("file.csv")
Counting data within ranges in csv
I have some data which I need to break down into manageable chunks. With the following data, I need to count the number of times x occurs in column 11 with column 7 being a 1, and how many times x occurs in column 11 overall. I need to put them into the first line of a csv. After that I need to count the same thing, but with column 11 falling into the following brackets: 0, >0 but <0.05, >0.05 but <0.10, >0.1 but <0.15, and so on all the way up to 1.00. All of these would ideally be appended to the same new.csv, i.e. not the main data csv.

Some example raw data that fits the above description (please note that a lot of the brackets will contain no data, in which case they would need to return 0,0):

01/01/2002,Data,class1,4,11yo+,4,1,George Smith,0,0,x
01/01/2002,Data,class1,4,11yo+,4,2,Ted James,0,0,x
01/01/2002,Data,class1,4,11yo+,4,3,Emma Lilly,0,0,x
01/01/2002,Data,class1,4,11yo+,4,5,George Smith,0,0,x
02/01/2002,Data,class2,4,10yo+,6,4,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,2,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,5,George Smith,1,2,0.5
02/01/2002,Data,class2,4,10yo+,6,3,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,1,Emma Lilly,0,1,0
02/01/2002,Data,class2,4,10yo+,6,6,George Smith,1,2,0.5
03/01/2002,Data,class3,4,10yo+,6,6,Ted James,0,1,0
03/01/2002,Data,class3,4,10yo+,6,3,Tom Phillips,0,3,0
03/01/2002,Data,class3,4,10yo+,6,2,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,4,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,1,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,5,Tom Phillips,0,3,0
04/01/2002,Data,class4,2,10yo+,5,3,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,1,Ted James,0,2,0
04/01/2002,Data,class4,2,10yo+,5,2,George Smith,2,7,0.285714286
04/01/2002,Data,class4,2,10yo+,5,4,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,5,Tom Phillips,0,5,0
05/01/2002,Data,class5,4,11yo+,4,1,George Smith,2,8,0.25
05/01/2002,Data,class5,4,11yo+,4,2,Ted James,1,3,0.333333333
05/01/2002,Data,class5,4,11yo+,4,3,Emma Lilly,1,4,0.25
05/01/2002,Data,class5,4,11yo+,4,5,George Smith,2,8,0.25
06/01/2002,Data,class6,4,10yo+,6,4,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,2,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,5,George Smith,3,10,0.3
06/01/2002,Data,class6,4,10yo+,6,3,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,1,Emma Lilly,1,5,0.2
06/01/2002,Data,class6,4,10yo+,6,6,George Smith,3,10,0.3
07/01/2002,Data,class7,4,10yo+,6,6,Ted James,1,4,0.25
07/01/2002,Data,class7,4,10yo+,6,3,Tom Phillips,0,9,0
07/01/2002,Data,class7,4,10yo+,6,2,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,4,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,1,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,5,Tom Phillips,0,9,0
08/01/2002,Data,class8,2,10yo+,5,3,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,1,Ted James,1,5,0.2
08/01/2002,Data,class8,2,10yo+,5,2,George Smith,4,15,0.266666667
08/01/2002,Data,class8,2,10yo+,5,4,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,5,Tom Phillips,0,11,0
09/01/2002,Data,class9,4,11yo+,4,1,George Smith,4,16,0.25
09/01/2002,Data,class9,4,11yo+,4,2,Ted James,2,6,0.333333333
09/01/2002,Data,class9,4,11yo+,4,3,Emma Lilly,2,8,0.25
09/01/2002,Data,class9,4,11yo+,4,5,George Smith,4,16,0.25
10/01/2002,Data,class10,4,10yo+,6,4,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,2,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,5,George Smith,5,18,0.277777778
10/01/2002,Data,class10,4,10yo+,6,3,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,1,Emma Lilly,2,9,0.222222222
10/01/2002,Data,class10,4,10yo+,6,6,George Smith,5,18,0.277777778
11/01/2002,Data,class11,4,10yo+,6,6,Ted James,2,7,0.285714286
11/01/2002,Data,class11,4,10yo+,6,3,Tom Phillips,0,15,0
11/01/2002,Data,class11,4,10yo+,6,2,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,4,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,1,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,5,Tom Phillips,0,15,0
12/01/2002,Data,class12,2,10yo+,5,3,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,1,Ted James,2,8,0.25
12/01/2002,Data,class12,2,10yo+,5,2,George Smith,6,23,0.260869565
12/01/2002,Data,class12,2,10yo+,5,4,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,5,Tom Phillips,0,17,0
13/01/2002,Data,class13,4,11yo+,4,1,George Smith,6,24,0.25
13/01/2002,Data,class13,4,11yo+,4,2,Ted James,3,9,0.333333333
13/01/2002,Data,class13,4,11yo+,4,3,Emma Lilly,3,12,0.25
13/01/2002,Data,class13,4,11yo+,4,5,George Smith,6,24,0.25
14/01/2002,Data,class14,4,10yo+,6,4,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,2,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,5,George Smith,7,26,0.269230769
14/01/2002,Data,class14,4,10yo+,6,3,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,1,Emma Lilly,3,13,0.230769231
14/01/2002,Data,class14,4,10yo+,6,6,George Smith,7,26,0.269230769
15/01/2002,Data,class15,4,10yo+,6,6,Ted James,3,10,0.3

If anybody can help me achieve this I will be truly grateful. If this requires more detail, please ask. One last note: the main data csv in question has 800k rows.

EDIT: Currently the output file appears as follows, using the code supplied by #user650654:

data1,data2

If at all possible I would like the code changed slightly to output two more things. Hopefully these are not too difficult to do. Proposed changes to the output file (commas separate the fields of each new row):

- Column 1 = a title labeling the row, i.e. the data range being counted in the original question (e.g. "x" or "0:0.05")
- Column 2 = the calculated average of the values that fell within that particular range (e.g. "0.02469")
- data1 & data2 = the two values the question initially asked for

So in reality it would probably look like this:

x,n/a,data1,data2
0:0.05,0.02469,data1,data2
0.05:0.1,0.5469,data1,data2
....

Many thanks, AEA
Here is a solution for adding the two new fields:

import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    bins = numpy.arange(0, 1.05, 0.05)
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []

    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as, say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)

    all_array = numpy.array(all_array)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)

    bin_ranges = ['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])]
    digitized = numpy.digitize(all_array, bins)
    bin_means = [all_array[digitized == i].mean() if hist_all[i - 1] else 'n/a'
                 for i in range(1, len(bins))]

    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow(['x', 'n/a', col7one_x, total_x])
        writer.writerow(['0', 0 if total_zeros else 'n/a', col7one_zeros, total_zeros])
        for row in zip(bin_ranges, bin_means, hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
This might work:

import numpy as np
import pandas as pd

# names to be used as column labels; if no names are specified, columns can be
# referred to by number, e.g. df[0], df[1] etc.
column_names = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7',
                'col8', 'col9', 'col10', 'col11']

# header=None means there are no column headings in the csv file
df = pd.read_csv('data.csv', header=None, names=column_names)

# trick so that 'x' rows will be grouped into a category >-0.1 and <=-0.05;
# this allows all of col11 to be treated as numbers
df.ix[df.col11 == 'x', 'col11'] = -0.08

# bins to put col11 values in; >-0.1 and <=-0.05 will be our special 'x' rows,
# >-0.05 and <=0 will capture all the '0' values
bins = np.arange(-0.1, 1.0, 0.05)

# create labels for the bins
labels = np.array(['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])])
labels[0] = 'x'  # change first bin label to 'x'
labels[1] = '0'  # change second bin label to '0'

df['col11'] = df['col11'].astype(float)  # convert col11 to numbers so we can do math on them

# make another column 'bin' holding an integer for the bin each number falls into;
# later we'll map the integer to the bin label
df['bin'] = pd.cut(df['col11'], bins=bins, labels=False)
df.set_index('bin', inplace=True, drop=False, append=False)  # groupby is meant to run faster with an index

def count_ones(x):
    """aggregate function to count values that equal 1"""
    return np.sum(x == 1)

# group by the bin number and apply aggregate functions to the specified columns
dfg = df[['bin', 'col7', 'col11']].groupby('bin').agg(
    {'col11': [np.mean], 'col7': [count_ones, len]})
dfg.index = labels[dfg.index]  # apply labels to bin numbers
dfg.ix['x', ('col11', 'mean')] = 'N/A'  # mean of 'x' rows is meaningless

print(dfg)
dfg.to_csv('new.csv')

which gave me

                 col7            col11
          count_ones  len         mean
x                  1    7          N/A
0                  2   21            0
0.15:0.2           2    2          0.2
0.2:0.25           9   22    0.2478632
0.25:0.3           0   13    0.2840755
0.3:0.35           0    5    0.3333333
0.45:0.5           0    4          0.5
This solution uses numpy.histogram. See below.

import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []

    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as, say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)

    bins = numpy.arange(0, 1.05, 0.05)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)

    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow([col7one_x, total_x])
        writer.writerow([col7one_zeros, total_zeros])
        for row in zip(hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
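One detail worth flagging in both numpy.histogram answers: numpy.arange with a float step, as in numpy.arange(0, 1.05, 0.05), can pick up floating-point error in the bin edges. numpy.linspace, which fixes the endpoint and the edge count instead of the step, is generally suggested as the safer way to build the same 20 bins; a small sketch of the equivalent call:

import numpy

# 21 evenly spaced edges from 0.0 to 1.0 -> 20 bins of width 0.05,
# with the 1.0 endpoint guaranteed exactly
bins = numpy.linspace(0.0, 1.0, 21)

# drop-in replacement for numpy.arange(0, 1.05, 0.05) in the answers above
hist, edges = numpy.histogram([0.03, 0.5, 0.97], bins=bins)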