Merge rows on two conditions - Python
I want to merge rows based on a condition: if a row's text is less than 20 characters long, combine it with the previous row. I already got help here merging rows when I had a single column, but my requirements have changed. I now have two columns, and the condition should be checked against the second column: whenever a row's value there is shorter than 20 characters, merge that text into the previous row and remove the row from both columns.
Below is the old code that merged and removed rows when I had only one column. I tried it, but it doesn't give me the result I need. Thank you for any help.
import csv
import pandas as pd

df = pd.read_csv('Test.csv')

with open('Output.csv', mode='w', newline='', encoding='utf-16') as f:
    writer = csv.writer(f, delimiter=' ')
    rows = []
    for i, data in enumerate(df['Sentence']):
        if i + 1 == len(df['Sentence']):
            writer.writerow([data])
        elif len(df['Sentence'][i + 1]) < 20:
            writer.writerow([data + df['Sentence'][i + 1]])
            df.drop(df.index[[i + 1]])  # note: drop() returns a new frame, so this line changes nothing in place
        elif len(df['Sentence'][i + 1]) >= 20:
            writer.writerow([data])
I solved this by making the merged row null and then removing the empty rows from the CSV:
df = pd.read_csv('test.csv', encoding='utf-8')

with open('output.csv', mode='w', newline='', encoding='utf-16') as f:
    writer = csv.writer(f, delimiter=' ')
    rows = []
    for i, data in enumerate(df['Sentence']):
        if i + 1 == len(df['Sentence']):
            writer.writerow([data])
        elif len(df['Sentence'][i + 1]) < 19:
            writer.writerow([data + df['Sentence'][i + 1]])
            df['Sentence'][i + 1] = ''  # blank the merged row; the empty rows are filtered out afterwards
        elif len(df['Sentence'][i + 1]) >= 19:
            writer.writerow([data])
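For the two-column case, a vectorized pandas approach may be easier to reason about than writing row by row. Below is a minimal sketch, not a drop-in for the code above: it assumes the first column is 'Sentence' and calls the second column 'Other', a placeholder name since the question never names it. Each row whose sentence is shorter than 20 characters is merged into the previous row and then dropped from both columns.

import pandas as pd

df = pd.read_csv('Test.csv')  # assumed columns: 'Sentence' and 'Other' (placeholder name)

# A new group starts at every row with at least 20 characters; shorter rows
# fall into the group of the previous long row. (Short rows before the first
# long row form a group of their own.)
is_long = df['Sentence'].str.len() >= 20
group = is_long.cumsum()

merged = df.groupby(group).agg({
    'Sentence': ''.join,  # append the short rows onto the previous sentence
    'Other': 'first',     # keep the second column's value from the surviving row
}).reset_index(drop=True)

merged.to_csv('Output.csv', index=False)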
Related
How to conditionally select a character from two strings, based upon another string
I have 2 files: fileA is composed of 1 row and fileB of 2 rows.

fileA (1 row):

*****s**e**********************************************q*

fileB (2 rows), where row 1 is the subject and row 2 is the query:

AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB

I need to produce an output file where, if the fileA string contains an s or a * at some index, the subject character at the corresponding index position is written to the output file; if there is a q or an e, the query character is written instead.

Output:

AAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABA

my code:

ff = open("filea.txt")
gg = open("fileb.txt")

file_as_list = ff.readline()
file_as_last = gg.readlines()
query = file_as_last[0]
subject = file_as_last[1]

for i in file_as_list:
    z = -1
    while z <= len(file_as_list):
        if i == "*":
            f = open('output.txt', 'a+', encoding='utf-8')
            f.write(subject[z])
            z += 1
        elif i == "s":
            f = open('output.txt', 'a+', encoding='utf-8')
            f.write(subject[z])
            z += 1
        elif i == "e":
            f = open('output.txt', 'a+', encoding='utf-8')
            f.write(query[z])
            z += 1
        elif i == "q":
            f = open('output.txt', 'a+', encoding='utf-8')
            f.write(query[z])
            z += 1
        break

It works more or less, but not properly: the loop only ever takes the first branch, and the output produced is just a copy of the subject.
- with open is used, so all files will be closed automatically
- each string is converted into a list, with .strip() removing \n and \r
- the lists are loaded into a pandas.DataFrame
- pandas.DataFrame.apply with axis=1 performs the row-wise operation
- np.where returns the correct character for each row
- the result is written out to a list, joined into a str, and written to the output.txt file

Code:

import pandas as pd
import numpy as np

with open('fileA.txt', 'r') as filA:
    with open('fileB.txt', 'r') as filB:
        with open('output.txt', 'w', newline='\n') as output:
            fil_a = filA.readline()
            fil_b = filB.readlines()

            sub = [x for x in fil_b[0].strip()]
            que = [x for x in fil_b[1].strip()]
            line = [x for x in fil_a.strip()]

            df = pd.DataFrame({'A': line, 'sub': sub, 'que': que})
            df['out'] = df.apply(lambda x: str(np.where(x[0] in ['*', 's'], x[1], x[2])), axis=1)

            out = df.out.to_list()
            out = ''.join(x for x in out)
            output.write(out)
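Since the task is a position-by-position choice among three equal-length strings, plain Python can also do it with zip and no pandas. A minimal sketch under the same file layout assumed above (fileA holds the pattern, fileB holds subject then query):

with open('fileA.txt') as fa, open('fileB.txt') as fb, \
        open('output.txt', 'w') as out:
    pattern = fa.readline().strip()
    subject, query = (line.strip() for line in fb.readlines()[:2])
    # subject character for '*' or 's', query character for 'e' or 'q'
    out.write(''.join(s if p in '*s' else q
                      for p, s, q in zip(pattern, subject, query)))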
Python CSV writerow to specific column in already opened file
I am struggling with the csv module and the writerow method.

NOTE: This is the code simplified as much as I could. I am asking for understanding and have provided a Minimal, Complete, and Verifiable example as far as I could.

WHAT I'VE GOT:

Three tables in the database:
- MODEL_test - contains the data on which the algorithm will learn
- my_prediction - contains unseen data to which the algorithm will be applied
- OUT_predictions - contains the output of the algorithm's predict method

In the first step, I create a new CSV file and keep it open until the iteration for the current algorithm is finished. Before the training iteration starts, I append rows to the CSV file with the first 7 values from the unseen-data table, so the data won't be multiplied. Then, after each algorithm iteration, I want to append the already opened file with the OUT_predictions values.

CODE:

import csv
import datetime
import sqlite3

def export_to_csv():
    ldb = sqlite3.connect('database.db')
    c = ldb.cursor()
    table_name = 'my_predictions'
    training_size = 3
    now = datetime.datetime.now()
    file_name = str.format('my_predictions {}', now.strftime("%Y-%m-%d %H %M %S"))
    export_columns = ['COLUMN ' + str(n) for n in range(1, 8)] + \
                     ['OUTPUT ' + str(n) for n in range(1, training_size + 1)]

    with open('archived/' + file_name + '.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(export_columns)
        output_writer = csv.DictWriter(csv_file, fieldnames=export_columns)

        for o in range(1, 500):  # <-- write all unseen data from the database to the csv
            c.execute(str.format('SELECT * FROM {} WHERE ID=?', table_name), [o])
            fetch_one = c.fetchone()
            writer.writerow(fetch_one[1:7])

        for t in range(training_size):  # for each iteration, write output to the csv
            # some machine learning training code
            prediction = [0, 0, 1, 1, 0, 1]  # <-- sample output from predictions
            combined_set = list(map(str, prediction))
            ids = 1
            for each in combined_set:
                c.execute(str.format('INSERT INTO OUTPUT_prediction VALUES ({})',
                          ",".join(["?" for _ in range(1, len([ids] + [int(each)]) + 1)])),
                          [ids] + [int(each)])
                ids += 1
            ldb.commit()

            for o in range(1, 500):  # <-- write output from the last prediction iteration to a specific column
                c.execute(str.format('SELECT * FROM {} WHERE ID=?', table_name), [o])
                fetch_output = c.fetchone()
                output_writer.writeheader()
                output_writer.writerow({'OUTPUT ' + str(t + 1): fetch_output[-1]})  # <-- columns remain empty

WHAT IS THE PROBLEM:

When the code finishes and I open the file, I can see that the OUTPUT columns remain empty.

EDIT: I don't want to use pandas and to_csv because they are very slow. Sometimes my unseen data has 1 million lines, and a single iteration takes half an hour using to_csv.
I know what I've done wrong and found a solution for this situation, but I'm not satisfied with it. When I try to add a new column in w mode, new data is always written at the end of the file. When I set csv_file.seek(0), old data is overwritten. I also tried reopening the file in r+ mode with csv_file.seek(0), but got the same outcome.

I will use xlwings for this task, because it gives me more control, but I still don't know how it will affect input speed. My goal is to prepare a summary report with the unseen data, the output for each iteration, and statistical information.

SOLUTION (with r+):

now = datetime.datetime.now()
file_name = str.format('my_predictions {}', now.strftime("%Y-%m-%d %H %M %S"))
export_columns = ['COLUMN ' + str(n) for n in range(1, 8)] + \
                 ['OUTPUT ' + str(n) for n in range(1, training_size + 1)]

with open('archived/' + file_name + '.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(export_columns)

    for o in range(1, 500):
        c.execute(str.format('SELECT * FROM {} WHERE ID=?', table_name), [o])
        fetch_one = c.fetchone()
        writer.writerow(fetch_one[1:7])

for t in range(training_size):
    # some machine learning training code
    prediction = [0, 0, 1, 1, 0, 1]  # <-- sample output from predictions
    combined_set = list(map(str, prediction))

    # ids = 1
    # for each in combined_set:
    #     c.execute(str.format('INSERT INTO OUTPUT_prediction VALUES ({})',
    #               ",".join(["?" for _ in range(1, len([ids] + [int(each)]) + 1)])), [ids] + [int(each)])
    #     ids += 1
    # ldb.commit()

    with open('archived/' + file_name + '.csv', 'r+', newline='') as csv_file:
        writer = csv.writer(csv_file)
        csv_input = csv.reader(csv_file)
        rows = list(csv_input)
        writer.writerow(export_columns)
        for row, o in zip(rows, combined_set):
            row += [o]
            writer.writerow(row)
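If holding everything in memory once is acceptable, another way around the seek problem is to accumulate each iteration's outputs in a list and write the whole file in a single pass at the end, so no column ever has to be patched in afterwards. A minimal sketch with placeholder data and column names, not the database code from above:

import csv

base_rows = [['a', 1], ['b', 2], ['c', 3]]  # the per-record base columns (placeholder data)
outputs = [[0, 1, 0], [1, 1, 0]]            # one list of predictions per training iteration

header = ['COLUMN 1', 'COLUMN 2'] + \
         ['OUTPUT ' + str(i + 1) for i in range(len(outputs))]

with open('my_predictions.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    # zip(*outputs) regroups the per-iteration lists into per-row tuples
    for base, outs in zip(base_rows, zip(*outputs)):
        writer.writerow(base + list(outs))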
File data binding with column names
I have files with hundreds of thousands of rows of data, but they come without any column names. I am trying to go through every file, read it row by row, and store the rows in a list; after that, I want to assign values to columns. But here I am confused about what to do, because there are around 60 values in every row, plus some extra columns with assigned values that should be added to every row.

Code so far:

import re
import glob

filenames = glob.glob("/home/ashfaque/Desktop/filetocsvsample/inputfiles/*.txt")

columns = []
with open("/home/ashfaque/Downloads/coulmn names.txt", encoding="ISO-8859-1") as f:
    file_data = f.read()
    lines = file_data.splitlines()
    for l in lines:
        columns.append(l.rstrip())

total = {}
for name in filenames:
    modified_data = []
    with open(name, encoding="ISO-8859-1") as f:
        file_data = f.read()
        lines = file_data.splitlines()
        for l in lines:
            if len(l) >= 1:
                modified_data.append(re.split(': |,', l))

    rows = []
    i = len(modified_data)
    x = 0
    while i > 60:
        r = lines[x:x + 59]
        x = x + 60
        i = i - 60
        rows.append(r)

    z = len(modified_data)
    while z >= 60:
        z = z - 60

    if z > 1:
        last_columns = modified_data[-z:]
        x = []
        for l in last_columns:
            if len(l) > 1:
                del l[0]
                x.append(l)
            elif len(l) == 1:
                x.append(l)

        for row in rows:
            for vl in x:
                row.append(vl)

    for r in rows:
        for i in range(0, len(r)):
            if len(r) >= 60:
                total.setdefault(columns[i], []).append(r[i])

In another script I have separated both the rows with 60 values and the last 5 to 15 columns that should be added to each row, but again I am confused about how to bind all the data together.

The data should look like this after binding: outputdata.xlsx
Input data file: inputdata.txt

What am I missing here? Any tool?
I believe your issue can be resolved by taking the input file and turning it into a CSV file, which you can then import into whatever program you like. I wrote a small generator that reads a file a line at a time and returns a row after a certain number of lines, in this case 60. In that generator, you can make whatever modifications to the data you need. Then each generated row is written directly to the csv. This should keep the memory requirements for the process pretty low. I didn't understand what you were doing with the regex split, but it would be simple enough to add it to the generator.

import csv

OUTPUT_FILE = "/home/ashfaque/Desktop/File handling/outputfile.csv"
INPUT_FILE = "/home/ashfaque/Desktop/File handling/inputfile.txt"

# This is a generator that will pull only num number of items into
# memory at a time, before it yields the row.
def get_rows(path, num):
    row = []
    with open(path, "r", encoding="ISO-8859-1") as f:
        for n, l in enumerate(f):
            # apply whatever transformations that you need to here.
            row.append(l.rstrip())
            if (n + 1) % num == 0:
                # if rows need padding then do it here.
                yield row
                row = []

with open(OUTPUT_FILE, "w", newline="") as output:  # newline="" avoids blank lines on Windows
    csv_writer = csv.writer(output)
    for r in get_rows(INPUT_FILE, 60):
        csv_writer.writerow(r)
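Following up on the regex split the answer mentions: here is a hypothetical variant of the same generator with the question's re.split(': |,', ...) applied to each line, so that every field becomes its own CSV cell. Whether the fields should be flattened into one row this way is an assumption:

import csv
import re

def get_split_rows(path, num, pattern=': |,'):
    # like get_rows above, but each physical line may contribute several fields
    row = []
    with open(path, "r", encoding="ISO-8859-1") as f:
        for n, line in enumerate(f):
            row.extend(re.split(pattern, line.rstrip()))
            if (n + 1) % num == 0:
                yield row
                row = []

# usage mirrors the loop above:
# for r in get_split_rows(INPUT_FILE, 60):
#     csv_writer.writerow(r)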
Writing data into CSV file
I have code that is basically doing this:

row1 = []
count = 0
writer = csv.writer(myFile)
row = []
for j in range(0, 2):
    for i in range(0, 4):
        row1.append(i + count)
        count = count + 1
    print(row1)
    writer.writerows(row1)
    row1[:] = []

I'm creating some lists and I want to map each value to a column, but this error shows up: iterable expected. How can I do that?
#roganjosh is right: what you need to write one row at a time is writerow:

import csv

myFile = open("aaa.csv", "w", newline="")
row1 = []
count = 0
writer = csv.writer(myFile)
row = []
for j in range(0, 2):
    for i in range(0, 4):
        row1.append(i + count)
        count = count + 1
    print(row1)
    writer.writerow(row1)
    row1[:] = []
myFile.close()  # Don't forget to close your file
You probably need to call the method .writerow() instead of the plural .writerows(), because you write a single line to the file on each call; the plural method writes multiple lines at once. Or you could restructure your code like this to write all the lines at the end:

import csv

row_list = []
for j in range(2):
    row = [j + i for i in range(4)]
    row_list.append(row)

# row_list = [
#     [j + i for i in range(4)]
#     for j in range(2)]

with open('filename.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(row_list)
It's much simpler and easier to manipulate tabular data in pandas -- is there a reason you don't want to use pandas?

import pandas as pd

df = pd.DataFrame()
for i in range(4):
    df[i] = range(i, i + 4)

# Any other data wrangling

df.to_csv("file.csv")
Counting data within ranges in csv
I have some data which I need to break down into manageable chunks. With the following data, I need to count the number of times x occurs in column 11 with column 7 being a 1, and how many times x occurs in column 11 overall. I need to put them into the first line of a csv. After that I need to count the same thing, but with column 11 falling into the following brackets: 0, >0 but <0.05, >0.05 but <0.10, >0.1 but <0.15, and so on all the way up to 1.00. All of these would ideally be appended to the same new.csv, i.e. not the main data csv.

Some example raw data that fits the above description (please note that a lot of the brackets will contain no data, in which case they would need to return 0,0):

01/01/2002,Data,class1,4,11yo+,4,1,George Smith,0,0,x
01/01/2002,Data,class1,4,11yo+,4,2,Ted James,0,0,x
01/01/2002,Data,class1,4,11yo+,4,3,Emma Lilly,0,0,x
01/01/2002,Data,class1,4,11yo+,4,5,George Smith,0,0,x
02/01/2002,Data,class2,4,10yo+,6,4,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,2,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,5,George Smith,1,2,0.5
02/01/2002,Data,class2,4,10yo+,6,3,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,1,Emma Lilly,0,1,0
02/01/2002,Data,class2,4,10yo+,6,6,George Smith,1,2,0.5
03/01/2002,Data,class3,4,10yo+,6,6,Ted James,0,1,0
03/01/2002,Data,class3,4,10yo+,6,3,Tom Phillips,0,3,0
03/01/2002,Data,class3,4,10yo+,6,2,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,4,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,1,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,5,Tom Phillips,0,3,0
04/01/2002,Data,class4,2,10yo+,5,3,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,1,Ted James,0,2,0
04/01/2002,Data,class4,2,10yo+,5,2,George Smith,2,7,0.285714286
04/01/2002,Data,class4,2,10yo+,5,4,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,5,Tom Phillips,0,5,0
05/01/2002,Data,class5,4,11yo+,4,1,George Smith,2,8,0.25
05/01/2002,Data,class5,4,11yo+,4,2,Ted James,1,3,0.333333333
05/01/2002,Data,class5,4,11yo+,4,3,Emma Lilly,1,4,0.25
05/01/2002,Data,class5,4,11yo+,4,5,George Smith,2,8,0.25
06/01/2002,Data,class6,4,10yo+,6,4,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,2,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,5,George Smith,3,10,0.3
06/01/2002,Data,class6,4,10yo+,6,3,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,1,Emma Lilly,1,5,0.2
06/01/2002,Data,class6,4,10yo+,6,6,George Smith,3,10,0.3
07/01/2002,Data,class7,4,10yo+,6,6,Ted James,1,4,0.25
07/01/2002,Data,class7,4,10yo+,6,3,Tom Phillips,0,9,0
07/01/2002,Data,class7,4,10yo+,6,2,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,4,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,1,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,5,Tom Phillips,0,9,0
08/01/2002,Data,class8,2,10yo+,5,3,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,1,Ted James,1,5,0.2
08/01/2002,Data,class8,2,10yo+,5,2,George Smith,4,15,0.266666667
08/01/2002,Data,class8,2,10yo+,5,4,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,5,Tom Phillips,0,11,0
09/01/2002,Data,class9,4,11yo+,4,1,George Smith,4,16,0.25
09/01/2002,Data,class9,4,11yo+,4,2,Ted James,2,6,0.333333333
09/01/2002,Data,class9,4,11yo+,4,3,Emma Lilly,2,8,0.25
09/01/2002,Data,class9,4,11yo+,4,5,George Smith,4,16,0.25
10/01/2002,Data,class10,4,10yo+,6,4,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,2,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,5,George Smith,5,18,0.277777778
10/01/2002,Data,class10,4,10yo+,6,3,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,1,Emma Lilly,2,9,0.222222222
10/01/2002,Data,class10,4,10yo+,6,6,George Smith,5,18,0.277777778
11/01/2002,Data,class11,4,10yo+,6,6,Ted James,2,7,0.285714286
11/01/2002,Data,class11,4,10yo+,6,3,Tom Phillips,0,15,0
11/01/2002,Data,class11,4,10yo+,6,2,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,4,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,1,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,5,Tom Phillips,0,15,0
12/01/2002,Data,class12,2,10yo+,5,3,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,1,Ted James,2,8,0.25
12/01/2002,Data,class12,2,10yo+,5,2,George Smith,6,23,0.260869565
12/01/2002,Data,class12,2,10yo+,5,4,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,5,Tom Phillips,0,17,0
13/01/2002,Data,class13,4,11yo+,4,1,George Smith,6,24,0.25
13/01/2002,Data,class13,4,11yo+,4,2,Ted James,3,9,0.333333333
13/01/2002,Data,class13,4,11yo+,4,3,Emma Lilly,3,12,0.25
13/01/2002,Data,class13,4,11yo+,4,5,George Smith,6,24,0.25
14/01/2002,Data,class14,4,10yo+,6,4,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,2,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,5,George Smith,7,26,0.269230769
14/01/2002,Data,class14,4,10yo+,6,3,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,1,Emma Lilly,3,13,0.230769231
14/01/2002,Data,class14,4,10yo+,6,6,George Smith,7,26,0.269230769
15/01/2002,Data,class15,4,10yo+,6,6,Ted James,3,10,0.3

If anybody can help me achieve this I will be truly grateful. If this requires more detail, please ask. One last note: the main data csv in question has 800k rows.

EDIT: Currently the output file appears as follows, using the code supplied by #user650654:

data1,data2

If at all possible I would like the code changed slightly to output two more things. Hopefully these are not too difficult to do. Proposed changes to the output file (commas separate the fields of each new row):

- Column 1 = a title labeling the row, i.e. the data range being counted in the original question (e.g. "x" or "0:0.05")
- Column 2 = the calculated average of the values that fell within that particular range (e.g. "0.02469")
- data1 & data2 = the two values the question initially asked for

So in reality it would probably look like this:

x,n/a,data1,data2
0:0.05,0.02469,data1,data2
0.05:0.1,0.5469,data1,data2
....

Many thanks, AEA
Here is a solution for adding the two new fields:

import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    bins = numpy.arange(0, 1.05, 0.05)
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []

    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as, say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)

    all_array = numpy.array(all_array)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)

    bin_ranges = ['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])]
    digitized = numpy.digitize(all_array, bins)
    bin_means = [all_array[digitized == i].mean() if hist_all[i - 1] else 'n/a'
                 for i in range(1, len(bins))]

    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow(['x', 'n/a', col7one_x, total_x])
        writer.writerow(['0', 0 if total_zeros else 'n/a', col7one_zeros, total_zeros])
        for row in zip(bin_ranges, bin_means, hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
This might work:

import numpy as np
import pandas as pd

# names to be used as column labels; if no names are specified, columns can be
# referred to by number, e.g. df[0], df[1] etc.
column_names = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7',
                'col8', 'col9', 'col10', 'col11']

# header=None means there are no column headings in the csv file
df = pd.read_csv('data.csv', header=None, names=column_names)

# trick so that 'x' rows will be grouped into a category >-0.1 and <=-0.05;
# this allows all of col11 to be treated as numbers
df.ix[df.col11 == 'x', 'col11'] = -0.08

# bins to put col11 values in; >-0.1 and <=-0.05 will be our special 'x' rows,
# >-0.05 and <=0 will capture all the '0' values
bins = np.arange(-0.1, 1.0, 0.05)

# create labels for the bins
labels = np.array(['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])])
labels[0] = 'x'  # change first bin label to 'x'
labels[1] = '0'  # change second bin label to '0'

df['col11'] = df['col11'].astype(float)  # convert col11 to numbers so we can do math on them

# make another column 'bin' holding an integer for the bin each number falls into;
# later we'll map the integer to the bin label
df['bin'] = pd.cut(df['col11'], bins=bins, labels=False)
df.set_index('bin', inplace=True, drop=False, append=False)  # groupby is meant to run faster with an index

def count_ones(x):
    """aggregate function to count values that equal 1"""
    return np.sum(x == 1)

# group by the bin number and apply aggregate functions to the specified columns
dfg = df[['bin', 'col7', 'col11']].groupby('bin').agg(
    {'col11': [np.mean], 'col7': [count_ones, len]})
dfg.index = labels[dfg.index]  # apply labels to bin numbers
dfg.ix['x', ('col11', 'mean')] = 'N/A'  # mean of 'x' rows is meaningless

print(dfg)
dfg.to_csv('new.csv')

which gave me

                 col7            col11
          count_ones  len         mean
x                  1    7          N/A
0                  2   21            0
0.15:0.2           2    2          0.2
0.2:0.25           9   22    0.2478632
0.25:0.3           0   13    0.2840755
0.3:0.35           0    5    0.3333333
0.45:0.5           0    4          0.5
This solution uses numpy.histogram. See below.

import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []

    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as, say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)

    bins = numpy.arange(0, 1.05, 0.05)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)

    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow([col7one_x, total_x])
        writer.writerow([col7one_zeros, total_zeros])
        for row in zip(hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
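One detail worth flagging in both numpy.histogram answers: numpy.arange with a float step, as in numpy.arange(0, 1.05, 0.05), can pick up floating-point error in the bin edges. numpy.linspace, which fixes the endpoint and the edge count instead of the step, is generally suggested as the safer way to build the same 20 bins; a small sketch of the equivalent call:

import numpy

# 21 evenly spaced edges from 0.0 to 1.0 -> 20 bins of width 0.05,
# with the 1.0 endpoint guaranteed exactly
bins = numpy.linspace(0.0, 1.0, 21)

# drop-in replacement for numpy.arange(0, 1.05, 0.05) in the answers above
hist, edges = numpy.histogram([0.03, 0.5, 0.97], bins=bins)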