Python pandas: find next higher and next lower number
I want to find the next value in 'y' relative to my val_test. In this example I want
next_up = 4
next_down = 6
(up and down refer to the coordinate system, not to the number value)
import pandas as pd
import numpy as np

val_test = 5
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})

df = df.sort_values('y', ascending=True)
next_up = int(df.y.iloc[np.searchsorted(df.y.values, [val_test])])

df = df.sort_values('y', ascending=False)
next_down = int(df.y.iloc[np.searchsorted(df.y.values, [val_test])])

print('next_up = ', next_up)
print('next_down = ', next_down)
Perhaps you could stay only within Pandas.
There is no need to sort twice or use Numpy.
import pandas as pd

val_test = 5
col = 'y'

df = pd.DataFrame({
    'x': [1,1,1,2,2,3,3,3,4,4,4],
    'y': [1,2,3,4,5,6,7,15,21,8,9]})

# the default for sort_values is ascending=True
# create a sorted dataframe whose index corresponds to the ascending order
df_sorted = df.sort_values(col).reset_index(drop=True)

# find the index in the sorted dataframe that corresponds to val_test
idx_val_test = df_sorted.loc[df_sorted[col] == val_test].index[0]

# select the next value up in this sorted dataframe
next_up = df_sorted[col].iloc[idx_val_test + 1]

# select the next value down in this sorted dataframe
next_down = df_sorted[col].iloc[idx_val_test - 1]

print('next_up = ', next_up)
print('next_down = ', next_down)
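One boundary case to keep in mind: if val_test is the smallest or largest value in the column, idx_val_test - 1 silently wraps around to the last row and idx_val_test + 1 raises an IndexError. A minimal guard, reusing df_sorted, col and idx_val_test from the code above (the None fallback is just an illustrative choice):

next_up = (df_sorted[col].iloc[idx_val_test + 1]
           if idx_val_test + 1 < len(df_sorted) else None)    # no higher neighbour exists
next_down = (df_sorted[col].iloc[idx_val_test - 1]
             if idx_val_test > 0 else None)                    # no lower neighbour exists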
I can suggest two cases, depending on what you need. For example:
CASE 1: You can work with the real positions while keeping the boundaries in mind.
import pandas as pd

df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})
val_test = 9

lista = df['y'].tolist()   # the 'y' values as a plain list, in their original order

for i in range(len(lista)):
    if i == 0:
        if lista[i] == val_test:
            next_up = lista[i+1]
            next_down = 'First number'
            break
    elif i != 0:
        if lista[i] == val_test:
            try:
                next_up = lista[i+1]
            except IndexError:
                next_up = 'Last number'
            try:
                next_down = lista[i-1]
            except IndexError:
                next_down = 'First number'
            break

print('next_up = ', next_up)
print('next_down = ', next_down)
OUTPUT CASE #1
val_test = 9
next_up = Last number
next_down = 8
# but if you use
val_test = 5
next_up = 6
next_down = 4
Cool !!
CASE 2: You can have problems with the first and last index, although you can use:
import numpy as np
import pandas as pd

val_test = 15
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})

next_up = int(df.y.iloc[np.searchsorted(df['y'].values, [val_test]) + 1])
next_down = int(df.y.iloc[np.searchsorted(df['y'].values, [val_test]) - 1])

print('next_up = ', next_up)
print('next_down = ', next_down)
OUTPUT CASE #2
val_test = 15
next_up = 21
next_down = 7
But you can have this problem when using val_test=9
val_test = 9
next_up = 21
next_down = 7
😓
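A minimal sketch of one way around both problems (an illustration, not something proposed in the answers above): np.searchsorted assumes a sorted array, so sorting the values once and clamping the neighbour positions keeps the lookup correct for values like val_test = 9 and safe at both ends:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})
val_test = 9

y_sorted = np.sort(df['y'].values)
pos = np.searchsorted(y_sorted, val_test)              # position of val_test in the sorted values
next_up = y_sorted[min(pos + 1, len(y_sorted) - 1)]    # clamp at the upper end
next_down = y_sorted[max(pos - 1, 0)]                  # clamp at the lower end

print('next_up = ', next_up)      # 15
print('next_down = ', next_down)  # 8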
Related
Merge / Loop thru data
Hello, I need to get insights from this data. The desired_result below comes from VBA code that compares the two sheets; desired_result is checked and 100% accurate. I would appreciate it if someone could assist me to get the desired output. The conditions are: err['p'] == scr['p'] & err['errd'] >= scr['scrd'] & err['errq'] - scr['scrq'] >= 0. It is all about checking how many of scr['n'] went through err, but if one passes through err then err['errq'] -= scr['scrq'] and we jump to the next scr['p'] item; the scr['n'] values are unique. Please see the sample code below:

import pandas as pd

err = pd.DataFrame({
    'p' : ['10100.A','10101.A','10101.A','10101.A','10102.A','10102.A','10102.A','10103.A','10103.A','10147.A','10147.A'],
    'errd' : ['18-5-2022','16-5-2022','4-5-2022','13-5-2022','9-5-2022','2-5-2022','29-5-2022','6-5-2022','11-5-2022','25-5-2022','6-5-2022'],
    'errq' : [1, 1, 1, 1, 1, 2, 46, 1, 4, 1, 5]})
err = err.sort_values('errd')

scr = pd.DataFrame({
    'p' : ['10101.A','10101.A','10101.A','10102.A','10102.A','10102.A','10103.A','10147.A','10147.A','10147.A','10147.A','10147.A'],
    'scrd' : ['10-5-2022','10-5-2022','9-5-2022','13-5-2022','9-5-2022','9-5-2022','25-5-2022','6-5-2022','6-5-2022','6-5-2022','6-5-2022','11-5-2022'],
    'scrq' : [1,1,1,1,1,1,1,1,1,1,1,1],
    'n' : ['7000000051481339','7000000051481342','7000000051722237','7000000052018581','7000000051721987','7000000051721990','7000000052725251','7000000051530150','7000000051530152','7000000051530157','7000000051546193','7000000051761150']})

desired_result = pd.DataFrame({
    'report' : ['7000000051722237','7000000051481339','7000000051721987','7000000051721990','7000000052018581','7000000051530150','7000000051530152','7000000051530157','7000000051546193','7000000051761150'],
    'match_err_scr' : ['10101.A','10101.A','10102.A','10102.A','10102.A','10147.A','10147.A','10147.A','10147.A','10147.A']})

What I have tried so far:

match = []
# Iterating scr rows
for i, row in scr.iterrows():
    # Checking for a match; row is the full row in scr
    if row['scrq'] <= err[(err['p'] == row['p']) & (err['errd'] >= row['scrd'])]['errq'].sum():
        r = row.to_dict()
        match.append(r)

# Creating the new data frame
report = pd.DataFrame(match)
report

Merge left, filter later:

report1 = scr.merge(err, how='left', on='p')
flt = (report1['errd'] >= report1['scrd']) & (report1['errq'] - report1['scrq'] >= 0)
report1 = report1.loc[flt]
report1 = report1.drop_duplicates(subset=['n'])
report1

Nested loop, way too slow and again not correct:

match = []
for i, row in scr.iterrows():
    for e, erow in err.iterrows():
        if (row['p'] == erow['p']) & (erow['errd'] >= row['scrd']) & (erow['errq'] - row['scrq'] >= 0):
            err['errq'][e] -= row['scrq']
            row_to_dict = row.to_dict()
            match.append(row_to_dict)
            break

report2 = pd.DataFrame(match)
report2
Not an answer, but required to help understand the question. @B02T, this is what I am seeing as a slice of the data. Am I correct that you are only comparing scr.loc[0] to err.loc[3], scr.loc[1] to err.loc[1] and scr.loc[2] to err.loc[2]? Or are you comparing each row in scr to each row in err? Looking at the desired_result, I don't understand how scr.loc[2] could be in the desired_result since, using err.loc[2], (err['errd'] >= scr['scrd']) evaluates to False. And, following the same methodology, scr.loc[1] should be in desired_result.

>>> err[err['p'] == '10101.A']
         p       errd  errq
3  10101.A  13-5-2022     1
1  10101.A  16-5-2022     1
2  10101.A   4-5-2022     1
>>> scr[scr['p'] == '10101.A']
         p       scrd  scrq                 n
0  10101.A  10-5-2022     1  7000000051481339
1  10101.A  10-5-2022     1  7000000051481342
2  10101.A   9-5-2022     1  7000000051722237
>>> desired_result
             report match_err_scr
0  7000000051722237       10101.A
1  7000000051481339       10101.A
2  7000000051721987       10102.A
3  7000000051721990       10102.A
4  7000000052018581       10102.A
5  7000000051530150       10147.A
6  7000000051530152       10147.A
7  7000000051530157       10147.A
8  7000000051546193       10147.A
9  7000000051761150       10147.A
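For what it is worth, here is a rough sketch of the greedy matching the question seems to describe. The function name, the dayfirst=True date parsing and the "first candidate wins" rule are all assumptions for illustration, and the result has not been verified against desired_result:

import pandas as pd

def greedy_match(err, scr):
    # Work on copies and parse the day-first date strings so the >= comparison is chronological
    err = err.copy()
    scr = scr.copy()
    err['errd'] = pd.to_datetime(err['errd'], dayfirst=True)
    scr['scrd'] = pd.to_datetime(scr['scrd'], dayfirst=True)

    matches = []
    for _, row in scr.iterrows():
        # candidate err rows: same product, err date on or after the scr date,
        # and enough remaining quantity
        mask = ((err['p'] == row['p'])
                & (err['errd'] >= row['scrd'])
                & (err['errq'] - row['scrq'] >= 0))
        candidates = err.index[mask]
        if len(candidates):
            first = candidates[0]
            err.loc[first, 'errq'] -= row['scrq']   # consume the matched quantity
            matches.append({'report': row['n'], 'match_err_scr': row['p']})
    return pd.DataFrame(matches)

# report = greedy_match(err, scr)   # err and scr as defined in the question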
Melt dataframe based on condition
d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df = pd.DataFrame(d)

The current melt function is:

df2 = df.melt(id_vars=['key'], var_name='letter', value_name='Bool')
df2 = df2.query('Bool == True')

Is there a way to incorporate that 'True' condition in the melt function? As I continue to add entries to my df, and I have hundreds of columns, I assume it's much less costly to pull only the values I need instead of melting the entire df and then filtering. Any ideas?
Use pd.melt on a frame in which False has been replaced with NaN, then drop those rows with dropna():

pd.melt(df.replace(False, np.nan), id_vars=['key'], var_name='letter', value_name='Bool').dropna()

   key letter  Bool
0    1      a  True
1    2      a  True
5    3      b  True
You can filter the non-key columns first, melt the results and concat the melted rows back. See the following:

import pandas as pd
import numpy as np
import time

d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df = pd.DataFrame(d)

start_time = time.time()

key_column_name = 'key'
key_column_loc = list(df.columns).index(key_column_name)
filtered_frame = None
for letter in [s for s in list(df.columns) if s != key_column_name]:
    true_booleans = np.nonzero(df[letter].values)[0]
    melted_df = df.iloc[true_booleans][[key_column_name, letter]].reset_index(drop=True).melt(id_vars=[key_column_name], var_name='letter', value_name='Bool')
    if filtered_frame is None:
        filtered_frame = melted_df
    else:
        filtered_frame = pd.concat((filtered_frame, melted_df), axis=0)

end_time = time.time()
print(filtered_frame, '\n\n', end_time - start_time, 'seconds!')

Output:

   key letter  Bool
0    1      a  True
1    2      a  True
0    3      b  True

 0.011133432388305664 seconds!

Compared to your code it is slower (your score is 0.008090734481811523 seconds!); however, as the rows increase, I would expect the above way of doing it to be more efficient. Looking forward to the results.

Regarding the discussion on speed (benchmarks):

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

# Benchmark tests
d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df_initial = pd.DataFrame(d)

data_size = [10, 100, 10000, 50000, 100000, 500000, 1000000, 5000000, 10000000, 50000000]
scores_current = []
scores_golden_lion = []
scores_sammywemmy = []
scores_wwnde = []
scores_slybot = []

for n_rows in data_size:
    df = df_initial.sample(n=n_rows, replace=True).reset_index(drop=True)

    # Current method
    start_time = time.time()
    df_current = df.melt(id_vars=['key'], var_name='letter', value_name='Bool')
    df_current = df_current.query('Bool == True')
    end_time = time.time()
    scores_current.append(end_time - start_time)

    # Golden Lion
    start_time = time.time()
    df_golden_lion = df.melt(id_vars=['key'], var_name='letter', value_name='Boolean')
    df_golden_lion = df_golden_lion.drop(df_golden_lion.index[df_golden_lion['Boolean'] == False])
    end_time = time.time()
    scores_golden_lion.append(end_time - start_time)

    # sammywemmy
    start_time = time.time()
    box = df.iloc[:, 1:]
    len_df = len(df)
    letters = np.tile(box.columns, (len_df, 1))[box]
    df_sammywemmy = pd.DataFrame({'key': df.key.array, 'letter': letters, 'Bool': [True]*len_df})
    end_time = time.time()
    scores_sammywemmy.append(end_time - start_time)

    # wwnde
    start_time = time.time()
    df_wwnde = pd.melt(df.replace(False, np.nan), id_vars=['key'], var_name='letter', value_name='Bool').dropna()
    end_time = time.time()
    scores_wwnde.append(end_time - start_time)

    # Slybot
    start_time = time.time()
    key_column_name = 'key'
    key_column_loc = list(df.columns).index(key_column_name)
    filtered_frame = None
    for letter in [s for s in list(df.columns) if s != key_column_name]:
        true_booleans = np.nonzero(df[letter].values)[0]
        melted_df = df.iloc[true_booleans][[key_column_name, letter]].melt(id_vars=[key_column_name], var_name='letter', value_name='Bool')
        if filtered_frame is None:
            filtered_frame = melted_df
        else:
            filtered_frame = pd.concat((filtered_frame, melted_df), axis=0)
    end_time = time.time()
    scores_slybot.append(end_time - start_time)

plt.plot(data_size, scores_current, label="Current method")
plt.plot(data_size, scores_golden_lion, label="Golden Lion")
plt.plot(data_size, scores_sammywemmy, label="sammywemmy")
plt.plot(data_size, scores_wwnde, label="wwnde")
plt.plot(data_size, scores_slybot, label="Slybot")
plt.legend()
plt.show()

Interesting to see that none of the other answers can beat the originally suggested method with a dataset of 500,000 rows! Until 200,000 rows, sammywemmy's method is a clear winner, though.
The melt and filter step is efficient, though; I'd probably stick with loc instead of query, especially if your data is not that large (< 200_000 rows).

Another option is to skip melt, use numpy, and build a new dataframe:

box = df.iloc[:, 1:]
len_df = len(df)
letters = np.tile(box.columns, (len_df, 1))[box]
pd.DataFrame({'key': df.key.array, 'letter': letters, 'Bool': [True]*len_df})

   key letter  Bool
0    1      a  True
1    2      a  True
2    3      b  True
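As a small illustration of the loc suggestion above (a sketch using the question's example frame; the variable names are just for illustration):

import pandas as pd

d = {'key': [1, 2, 3], 'a': [True, True, False], 'b': [False, False, True]}
df = pd.DataFrame(d)

melted = df.melt(id_vars=['key'], var_name='letter', value_name='Bool')
filtered = melted.loc[melted['Bool']]   # the boolean column itself serves as the row mask
print(filtered)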
melt moves column data and stacks it vertically, resulting in two columns: the variable name of the column being stacked and the value column.

d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True], 'c': ['Batchelor','Masters','Doctorate']}
df = pd.DataFrame(d)
df2 = df.melt(id_vars=['key'], var_name='letter', value_name='Boolean')
df2 = df2.drop(df2.index[df2['Boolean'] == False])
print(df2)

output

   key letter    Boolean
0    1      a       True
1    2      a       True
5    3      b       True
6    1      c  Batchelor
7    2      c    Masters
8    3      c  Doctorate
Compare three columns and choose the highest
I have a dataset that looks like the image below, and my goal is to compare the last three columns and choose the highest one for each row. I have four new variables: empty = 0, cancel = 0, release = 0, undetermined = 0. For index 0, the cancelCount is the highest, therefore cancel += 1. undetermined is increased only if the three values are the same. Here is my failed code sample:

empty = 0
cancel = 0
release = 0
undetermined = 0

if (df["emptyCount"] > df["cancelcount"]) & (df["emptyCount"] > df["releaseCount"]):
    empty += 1
elif (df["cancelcount"] > df["emptyCount"]) & (df["cancelcount"] > df["releaseCount"]):
    cancel += 1
elif (df["releasecount"] > df["emptyCount"]) & (df["releasecount"] > df["emptyCount"]):
    release += 1
else:
    undetermined += 1

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
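The ValueError itself comes from the fact that df["emptyCount"] > df["cancelcount"] is a whole boolean Series (one value per row), while Python's if needs a single True/False. A minimal sketch illustrating this, with made-up numbers:

import pandas as pd

df = pd.DataFrame({'emptyCount': [2, 4], 'cancelcount': [3, 7], 'releaseCount': [2, 0]})

mask = df['emptyCount'] > df['cancelcount']   # a Series of booleans, one per row
print(mask)
# `if mask:` would raise the ValueError quoted above, because a Series has no single truth value;
# scalar reductions such as mask.any() or mask.all() are what `if` can evaluate.
print(mask.any(), mask.all())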
First we find the undetermined rows:

equal = (df['emptyCount'] == df['cancelcount']) | (df['cancelcount'] == df['releaseCount'])

Then we find the max column of the determined rows:

max_arg = df.loc[~equal, ['emptyCount', 'cancelcount', 'releaseCount']].idxmax(axis=1)

And count them:

undetermined = equal.sum()
empty = (max_arg == 'emptyCount').sum()
cancel = (max_arg == 'cancelcount').sum()
release = (max_arg == 'releaseCount').sum()
In general, you should avoid looping. Here's an example of vectorized code that does what you need:

# data of interest
s = df[['emptyCount', 'cancelCount', 'releaseCount']]

# maximum by rows
max_vals = s.max(1)

# those that are equal to the max values:
equal_max = df.eq(max_vals, axis='rows').astype(int)

# whether there is a single maximum along the rows:
single_max = equal_max.sum(1) == 1

# the values:
equal_max.mul(single_max, axis='rows').sum()

The output would be a Series that looks like this:

emptyCount      count1
cancelCount     count2
releaseCount    count3
dtype: int64
import pandas as pd
import numpy as np

class thing(object):
    def __init__(self):
        self.value = 0

empty, cancel, release, undetermined = [thing() for i in range(4)]
dictt = {0: empty, 1: cancel, 2: release, 3: undetermined}

df = pd.DataFrame({
    'emptyCount': [2,4,5,7,3],
    'cancelCount': [3,7,8,11,2],
    'releaseCount': [2,0,0,5,3],
})

for i in range(1, 4):
    series = df.iloc[-4+i]
    for j in range(len(series)):
        if series[j] == series.max():
            dictt[j].value += 1

cancel.value
A small script to get the maximum values:

import numpy as np

emptyCount = [2,4,5,7,3]
cancelCount = [3,7,8,11,2]
releaseCount = [2,0,0,5,3]

# Here we use np.where to count instances where there is more than one index with the max value.
# np.where returns a tuple, so we flatten it using "for n in m".
count = [n for z in zip(emptyCount, cancelCount, releaseCount)
         for m in np.where(np.array(z) == max(z)) for n in m]

empty = count.count(0)    # 1
cancel = count.count(1)   # 4
release = count.count(2)  # 1
How to store the results of each iteration of for loop in a dataframe
cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]

for i in range(len(Germandata)):
    for percentage in percentage_list:
        columns_n = 3
        random_columns = np.random.choice(cols, columns_n, replace=False)
        local_data = Germandata.copy()
        remove_n = int(round(local_data.shape[0] * percentage, 0))
        for column_name in random_columns:
            drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
            local_data.loc[drop_indices, column_name] = np.nan

The code here selects columns at random, deletes a certain percentage of observations from the data and replaces them with NaNs. The problem is that after running the loop I only get the dataframe for the final percentage in the percentage list, because it is overwritten on each iteration. How can I store the dataframe with NaNs after each iteration? Ideally I should get three dataframes, each with a different percentage of the data deleted.
Try this:

df_list = []
cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]

for percentage in percentage_list:
    columns_n = 3
    random_columns = np.random.choice(cols, columns_n, replace=False)
    local_data = Germandata.copy()
    remove_n = int(round(local_data.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
        local_data.loc[drop_indices, column_name] = np.nan
    local_data['percentage'] = percentage  # optional
    df_list.append(local_data)

df_05 = df_list[0]
df_01 = df_list[1]
df_1 = df_list[2]

Alternatively, you can use a dictionary:

df_dict = {}
cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]

for percentage in percentage_list:
    columns_n = 3
    random_columns = np.random.choice(cols, columns_n, replace=False)
    local_data = Germandata.copy()
    remove_n = int(round(local_data.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
        local_data.loc[drop_indices, column_name] = np.nan
    local_data['percentage'] = percentage  # optional
    df_dict[str(percentage)] = local_data

df_05 = df_dict['0.05']
df_01 = df_dict['0.01']
df_1 = df_dict['0.1']
Counting data within ranges in csv
I have some data which I need to break down into manageable chunks. With the following data I need to count the number of times x occurs in column 11 with column 7 being a 1, and how many times x occurs in column 11 overall. I need to put them into the first line of a csv. After that I need to count the same thing, but with column 11 falling into the following brackets: 0, ">0 but <0.05", ">0.05 but <0.10", ">0.1 but <0.15", ... all the way up to 1.00. All of these would ideally be appended to the same new.csv, i.e. not to the main data csv.

Some example raw data that fits the above description (please note a lot of the brackets will contain no data, in which case they would need to return 0,0):

01/01/2002,Data,class1,4,11yo+,4,1,George Smith,0,0,x
01/01/2002,Data,class1,4,11yo+,4,2,Ted James,0,0,x
01/01/2002,Data,class1,4,11yo+,4,3,Emma Lilly,0,0,x
01/01/2002,Data,class1,4,11yo+,4,5,George Smith,0,0,x
02/01/2002,Data,class2,4,10yo+,6,4,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,2,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,5,George Smith,1,2,0.5
02/01/2002,Data,class2,4,10yo+,6,3,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,1,Emma Lilly,0,1,0
02/01/2002,Data,class2,4,10yo+,6,6,George Smith,1,2,0.5
03/01/2002,Data,class3,4,10yo+,6,6,Ted James,0,1,0
03/01/2002,Data,class3,4,10yo+,6,3,Tom Phillips,0,3,0
03/01/2002,Data,class3,4,10yo+,6,2,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,4,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,1,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,5,Tom Phillips,0,3,0
04/01/2002,Data,class4,2,10yo+,5,3,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,1,Ted James,0,2,0
04/01/2002,Data,class4,2,10yo+,5,2,George Smith,2,7,0.285714286
04/01/2002,Data,class4,2,10yo+,5,4,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,5,Tom Phillips,0,5,0
05/01/2002,Data,class5,4,11yo+,4,1,George Smith,2,8,0.25
05/01/2002,Data,class5,4,11yo+,4,2,Ted James,1,3,0.333333333
05/01/2002,Data,class5,4,11yo+,4,3,Emma Lilly,1,4,0.25
05/01/2002,Data,class5,4,11yo+,4,5,George Smith,2,8,0.25
06/01/2002,Data,class6,4,10yo+,6,4,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,2,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,5,George Smith,3,10,0.3
06/01/2002,Data,class6,4,10yo+,6,3,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,1,Emma Lilly,1,5,0.2
06/01/2002,Data,class6,4,10yo+,6,6,George Smith,3,10,0.3
07/01/2002,Data,class7,4,10yo+,6,6,Ted James,1,4,0.25
07/01/2002,Data,class7,4,10yo+,6,3,Tom Phillips,0,9,0
07/01/2002,Data,class7,4,10yo+,6,2,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,4,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,1,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,5,Tom Phillips,0,9,0
08/01/2002,Data,class8,2,10yo+,5,3,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,1,Ted James,1,5,0.2
08/01/2002,Data,class8,2,10yo+,5,2,George Smith,4,15,0.266666667
08/01/2002,Data,class8,2,10yo+,5,4,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,5,Tom Phillips,0,11,0
09/01/2002,Data,class9,4,11yo+,4,1,George Smith,4,16,0.25
09/01/2002,Data,class9,4,11yo+,4,2,Ted James,2,6,0.333333333
09/01/2002,Data,class9,4,11yo+,4,3,Emma Lilly,2,8,0.25
09/01/2002,Data,class9,4,11yo+,4,5,George Smith,4,16,0.25
10/01/2002,Data,class10,4,10yo+,6,4,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,2,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,5,George Smith,5,18,0.277777778
10/01/2002,Data,class10,4,10yo+,6,3,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,1,Emma Lilly,2,9,0.222222222
10/01/2002,Data,class10,4,10yo+,6,6,George Smith,5,18,0.277777778
11/01/2002,Data,class11,4,10yo+,6,6,Ted James,2,7,0.285714286
11/01/2002,Data,class11,4,10yo+,6,3,Tom Phillips,0,15,0
11/01/2002,Data,class11,4,10yo+,6,2,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,4,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,1,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,5,Tom Phillips,0,15,0
12/01/2002,Data,class12,2,10yo+,5,3,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,1,Ted James,2,8,0.25
12/01/2002,Data,class12,2,10yo+,5,2,George Smith,6,23,0.260869565
12/01/2002,Data,class12,2,10yo+,5,4,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,5,Tom Phillips,0,17,0
13/01/2002,Data,class13,4,11yo+,4,1,George Smith,6,24,0.25
13/01/2002,Data,class13,4,11yo+,4,2,Ted James,3,9,0.333333333
13/01/2002,Data,class13,4,11yo+,4,3,Emma Lilly,3,12,0.25
13/01/2002,Data,class13,4,11yo+,4,5,George Smith,6,24,0.25
14/01/2002,Data,class14,4,10yo+,6,4,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,2,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,5,George Smith,7,26,0.269230769
14/01/2002,Data,class14,4,10yo+,6,3,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,1,Emma Lilly,3,13,0.230769231
14/01/2002,Data,class14,4,10yo+,6,6,George Smith,7,26,0.269230769
15/01/2002,Data,class15,4,10yo+,6,6,Ted James,3,10,0.3

If anybody can help me achieve this I will be truly grateful. If this requires more detail, please ask. One last note: the main data csv in question has 800k rows.

EDIT

Currently the output file appears as follows, using the code supplied by @user650654:

data1,data2

If at all possible I would like the code changed slightly to output two more things. Hopefully these are not too difficult to do. Proposed changes to the output file (the comma-separated items below describe each output row):

title row labeling the row (e.g. "x" or "0:0.05"), calculated average of the values within each bracket (e.g. "0.02469"), data1, data2

So in reality it would probably look like this:

x,n/a,data1,data2
0:0.05,0.02469,data1,data2
0.05:0.1,0.5469,data1,data2
....
....

Column1 = row label (the data ranges that are being counted in the original question, i.e. from 0 to 0.05).
Column2 = calculated average of the values that fell within a particular range.
Note that data1 & data2 are the two values the question initially asked for.

Many thanks
AEA
Here is a solution for adding the two new fields:

import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    bins = numpy.arange(0, 1.05, 0.05)
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []
    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as, say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)

    all_array = numpy.array(all_array)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)
    bin_ranges = ['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])]
    digitized = numpy.digitize(all_array, bins)
    bin_means = [all_array[digitized == i].mean() if hist_all[i - 1] else 'n/a'
                 for i in range(1, len(bins))]

    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow(['x', 'n/a', col7one_x, total_x])
        writer.writerow(['0', 0 if total_zeros else 'n/a', col7one_zeros, total_zeros])
        for row in zip(bin_ranges, bin_means, hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
This might work:

import numpy as np
import pandas as pd

# names to be used as column labels. If no names are specified then columns can be
# referred to by number, e.g. df[0], df[1] etc.
column_names = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11']

# header=None means there are no column headings in the csv file
df = pd.read_csv('data.csv', header=None, names=column_names)

# trick so that 'x' rows will be grouped into a category >-0.1 and <=-0.05.
# This allows all of col11 to be treated as numbers.
df.loc[df.col11 == 'x', 'col11'] = -0.08

# bins to put col11 values in. >-0.1 and <=-0.05 will be our special 'x' rows,
# >-0.05 and <=0 will capture all the '0' values.
bins = np.arange(-0.1, 1.0, 0.05)

# create labels for the bins
labels = np.array(['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])])
labels[0] = 'x'  # change first bin label to 'x'
labels[1] = '0'  # change second bin label to '0'

# convert col11 to numbers so we can do math on them
df['col11'] = df['col11'].astype(float)

# make another column 'bin' holding an integer representing which bin the number falls into.
# Later we'll map the integer to the bin label.
df['bin'] = pd.cut(df['col11'], bins=bins, labels=False)

# groupby is meant to run faster with an index
df.set_index('bin', inplace=True, drop=False, append=False)

def count_ones(x):
    """aggregate function to count values that equal 1"""
    return np.sum(x == 1)

# group by the bin number and apply aggregate functions to the specified columns
dfg = df[['bin','col7','col11']].groupby('bin').agg({'col11': [np.mean], 'col7': [count_ones, len]})
dfg.index = labels[dfg.index]  # apply labels to the bin numbers
dfg.loc['x', ('col11', 'mean')] = 'N/A'  # mean of the 'x' rows is meaningless

print(dfg)
dfg.to_csv('new.csv')

which gave me:

                col7           col11
          count_ones  len       mean
x                  1    7        N/A
0                  2   21          0
0.15:0.2           2    2        0.2
0.2:0.25           9   22  0.2478632
0.25:0.3           0   13  0.2840755
0.3:0.35           0    5  0.3333333
0.45:0.5           0    4        0.5
This solution uses numpy.histogram. See below.

import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []
    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as, say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)

    bins = numpy.arange(0, 1.05, 0.05)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)

    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow([col7one_x, total_x])
        writer.writerow([col7one_zeros, total_zeros])
        for row in zip(hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()