Python pandas: find next higher and next lower number

I want to find the next value in 'y' relative to my val_test. In this example I want
next_up = 4
next_down = 6
(up and down refer to the coordinate system, not the number value)
import numpy as np
import pandas as pd

val_test = 5
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})
df = df.sort_values('y', ascending=True)
next_up = int(df.y.iloc[np.searchsorted(df.y.values, [val_test])])
df = df.sort_values('y', ascending=False)
next_down = int(df.y.iloc[np.searchsorted(df.y.values, [val_test])])
print('next_up = ', next_up)
print('next_down = ', next_down)

Perhaps you could stay only within Pandas.
There is no need to sort twice or use Numpy.
import pandas as pd
val_test = 5
col = 'y'
df = pd.DataFrame({
    'x': [1,1,1,2,2,3,3,3,4,4,4],
    'y': [1,2,3,4,5,6,7,15,21,8,9]})
# the default for sort_values is ascending=True
# create a sorted dataframe in which the index corresponds to the ascending order
df_sorted = df.sort_values(col).reset_index(drop=True)
# find the index in the sorted dataframe that corresponds to val_test
idx_val_test = df_sorted.loc[df_sorted[col] == val_test].index[0]
# select the next value up in the sorted dataframe
next_up = df_sorted[col].iloc[idx_val_test + 1]
# select the next value down in the sorted dataframe
next_down = df_sorted[col].iloc[idx_val_test - 1]
print('next_up = ', next_up)
print('next_down = ', next_down)

I can suggest 2 cases depending on what you need, for example:
CASE 1: You can validate the real position, keeping in mind the boundaries
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})
val_test = 9
# assumption: lista holds the 'y' values in their original (unsorted) order,
# which is what reproduces the output shown below
lista = df['y'].tolist()
for i in range(len(lista)):
    if i == 0:
        if lista[i] == val_test:
            next_up = lista[i+1]
            next_down = 'First number'
    elif i != 0:
        if lista[i] == val_test:
            try:
                next_up = lista[i+1]
            except IndexError:
                next_up = 'Last number'
            try:
                next_down = lista[i-1]
            except IndexError:
                next_down = 'First number'
            break
print('next_up = ', next_up)
print('next_down = ',next_down)
OUTPUT CASE #1
val_test = 9
next_up = Last number
next_down = 8
# but if you use
val_test = 5
next_up = 6
next_down = 4
Cool !!
CASE 2: You can have problems with the first and last index, although you can use
val_test = 15
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})
next_up = int(df.y.iloc[np.searchsorted(df['y'].values,[val_test])+1])
next_down = int(df.y.iloc[np.searchsorted(df['y'].values,[val_test])-1])
print('next_up = ', next_up)
print('next_down = ',next_down)
OUTPUT CASE #2
val_test = 15
next_up = 21
next_down = 7
But you can have this problem when using val_test=9
val_test = 9
next_up = 21
next_down = 7
😓
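If the boundary cases matter, a minimal sketch that sorts once and guards both ends could look like this (the neighbours are named next_smaller/next_larger to stay agnostic about which one you call "up", it assumes val_test is present in 'y', and it returns None when there is no neighbour):
import numpy as np
import pandas as pd

val_test = 9
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})

y_sorted = np.sort(df['y'].values)
pos = np.searchsorted(y_sorted, val_test)   # position of val_test in the sorted values
next_larger = y_sorted[pos + 1] if pos + 1 < len(y_sorted) else None
next_smaller = y_sorted[pos - 1] if pos > 0 else None
print(next_larger, next_smaller)            # 15 8 for val_test = 9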

Related

Merge / Loop thru data

Hello, I need to get insights from data. The desired_result comes from VBA code that compares two sheets; desired_result is checked and 100% accurate. If someone can assist me to get the desired output, the conditions are:
err['p'] == scr['p'] & err['errd'] >= scr['scrd'] & err['errq'] - scr['scrq'] >= 0
It's all about checking how many of scr['n'] went through err, but if one passes through err then err['errq'] -= scr['scrq'] and we jump to the next item. scr['p'] and scr['n'] are unique. Please see the sample code below:
import pandas as pd

err = pd.DataFrame({
    'p' : ['10100.A','10101.A','10101.A','10101.A','10102.A','10102.A','10102.A','10103.A','10103.A','10147.A','10147.A'],
    'errd' : ['18-5-2022','16-5-2022','4-5-2022','13-5-2022','9-5-2022','2-5-2022','29-5-2022','6-5-2022','11-5-2022','25-5-2022','6-5-2022'],
    'errq' : [1, 1, 1, 1, 1, 2, 46, 1, 4, 1, 5]})
err = err.sort_values('errd')

scr = pd.DataFrame({
    'p' : ['10101.A','10101.A','10101.A','10102.A','10102.A','10102.A','10103.A','10147.A','10147.A','10147.A','10147.A','10147.A'],
    'scrd' : ['10-5-2022','10-5-2022','9-5-2022','13-5-2022','9-5-2022','9-5-2022','25-5-2022','6-5-2022','6-5-2022','6-5-2022','6-5-2022','11-5-2022'],
    'scrq' : [1,1,1,1,1,1,1,1,1,1,1,1],
    'n' : ['7000000051481339','7000000051481342','7000000051722237','7000000052018581','7000000051721987','7000000051721990','7000000052725251','7000000051530150','7000000051530152','7000000051530157','7000000051546193','7000000051761150']})

desired_result = pd.DataFrame({
    'report' : ['7000000051722237','7000000051481339','7000000051721987','7000000051721990','7000000052018581','7000000051530150','7000000051530152','7000000051530157','7000000051546193','7000000051761150'],
    'match_err_scr' : ['10101.A','10101.A','10102.A','10102.A','10102.A','10147.A','10147.A','10147.A','10147.A','10147.A']})
What I have tried so far:
match = []
# Iterating scr rows
for i, row in scr.iterrows():
    # Checking for a match; row is a full row of scr
    if row['scrq'] <= err[(err['p'] == row['p']) & (err['errd'] >= row['scrd'])]['errq'].sum():
        r = row.to_dict()
        match.append(r)
# Creating a new data frame
report = pd.DataFrame(match)
report
Merge left, filter later:
report1 = scr.merge(err, how = 'left', on = 'p')
flt = (report1['errd'] >= report1['scrd']) & (report1['errq'] - report1['scrq'] >= 0)
report1 = report1.loc[flt]
report1 = report1.drop_duplicates(subset = ['n'])
report1
Nested loop: way too slow and again not correct
match = []
for i, row in scr.iterrows():
    for e, erow in err.iterrows():
        if (row['p'] == erow['p']) & (erow['errd'] >= row['scrd']) & (erow['errq'] - row['scrq'] >= 0):
            err['errq'][e] -= row['scrq']
            row_to_dict = row.to_dict()
            match.append(row_to_dict)
            break
report2 = pd.DataFrame(match)
report2
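For reference, here is a minimal sketch of the greedy matching described above. It is not a verified answer: it assumes the dates are day-first strings and that each scr row should consume quantity from the earliest eligible err row.
import pandas as pd

def greedy_match(err, scr):
    err = err.copy()
    scr = scr.copy()
    err['errd'] = pd.to_datetime(err['errd'], dayfirst=True)
    scr['scrd'] = pd.to_datetime(scr['scrd'], dayfirst=True)
    err = err.sort_values('errd')              # try the earliest error date first
    remaining = err['errq'].to_dict()          # err index -> remaining quantity
    matches = []
    for _, row in scr.iterrows():
        eligible = err[(err['p'] == row['p']) & (err['errd'] >= row['scrd'])]
        for idx in eligible.index:
            if remaining[idx] >= row['scrq']:
                remaining[idx] -= row['scrq']  # err['errq'] -= scr['scrq']
                matches.append({'report': row['n'], 'match_err_scr': row['p']})
                break                          # jump to the next scr row
    return pd.DataFrame(matches)

report3 = greedy_match(err, scr)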
Not an answer, but required to help understand the question.
#B02T, this is what I am seeing as a slice of the data.
So am I correct that you are only comparing scr.loc[0] to err.loc[3], scr.loc[1] to err.loc[1] and scr.loc[2] to err.loc[2]? Or are you comparing each row in scr to each row in err?
Looking at the desired_result, I don't understand how scr.loc[2] could be in the desired_result since, using err.loc[2], (err['errd'] >= scr['scrd']) evaluates to False. And, following the same methodology, scr.loc[1] should be in desired_result.
>>> err[err['p'] == '10101.A']
         p       errd  errq
3  10101.A  13-5-2022     1
1  10101.A  16-5-2022     1
2  10101.A   4-5-2022     1
>>> scr[scr['p'] == '10101.A']
         p       scrd  scrq                 n
0  10101.A  10-5-2022     1  7000000051481339
1  10101.A  10-5-2022     1  7000000051481342
2  10101.A   9-5-2022     1  7000000051722237
>>> desired_result
             report match_err_scr
0  7000000051722237       10101.A
1  7000000051481339       10101.A
2  7000000051721987       10102.A
3  7000000051721990       10102.A
4  7000000052018581       10102.A
5  7000000051530150       10147.A
6  7000000051530152       10147.A
7  7000000051530157       10147.A
8  7000000051546193       10147.A
9  7000000051761150       10147.A

Melt dataframe based on condition

d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df = pd.DataFrame(d)
Current melt function is:
df2 = df.melt(id_vars=['key'], var_name = 'letter', value_name = 'Bool')
df2 = df2.query('Bool == True')
Is there a way to incorporate that 'True' condition in the melt function? As I continue to add entries to my df and I have hundreds of columns, I assume it's much less costly to pull only the values I need instead of melting the entire df and then filtering. Any ideas?
Use pd.melt instead. Factor in replacement of False with NaN and dropna() eventually.
pd.melt(df.replace(False, np.nan), id_vars=['key'],var_name = 'letter', value_name = 'Bool').dropna()
   key letter  Bool
0    1      a  True
1    2      a  True
5    3      b  True
You can filter the non-key cols first, melt the results and concat the melted rows back. See the following:
import pandas as pd
import numpy as np
import time

d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df = pd.DataFrame(d)

start_time = time.time()
key_column_name = 'key'
key_column_loc = list(df.columns).index(key_column_name)
filtered_frame = None
for letter in [s for s in list(df.columns) if s != key_column_name]:
    true_booleans = np.nonzero(df[letter].values)[0]
    melted_df = df.iloc[true_booleans][[key_column_name, letter]].reset_index(drop=True).melt(id_vars=[key_column_name], var_name='letter', value_name='Bool')
    if filtered_frame is None:
        filtered_frame = melted_df
    else:
        filtered_frame = pd.concat((filtered_frame, melted_df), axis=0)
end_time = time.time()
print(filtered_frame, '\n\n', end_time - start_time, 'seconds!')
Output
   key letter  Bool
0    1      a  True
1    2      a  True
0    3      b  True

0.011133432388305664 seconds!
Compared to your code it is slower (your score is 0.008090734481811523 seconds!), however as the rows increase I would expect the above approach to be more efficient. Looking forward to the results.
Regarding the discussion on speed (Benchmarks)
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

# Benchmark Tests
d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df_initial = pd.DataFrame(d)
data_size = [10, 100, 10000, 50000, 100000, 500000, 1000000, 5000000, 10000000, 50000000]
scores_current = []
scores_golden_lion = []
scores_sammywemmy = []
scores_wwnde = []
scores_slybot = []
for n_rows in data_size:
    df = df_initial.sample(n=n_rows, replace=True).reset_index(drop=True)

    ## Current method
    start_time = time.time()
    df_current = df.melt(id_vars=['key'], var_name='letter', value_name='Bool')
    df_current = df_current.query('Bool == True')
    end_time = time.time()
    scores_current.append(end_time - start_time)

    ## Golden Lion
    start_time = time.time()
    df_golden_lion = df.melt(id_vars=['key'], var_name='letter', value_name='Boolean')
    df_golden_lion = df_golden_lion.drop(df_golden_lion.index[df_golden_lion['Boolean'] == False])
    end_time = time.time()
    scores_golden_lion.append(end_time - start_time)

    ## sammywemmy
    start_time = time.time()
    box = df.iloc[:, 1:]
    len_df = len(df)
    letters = np.tile(box.columns, (len_df, 1))[box]
    df_sammywemmy = pd.DataFrame({'key': df.key.array,
                                  'letter': letters,
                                  'Bool': [True]*len_df})
    end_time = time.time()
    scores_sammywemmy.append(end_time - start_time)

    ## wwnde
    start_time = time.time()
    df_wwnde = pd.melt(df.replace(False, np.nan), id_vars=['key'], var_name='letter', value_name='Bool').dropna()
    end_time = time.time()
    scores_wwnde.append(end_time - start_time)

    ## Slybot
    start_time = time.time()
    key_column_name = 'key'
    key_column_loc = list(df.columns).index(key_column_name)
    filtered_frame = None
    for letter in [s for s in list(df.columns) if s != key_column_name]:
        true_booleans = np.nonzero(df[letter].values)[0]
        melted_df = df.iloc[true_booleans][[key_column_name, letter]].melt(id_vars=[key_column_name], var_name='letter', value_name='Bool')
        if filtered_frame is None:
            filtered_frame = melted_df
        else:
            filtered_frame = pd.concat((filtered_frame, melted_df), axis=0)
    end_time = time.time()
    scores_slybot.append(end_time - start_time)

plt.plot(data_size, scores_current, label="Current method")
plt.plot(data_size, scores_golden_lion, label="Golden Lion")
plt.plot(data_size, scores_sammywemmy, label="sammywemmy")
plt.plot(data_size, scores_wwnde, label="wwnde")
plt.plot(data_size, scores_slybot, label="Slybot")
plt.legend()
plt.show()
Interesting to see that none of the other answers can beat the originally suggested method with a dataset of 500,000 rows! Up to 200,000 rows, though, sammywemmy's method is a clear winner.
The melt and filter step is efficient though, I'd probably stick with loc instead of query, especially if your data is not that large (<200_000 rows)
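For example, a loc-based variant of the original melt-and-filter step would be something like:
df2 = df.melt(id_vars=['key'], var_name='letter', value_name='Bool')
df2 = df2.loc[df2['Bool']]   # keep only the True rows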
Another option is to skip melt, use numpy, and build a new dataframe:
box = df.iloc[:, 1:]
len_df = len(df)
letters = np.tile(box.columns, (len_df,1))[box]
pd.DataFrame({'key': df.key.array,
              'letter': letters,
              'Bool': [True]*len_df})
   key letter  Bool
0    1      a  True
1    2      a  True
2    3      b  True
melt moves column data and stacks it vertically resulting in two columns: the variable name of the column being stacked and the value column name.
d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True],'c':['Batchelor','Masters','Doctorate']}
df = pd.DataFrame(d)
df2 = df.melt(id_vars=['key'], var_name = 'letter', value_name = 'Boolean')
df2=df2.drop(df2.index[df2['Boolean'] == False])
print(df2)
output
   key letter    Boolean
0    1      a       True
1    2      a       True
5    3      b       True
6    1      c  Batchelor
7    2      c    Masters
8    3      c  Doctorate

Compare three columns and choose the highest

I have a dataset that looks like the image below, and my goal is to compare the last three columns and choose the highest each time.
I have four new variables: empty = 0, cancel = 0, release = 0, undetermined = 0.
For index 0, the cancelCount is the highest, therefore cancel += 1. The undetermined count is increased only if the three values are the same.
Here is my failed code sample:
empty = 0
cancel = 0
release = 0
undetermined = 0
if (df["emptyCount"] > df["cancelcount"]) & (df["emptyCount"] > df["releaseCount"]):
empty += 1
elif (df["cancelcount"] > df["emptyCount"]) & (df["cancelcount"] > df["releaseCount"]):
cancel += 1
elif (df["releasecount"] > df["emptyCount"]) & (df["releasecount"] > df["emptyCount"]):
release += 1
else:
undetermined += 1
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
First we find the undetermined rows
equal = (df['emptyCount'] == df['cancelcount']) | (df['cancelcount'] == df['releaseCount'])
Then we find the max column of the determined rows
max_arg = df.loc[~equal, ['emptyCount', 'cancelcount', 'releaseCount']].idxmax(axis=1)
And count them
undetermined = equal.sum()
empty = (max_arg == 'emptyCount').sum()
cancel = (max_arg == 'cancelcount').sum()
release = (max_arg == 'releaseCount').sum()
In general, you should avoid looping. Here's an example of vectorized code that does what you need:
# data of interest
s = df[['emptyCount', 'cancelCount', 'releaseCount']]
# maximum by rows
max_vals = s.max(1)
# which values are equal to the row maximum:
equal_max = s.eq(max_vals, axis='rows').astype(int)
# rows where there is a single maximum:
single_max = equal_max.sum(1) == 1
# the counts:
equal_max.mul(single_max, axis='rows').sum()
Output would be a series that looks like this:
emptyCount      count1
cancelCount     count2
releaseCount    count3
dtype: int64
import pandas as pd
import numpy as np

class thing(object):
    def __init__(self):
        self.value = 0

empty, cancel, release, undetermined = [thing() for i in range(4)]
dictt = {0: empty, 1: cancel, 2: release, 3: undetermined}

df = pd.DataFrame({
    'emptyCount': [2,4,5,7,3],
    'cancelCount': [3,7,8,11,2],
    'releaseCount': [2,0,0,5,3],
})

for i in range(1,4):
    series = df.iloc[-4+i]
    for j in range(len(series)):
        if series[j] == series.max():
            dictt[j].value += 1

cancel.value
A small script to get the maximum values:
import numpy as np
emptyCount = [2,4,5,7,3]
cancelCount = [3,7,8,11,2]
releaseCount = [2,0,0,5,3]
# Here we use np.where to count instances where there is more than one index with the max value.
# np.where returns a tuple, so we flatten it using "for n in m"
count = [n for z in zip(emptyCount, cancelCount, releaseCount) for m in np.where(np.array(z) == max(z)) for n in m]
empty = count.count(0) # 1
cancel = count.count(1) # 4
release = count.count(2) # 1

How to store the results of each iteration of for loop in a dataframe

cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]
for i in range(len(Germandata)):
    for percentage in percentage_list:
        columns_n = 3
        random_columns = np.random.choice(cols, columns_n, replace=False)
        local_data = Germandata.copy()
        remove_n = int(round(local_data.shape[0] * percentage, 0))
        for column_name in random_columns:
            drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
            local_data.loc[drop_indices, column_name] = np.nan
The code here selects columns at random, deletes a certain percentage of observations from the data and replaces them with NaNs. The problem is that after running the loop I only get the dataframe for the final percentage in the list, because it is overwritten on each iteration. How can I store the dataframe with NaNs after each iteration? Ideally I should get three dataframes with different percentages of data deleted.
Try this
df_list = []
cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]
for percentage in percentage_list:
    columns_n = 3
    random_columns = np.random.choice(cols, columns_n, replace=False)
    local_data = Germandata.copy()
    remove_n = int(round(local_data.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
        local_data.loc[drop_indices, column_name] = np.nan
    local_data['percentage'] = percentage  # optional
    df_list.append(local_data)

df_05 = df_list[0]
df_01 = df_list[1]
df_1 = df_list[2]
Alternatively, you can use a dictionary
df_dict = {}
cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]
for percentage in percentage_list:
    columns_n = 3
    random_columns = np.random.choice(cols, columns_n, replace=False)
    local_data = Germandata.copy()
    remove_n = int(round(local_data.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
        local_data.loc[drop_indices, column_name] = np.nan
    local_data['percentage'] = percentage  # optional
    df_dict[str(percentage)] = local_data

df_05 = df_dict['0.05']
df_01 = df_dict['0.01']
df_1 = df_dict['0.1']

Counting data within ranges in csv

I have some data which I need to break down into manageable chunks. With the following data I need to count the number of times x occurs in column 11 with column 7 being a 1, and how many times x occurs in column 11 in total. I need to put these two counts on the first line of a csv. After that I need to count the same thing, but with column 11 falling into the following brackets:
0
">0 but <0.05"
">0.05 but <0.10"
">0.1 but <0.15... all the way up to 1.00"
All of these would ideally be appended to the same new.csv, i.e. not the main data csv.
Some example raw data that fits the above description (please note a lot of the brackets will contain no data, in which case they would need to return 0,0):
01/01/2002,Data,class1,4,11yo+,4,1,George Smith,0,0,x
01/01/2002,Data,class1,4,11yo+,4,2,Ted James,0,0,x
01/01/2002,Data,class1,4,11yo+,4,3,Emma Lilly,0,0,x
01/01/2002,Data,class1,4,11yo+,4,5,George Smith,0,0,x
02/01/2002,Data,class2,4,10yo+,6,4,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,2,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,5,George Smith,1,2,0.5
02/01/2002,Data,class2,4,10yo+,6,3,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,1,Emma Lilly,0,1,0
02/01/2002,Data,class2,4,10yo+,6,6,George Smith,1,2,0.5
03/01/2002,Data,class3,4,10yo+,6,6,Ted James,0,1,0
03/01/2002,Data,class3,4,10yo+,6,3,Tom Phillips,0,3,0
03/01/2002,Data,class3,4,10yo+,6,2,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,4,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,1,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,5,Tom Phillips,0,3,0
04/01/2002,Data,class4,2,10yo+,5,3,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,1,Ted James,0,2,0
04/01/2002,Data,class4,2,10yo+,5,2,George Smith,2,7,0.285714286
04/01/2002,Data,class4,2,10yo+,5,4,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,5,Tom Phillips,0,5,0
05/01/2002,Data,class5,4,11yo+,4,1,George Smith,2,8,0.25
05/01/2002,Data,class5,4,11yo+,4,2,Ted James,1,3,0.333333333
05/01/2002,Data,class5,4,11yo+,4,3,Emma Lilly,1,4,0.25
05/01/2002,Data,class5,4,11yo+,4,5,George Smith,2,8,0.25
06/01/2002,Data,class6,4,10yo+,6,4,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,2,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,5,George Smith,3,10,0.3
06/01/2002,Data,class6,4,10yo+,6,3,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,1,Emma Lilly,1,5,0.2
06/01/2002,Data,class6,4,10yo+,6,6,George Smith,3,10,0.3
07/01/2002,Data,class7,4,10yo+,6,6,Ted James,1,4,0.25
07/01/2002,Data,class7,4,10yo+,6,3,Tom Phillips,0,9,0
07/01/2002,Data,class7,4,10yo+,6,2,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,4,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,1,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,5,Tom Phillips,0,9,0
08/01/2002,Data,class8,2,10yo+,5,3,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,1,Ted James,1,5,0.2
08/01/2002,Data,class8,2,10yo+,5,2,George Smith,4,15,0.266666667
08/01/2002,Data,class8,2,10yo+,5,4,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,5,Tom Phillips,0,11,0
09/01/2002,Data,class9,4,11yo+,4,1,George Smith,4,16,0.25
09/01/2002,Data,class9,4,11yo+,4,2,Ted James,2,6,0.333333333
09/01/2002,Data,class9,4,11yo+,4,3,Emma Lilly,2,8,0.25
09/01/2002,Data,class9,4,11yo+,4,5,George Smith,4,16,0.25
10/01/2002,Data,class10,4,10yo+,6,4,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,2,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,5,George Smith,5,18,0.277777778
10/01/2002,Data,class10,4,10yo+,6,3,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,1,Emma Lilly,2,9,0.222222222
10/01/2002,Data,class10,4,10yo+,6,6,George Smith,5,18,0.277777778
11/01/2002,Data,class11,4,10yo+,6,6,Ted James,2,7,0.285714286
11/01/2002,Data,class11,4,10yo+,6,3,Tom Phillips,0,15,0
11/01/2002,Data,class11,4,10yo+,6,2,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,4,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,1,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,5,Tom Phillips,0,15,0
12/01/2002,Data,class12,2,10yo+,5,3,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,1,Ted James,2,8,0.25
12/01/2002,Data,class12,2,10yo+,5,2,George Smith,6,23,0.260869565
12/01/2002,Data,class12,2,10yo+,5,4,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,5,Tom Phillips,0,17,0
13/01/2002,Data,class13,4,11yo+,4,1,George Smith,6,24,0.25
13/01/2002,Data,class13,4,11yo+,4,2,Ted James,3,9,0.333333333
13/01/2002,Data,class13,4,11yo+,4,3,Emma Lilly,3,12,0.25
13/01/2002,Data,class13,4,11yo+,4,5,George Smith,6,24,0.25
14/01/2002,Data,class14,4,10yo+,6,4,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,2,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,5,George Smith,7,26,0.269230769
14/01/2002,Data,class14,4,10yo+,6,3,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,1,Emma Lilly,3,13,0.230769231
14/01/2002,Data,class14,4,10yo+,6,6,George Smith,7,26,0.269230769
15/01/2002,Data,class15,4,10yo+,6,6,Ted James,3,10,0.3
If anybody can help me achieve this I will be truly grateful. If this requires more detail please ask.
One last note: the main data csv in question has 800k rows.
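For what it's worth, a minimal pandas sketch of just those first two counts (assuming the csv has no header row, so columns are addressed by position) might look like:
import pandas as pd

df = pd.read_csv('data.csv', header=None)       # column 7 is index 6, column 11 is index 10
is_x = df[10] == 'x'
col7_is_one = df[6] == 1
print((is_x & col7_is_one).sum(), is_x.sum())   # x count with column 7 == 1, total x count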
EDIT
Currently the output file appears as follows using the code supplied by #user650654:
data1,data2
If at all possible I would like the code changed slightly to output two more things. Hopefully these are not too difficult to do. Proposed changes to the output file (commas separate the fields in each row):
title label for the row (e.g. "x" or "0:0.05"), calculated average of the values within each bracket (e.g. "0.02469"), data1, data2
So in reality it would probably look like this:
x,n/a,data1,data2
0:0.05,0.02469,data1,data2
0.05:0.1,0.5469,data1,data2
....
....
Column1 = Row label (the data ranges that are being counted in the original question, i.e. from 0 to 0.05)
Column2 = Calculated average of values that fell within a particular range
Note that data1 & data2 are the two values the question initially asked for.
Many thanks AEA
Here is a solution for adding the two new fields:
import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    bins = numpy.arange(0, 1.05, 0.05)
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []
    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)
    all_array = numpy.array(all_array)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)
    bin_ranges = ['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])]
    digitized = numpy.digitize(all_array, bins)
    bin_means = [all_array[digitized == i].mean() if hist_all[i - 1] else 'n/a' for i in range(1, len(bins))]
    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow(['x', 'n/a', col7one_x, total_x])
        writer.writerow(['0', 0 if total_zeros else 'n/a', col7one_zeros, total_zeros])
        for row in zip(bin_ranges, bin_means, hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
This might work:
import numpy as np
import pandas as pd

column_names = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6',
                'col7', 'col8', 'col9', 'col10', 'col11']  # names to be used as column labels. If no names are specified then columns can be referred to by number, e.g. df[0], df[1] etc.
df = pd.read_csv('data.csv', header=None, names=column_names)  # header=None means there are no column headings in the csv file
df.ix[df.col11 == 'x', 'col11'] = -0.08  # trick so that 'x' rows will be grouped into a category >-0.1 and <=-0.05. This allows all of col11 to be treated as numbers
bins = np.arange(-0.1, 1.0, 0.05)  # bins to put col11 values in. >-0.1 and <=-0.05 will be our special 'x' rows, >-0.05 and <=0 will capture all the '0' values.
labels = np.array(['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])])  # create labels for the bins
labels[0] = 'x'  # change first bin label to 'x'
labels[1] = '0'  # change second bin label to '0'
df['col11'] = df['col11'].astype(float)  # convert col11 to numbers so we can do math on them
df['bin'] = pd.cut(df['col11'], bins=bins, labels=False)  # make another column 'bin' holding an integer for the bin the number falls into. Later we'll map the integer to the bin label
df.set_index('bin', inplace=True, drop=False, append=False)  # groupby is meant to run faster with an index

def count_ones(x):
    """aggregate function to count values that equal 1"""
    return np.sum(x == 1)

dfg = df[['bin','col7','col11']].groupby('bin').agg({'col11': [np.mean], 'col7': [count_ones, len]})  # group by the bin number and apply aggregate functions to the specified columns
dfg.index = labels[dfg.index]  # apply labels to bin numbers
dfg.ix['x', ('col11', 'mean')] = 'N/A'  # mean of 'x' rows is meaningless
print(dfg)
dfg.to_csv('new.csv')
which gave me
                col7           col11
          count_ones  len       mean
x                  1    7        N/A
0                  2   21          0
0.15:0.2           2    2        0.2
0.2:0.25           9   22  0.2478632
0.25:0.3           0   13  0.2840755
0.3:0.35           0    5  0.3333333
0.45:0.5           0    4        0.5
This solution uses numpy.histogram. See below.
import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []
    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)
    bins = numpy.arange(0, 1.05, 0.05)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)
    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow([col7one_x, total_x])
        writer.writerow([col7one_zeros, total_zeros])
        for row in zip(hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
