Here is my code
count = 0

def selectionSort(data):
    for index in range(len(data)):
        min = index
        count += 1
        # Find the index'th smallest element
        for scan in range(index + 1, len(data)):
            if (data[scan] < data[min]):
                min = scan
        if min != index:  # swap the elements
            data[index], data[min] = data[min], data[index]
    return data

data = selectionSort([3,4,5,2,6])
print(count, data)
Your code as-is should not run; you should get UnboundLocalError: local variable 'count' referenced before assignment.
To fix this, add the following to the top of selectionSort(data):
global count
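A minimal sketch of where that line goes (the rest of the function stays exactly as you wrote it):

count = 0

def selectionSort(data):
    global count   # rebind the module-level count instead of creating a new local
    for index in range(len(data)):
        min = index
        count += 1
        ...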
A better way is to scrap the global variable and return count alongside the sorted data:
def selectionSort(data):
    count = 0
    for index in range(len(data)):
        min = index
        count += 1
        # Find the index'th smallest element
        for scan in range(index + 1, len(data)):
            if (data[scan] < data[min]):
                min = scan
        if min != index:  # swap the elements
            data[index], data[min] = data[min], data[index]
    return count, data
count, data = selectionSort([3,4,5,2,6])
print(count, data)
Last but not least, you are counting something other than comparisons. I leave fixing that as an exercise for the reader.
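In case a hint helps, one possible way (a sketch, not the only one) is to increment the counter right next to the comparison in the inner loop:

def selectionSort(data):
    count = 0
    for index in range(len(data)):
        min = index
        # Find the index'th smallest element
        for scan in range(index + 1, len(data)):
            count += 1                 # one comparison is about to happen
            if data[scan] < data[min]:
                min = scan
        if min != index:  # swap the elements
            data[index], data[min] = data[min], data[index]
    return count, data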
I have a Pandas dataframe with ~100,000,000 rows and 3 columns (Names str, Time int, and Values float), which I compiled from ~500 CSV files using glob.glob(path + '/*.csv').
Two different names alternate in the data; the job is to go through it and count the number of times a value associated with a specific name, ABC, deviates from its preceding value by more than ±100, given that the previous 50 values for that name did not deviate by more than ±10.
I initially solved it with a for loop function that iterates through each row, as shown below. It checks for the correct name, then checks the stability of the previous values of that name, and finally adds one to the count if there is a large enough deviation.
import numpy as np

count = 0
stabilityTime = 0
i = 0

# names and values are the "Names" and "Values" columns of the dataframe
if names[0] == "ABC":
    j = values[0]
    stability = np.full(50, values[0])
else:
    j = values[1]
    stability = np.full(50, values[1])

for name in names:
    value = values[i]
    if name == "ABC":
        if j - 10 < value < j + 10:
            stabilityTime += 1
        if stabilityTime >= 50 and np.std(stability) < 10:
            if value > j + 100 or value < j - 100:
                stabilityTime = 0
                count += 1
        stability = np.roll(stability, -1)
        stability[-1] = value
        j = value
    i += 1
Naturally, this process takes a very long time to run. I have looked at NumPy vectorization, but do not see how I can apply it in this case. Is there some way I can optimize this?
Thank you in advance for any advice!
Bonus points if you can give me a way to read and concatenate all the data from every CSV file in the directory that is faster than my current glob.glob(path + '/*.csv') approach.
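For the CSV-reading part, a common pattern (a sketch, assuming each file loads cleanly with pandas.read_csv) is to read every file and concatenate once at the end, rather than growing a dataframe inside a loop:

import glob
import pandas as pd

# path is the directory from the question; a single pd.concat call over all
# files is much cheaper than repeatedly appending inside the loop.
files = glob.glob(path + '/*.csv')
df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)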
I would like to take a list of unknown size, containing ndarrays, where each ndarray could have any dimension and size independent from the others, and replace values at random spots in this entire data structure.
I can create an index for a random spot by doing this:

import random

number_of_weights = 0
for w in weights:
    number_of_weights += w.size   # total number of scalar values across all arrays
index = random.randrange(number_of_weights)

The problem is how to do the replacement at that flat index without recursively checking whether I am at the last dimension, adding to one counter until it exceeds the index and decrementing another counter to know where in the last dimension I am inserting.
I found out about a solution that uses the ravel() function.
import copy
import random


def get_row_and_index(weights, index):
    index_const = index
    row = 0
    count = weights[row].size - 1
    # Walk the list until the flat index falls inside weights[row]
    while count < index_const:
        index -= weights[row].size
        row += 1
        count += weights[row].size
    return row, index


def mutate_weights(weights, n_mutations):
    new_weights = copy.deepcopy(weights)
    number_of_weights = 0
    a = 0
    b = 0
    for i in new_weights:
        number_of_weights += i.size
        a = min(a, i.min())
        b = max(b, i.max())
    n_mutations = min(number_of_weights, n_mutations)
    for i in range(n_mutations):
        index = random.randrange(0, number_of_weights)
        row, index = get_row_and_index(new_weights, index)
        new_weight = random.uniform(a, b)
        # ravel() returns a view for contiguous arrays, so writing through it
        # updates new_weights[row] in place
        flat_row = new_weights[row].ravel()
        flat_row[index] = new_weight
    return new_weights
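As a made-up usage example (the shapes below are arbitrary):

import numpy as np

weights = [np.zeros((2, 3)), np.ones(5), np.arange(8.0).reshape(2, 2, 2)]
mutated = mutate_weights(weights, n_mutations=3)   # the original list is left untouched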
I am already using TensorFlow, so I came up with a solution that uses it.
The idea was to find which ndarray in the list the flat index falls into, reshape that ndarray into a flat array, do the replacement, reshape it back to its original shape, and replace its old copy in the list.
Please note that this is not as efficient as it could be, due to the repeated index bookkeeping and replacements. If it becomes an issue, the indices could be pre-computed and each ndarray replaced at most once, instead of once per iteration of the loop below.
import random

import tensorflow as tf


def mutate(weights, n_mutations):
    number_of_weights = 0
    a = 0
    b = 0
    for i in weights:
        number_of_weights += i.size
        a = min(a, i.min())
        b = max(b, i.max())
    n_mutations = min(number_of_weights, n_mutations)
    row = 0
    count = weights[row].size - 1
    for i in range(n_mutations):
        index = random.randrange(0, number_of_weights)
        index_const = index
        # Walk the list until the flat index falls inside weights[row]
        while count < index_const:
            index -= weights[row].size
            row += 1
            count += weights[row].size
        new_weight = random.uniform(a, b)
        shape = weights[row].shape
        # tf.reshape(...).numpy() gives a flat copy; write into it, reshape it
        # back, and swap it into the list in place of the old array
        flat_row = tf.reshape(weights[row], [-1]).numpy()
        flat_row[index] = new_weight
        new_row = tf.reshape(flat_row, shape).numpy()
        weights[row] = new_row
        # Reset the walk for the next mutation
        row = 0
        count = weights[row].size - 1
    return weights
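A made-up usage example (arbitrary shapes; TensorFlow 2.x eager execution is assumed, since .numpy() is called on the reshape results). Note that, unlike mutate_weights above, this mutates the list in place:

import numpy as np

weights = [np.random.rand(2, 3), np.random.rand(5)]
weights = mutate(weights, n_mutations=4)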
import pandas as pd

df = pd.DataFrame({
    'label': [f"subj_{i}" for i in range(28)],
    'data': [i for i in range(1, 14)] + [1, 0, 0, 0, 2] + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
})
I have a dataset something like that.
I want to cut it where the longest run of 0s begins, so I want to cut at index 18, but I want to leave indices 14-16 intact. So far I've tried stuff like:
Counters
cad_recorder = 0
new_index = []
for i, row in tqdm(temp_df.iterrows()):
    if row['cadence'] == 0:
        cad_recorder += 1
        new_index.append(i)
But obviously that won't work, since the indices will be rewritten at each occurrence of zero.
I also tried a dictionary, but I'm not sure how to compare previous and next values using iterrows.
I also took the rolling mean for X rows at a time, and if it was zero I recorded an index. But then I got stuck at actually inferring the range of indices, or at finding the longest sequence of zeroes.
Edit: A friend of mine suggested the following logic, which gave the same results as #shubham-sharma's answer. The poster's solution is much more pythonic and elegant.
def find_longest_zeroes(df):
    '''
    Finds the index at which the longest repetition of values <= 1 begins
    '''
    current_length = 0
    max_length = 0
    start_idx = 0
    max_idx = 0
    for i in range(len(df['data'])):
        if df['data'].iloc[i] <= 1:   # in my real dataset this was column 9, i.e. df.iloc[i, 9]
            if current_length == 0:
                start_idx = i
            current_length += 1
            if current_length > max_length:
                max_length = current_length
                max_idx = start_idx
        else:
            current_length = 0
    return max_idx
The code I went with following #shubham-sharma's solution:
cut_us_sof = {}
og_df_sof = pd.DataFrame()
cut_df_sof = pd.DataFrame()

for lab in df['label'].unique():
    temp_df = df[df['label'] == lab].reset_index(drop=True)
    mask = temp_df['data'] <= 1  # some values in actual dataset were 0.0000001
    counts = temp_df[mask].groupby((~mask).cumsum()).transform('count')['data']
    idx = counts.idxmax()
    # my dataset's trailing zeroes are usually after 200th index. But I also
    # didn't want to remove trailing zeroes < 500 in length
    if (idx > 2000) & (counts.loc[idx] > 500):
        cut_us_sof[lab] = idx
        og_df_sof = og_df_sof.append(temp_df)
        cut_df_sof = cut_df_sof.append(temp_df.iloc[:idx, :])
We can use boolean masking and cumsum to identify the blocks of zeros, then group these blocks and transform with count, followed by idxmax to get the starting index of the block with the most consecutive zeros.
m = df['data'].eq(0)
idx = m[m].groupby((~m).cumsum()).transform('count').idxmax()
print(idx)
18
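If the goal is then to drop everything from that block onward (as described in the question), one way, sticking with the example df here, would be:

df_cut = df.iloc[:idx]   # keeps indices 0..17, including the short run at 14-16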
I have a dataframe whose index is a datetime, and it is sorted. Basically I want to create columns rolling_time1, rolling_time2, ... etc., where the value is the count of rows after the current row that fall within timex. I created the following, but it is very slow. Any ways to make this faster?
# GetWinddownLeft, wd_file/winddown and rollings come from the rest of my code
def sum_window_wd(row, wd_file, wd, df, num):
    if row.start_index > num:
        return row['rolling_' + str(wd)]
    count = 0
    for i in range(row.start_index + 1, len(df)):
        if GetWinddownLeft(wd_file, df.iloc[i].name, row.name) < wd:
            count = count + 1
        else:
            break
    return count

for rolling in rollings:
    df['rolling_' + str(rolling)] = 0

for rolling in rollings:
    df['rolling_' + str(rolling)] = df.apply(sum_window_wd, axis=1, args=(winddown, rolling, df, len(df)))
I have a collection of 101 documents. I need to iterate over them, taking 10 documents at a time, and store the value of a particular field (of those 10 documents) in a list.
I tried this:
values = db.find({}, {"field": 1})
urls = []
count = 0
for value in values:
    if count < 10:
        urls.append(value["field"])
        count = count + 1
        print count
    else:
        print urls
        urls = []
        urls.append(value["field"])
        count = 1
It never prints the last batch, because the loop ends before the else branch is reached again. Is there an elegant way to do this and rectify this situation?
You reset count to 0 every time the loop restarts. Move the declaration outside the loop:
count = 0
for value in values:
If urls is already filled, this will be your only problem.
As far as I can tell, you have some data that you want to organize into batches of size 10. If so, perhaps this will help:
N = 10
values = list(db.find({}, {"field": 1}))
url_batches = [
    [v['field'] for v in values[i:i+N]]
    for i in xrange(0, len(values), N)
]
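You could then loop over the batches, for example (each batch is a list of up to 10 field values; with 101 documents the last batch holds just 1):

for batch in url_batches:
    print batch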