I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline
def truncate(f, n):
    return math.floor(f * 10 ** n) / 10 ** n

def chooseCenter(data, centers):
    length = data.shape
    cent = []
    while len(cent) < centers:
        x = random.randrange(0, length[0])
        y = random.randrange(0, length[1])
        if data.iloc[x][y] not in cent:
            d = truncate(data.iloc[x][y], 2)
            cent.append(d)
    return cent
def distance(val, center):
    return math.sqrt((val - center)**2)

def getDistances(centers, data):
    length = data.shape
    dist = []
    for i in range(length[0]):
        for j in range(length[1]):
            y = []
            for k in range(len(centers)):
                val = distance(data.iloc[i][j], centers[k])
                y.append(truncate(val, 3))
            dist.append(y)
    return dist
def findClosest(data, dist):
    close = data.copy()
    length = close.shape
    indexes = []
    for i in range(len(dist)):
        pt = min(dist[i])
        idx = dist[i].index(pt)
        indexes.append(idx)
    #print(indexes)
    length = data.shape
    n = np.array(indexes)
    n = pd.DataFrame(np.reshape(n, (length[0], length[1])))
    #reshape this data frame into the same shape as the data
    #keep running the find closest until there is no change
    #try heatmap on this?
    #this should cluster it, but to make sure test it
    #might need to do some tweaking to this
    return n
    # for i in range(length[0]):
    #     for j in range(length[1]):
    #         print('dist[i]', dist[j])
    #         pt = min(dist[j])
    #         print(pt)
    #         idx = dist[j].index(pt)
    #         close.iloc[i][j] = int(idx)
    #return close
def computeNewCenter(data, close):
    d = dict()
    for i in range(len(close)):
        for j in range(len(close[0])):
            d[close.iloc[i][j]] = []
    for i in range(len(data)):
        for j in range(len(data[0])):
            if close.iloc[i][j] in d:
                d[close.iloc[i][j]].append(data.iloc[i][j])
    newCenters = []
    for key, value in d.items():
        m = np.mean(value)
        newCenters.append(truncate(m, 3))
    return newCenters
    # lst = [[] * numcenters]
    # for i in range(len(close)):
    #     for j in range(len(close[0])):
    #         if close.iloc[i][j]
def main():
    data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
    data = data.T
    #print(data)
    df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
    df = df.iloc[::-1]
    # print(df)
    # print(df.iloc[1][9])
    # print(df)
    # print(df.iloc[0][1])
    # heatmap = plt.pcolor(df, cmap=plt.cm.bwr)
    # plt.colorbar(heatmap)
    c = chooseCenter(df, 3)
    print(c)
    #print(len(c))
    dist = getDistances(c, df)
    #print(dist)
    y = findClosest(df, dist)
    # q = []
    # for i in range(len(c)):
    #     q.append([])
    # #print(q)
    j = computeNewCenter(df, y)
    #print(j)
    length = df.shape
    oldFrame = pd.DataFrame(np.ndarray((length[0], length[1])))
    oldFrame = oldFrame.fillna(0)
    ct = 0
    while y.equals(oldFrame) == False:
        ct += 1
        oldFrame = y.copy()
        c = computeNewCenter(df, oldFrame)
        #print(c)
        dist = getDistances(c, df)
        #print(dist)
        y = findClosest(df, dist)
        #print(y)
    #plt.pcolor(df, cmap=plt.cm.bwr)
    l = []
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 1:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 2:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 0:
                l.append(df.iloc[i][j])
    l = np.ndarray((length[0], length[1]))
    l = pd.DataFrame(l)
    print(l)
    hm = plt.pcolor(l, cmap=plt.cm.bwr)
    plt.colorbar(hm)
    # print(y)
    # print(c)
    # print(ct)
    #plt.pcolor(y, cmap=plt.cm.bwr)

if __name__ == '__main__':
    main()
My line of thinking was this:
First, randomly choose the centers.
Then build a list of lists holding, for each point, its distance to each center.
Find the index of the minimum distance for each point.
Create a data frame of the same size as the data set and fill each element with the index of the center that point is closest to.
Recompute each center by taking the mean of the points assigned to that center's index.
Repeat this process until the index data frame no longer changes.
Create a new data frame and place the points that share a center close together in the frame.
Then create the heatmap.
This did not seem to work, though.
Am I on the right track or completely off? If I am on the right track, which parts would I need to change in order to fix the issue? If I'm not, could you please point me in the right direction?
Here is a comparison of the maps: the first is the one my program generated, while the second is the way it is supposed to look.
I know my problem lies in some part of the k-means clustering algorithm. My guess is that it is either in the reassignment stage, where the points are reassigned to centroids and the new centroids are calculated, or in the stopping condition, in that the algorithm does not run long enough. In the back of my head, something also tells me that I am not doing this as efficiently as I could and that I am missing something key. I have watched several videos on k-means clustering and understand it conceptually; I'm just having a hard time implementing it.
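For comparison, here is a minimal sketch of how the assignment and update steps are usually written for 1-D k-means on the flattened values; the names and the vectorized style are only illustrative and are not taken from my code above:

import numpy as np

def kmeans_1d(values, k=3, max_iter=100, seed=0):
    # minimal 1-D k-means on a flat NumPy array of values (illustrative sketch only)
    rng = np.random.default_rng(seed)
    centers = rng.choice(values, size=k, replace=False)
    for _ in range(max_iter):
        # assignment step: index of the nearest center for every value
        labels = np.argmin(np.abs(values[:, None] - centers[None, :]), axis=1)
        # update step: each center becomes the mean of the values assigned to it
        # (no guard against empty clusters; good enough for a sketch)
        new_centers = np.array([values[labels == j].mean() for j in range(k)])
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return labels, centers

# e.g. labels, centers = kmeans_1d(df.values.ravel(), k=3)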
Related
I am trying to use a k-d tree data structure to remove the closest points from an array, preferably without for loops.
import sys
import time
import scipy.spatial
class KDTree:
    """
    Nearest neighbor search class with KDTree
    """

    def __init__(self, data):
        # store kd-tree
        self.tree = scipy.spatial.cKDTree(data)

    def search(self, inp, k=1):
        """
        Search NN

        inp: input data, single frame or multi frame
        """
        if len(inp.shape) >= 2:  # multi input
            index = []
            dist = []
            for i in inp.T:
                idist, iindex = self.tree.query(i, k=k)
                index.append(iindex)
                dist.append(idist)
            return index, dist
        dist, index = self.tree.query(inp, k=k)
        return index, dist

    def search_in_distance(self, inp, r):
        """
        find points within a distance r
        """
        index = self.tree.query_ball_point(inp, r)
        return np.asarray(index)
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

start = time.time()
fig, ar = plt.subplots()
t = 0
R = 50.0
u = R * np.cos(t)
v = R * np.sin(t)
x = np.linspace(-100, 100, 51)
y = np.linspace(-100, 100, 51)
xx, yy = np.meshgrid(x, y)
points = np.vstack((xx.ravel(), yy.ravel())).T
Tree = KDTree(points)
ind = Tree.search_in_distance([u, v], 10.0)
ar.scatter(points[:,0], points[:,1], c='k', s=1)
infected = points[ind]
ar.scatter(infected[:,0], infected[:,1], c='r', s=5)

def animate(i):
    global R, t, start, points
    ar.clear()
    u = R * np.cos(t)
    v = R * np.sin(t)
    ind = Tree.search_in_distance([u, v], 10.0)
    ar.scatter(points[:,0], points[:,1], c='k', s=1)
    infected = points[ind]
    ar.scatter(infected[:,0], infected[:,1], c='r', s=5)
    #points = np.delete(points,ind)
    t += 0.01
    end = time.time()
    if end - start != 0:
        print((end - start), end="\r")
    start = end

ani = animation.FuncAnimation(fig, animate, interval=20)
plt.show()
but no matter what I do I can't get np.delete to work with the indices returned by the query_ball_point method. What am I missing?
I would like the red-colored points to vanish from the points array in each iteration.
Your points array is an Nx2 matrix. Your ind indices are a list of row indices. What you need is to specify the axis along which to delete, ultimately this:
points = np.delete(points,ind,axis=0)
Also, once you delete rows, watch out for stale indices in your next iteration/calculations: the tree was built on the original array, so the indices it returns will no longer line up with the shrunken array. You may want one copy that you delete points from for plotting, and another copy for the calculations that you never delete from.
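A toy example of the row-wise deletion (the array and indices here are made up just to show the shapes):

import numpy as np

points = np.arange(12, dtype=float).reshape(6, 2)  # toy Nx2 array
ind = [1, 4]                                       # row indices, e.g. from query_ball_point

# without axis, np.delete flattens the array first; axis=0 removes whole rows
trimmed = np.delete(points, ind, axis=0)
print(trimmed.shape)  # (4, 2)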
I'm currently learning about the PLA (perceptron learning algorithm), and I found something weird about my implementation.
First of all, the data is linearly separable, so it converges every time.
I implemented my PLA by visiting examples in a fixed, pre-determined random cycle, and repeated the experiment a large number of times.
When I plot the number of updates vs. frequency, I find that some update counts have frequency 0; for example, it never halts with 34, 36, 38, or 40 updates. Why is this?
Also, why isn't the plot a single-peak curve, but rather a three-peak curve?
Train Data
My PLA plot
Here is my implementation:
import numpy as np
import random as rd
import matplotlib.pyplot as plt

#Read Data
Data_X = []
Data_Y = []
Train_Data = "hw1_7_train"
f = open(Train_Data, "r")
if f.mode == "r":
    fl = f.readlines()
    for line in fl:
        Data = line.split()
        Data_X.append([1.0] + [float(i) for i in Data[0:4]])
        Data_Y.append(int(Data[4]))
f.close()
Data_X = np.array(Data_X)
Data_Y = np.array(Data_Y)

def GoNext(Pos, Length):
    if Pos < Length-1:
        Pos += 1
    else:
        Pos = 0
    return Pos

def sign(x):
    return -1 if x <= 0 else 1

def PLA(X, Y, cycle):
    weight = np.array([0.0] * 5)
    Length = len(Y)
    Success = 0
    LastFail = -1
    Current_Id = 0
    UpdateCount = 0
    while(not Success):
        Current_Pos = cycle[Current_Id]
        Sign = sign(np.inner(weight, X[Current_Pos]))
        #If the current point is correct, check if we can halt, else continue
        if Sign == Y[Current_Pos]:
            #Went for a full round without mistake with initial weight (NOT likely to happen)
            if LastFail == -1 and Current_Id == Length-1:
                Success = 1
                continue
            #Went for a full round without mistake from last point we modified
            elif Current_Pos == LastFail:
                Success = 1
                continue
            #Otherwise go to the next point
            else:
                Current_Id = GoNext(Current_Id, Length)
                continue
        #If the current point is error, modify the weight
        else:
            LastFail = Current_Pos
            #Modify the weight
            weight += Y[Current_Pos] * X[Current_Pos]
            UpdateCount += 1
            Current_Id = GoNext(Current_Id, Length)
            continue
    return UpdateCount

TotalIter = 9999
TrackUpdateCount = []
for iter in range(TotalIter):
    Cycle = list(range(len(Data_Y)))
    rd.shuffle(Cycle)
    TrackUpdateCount.append(PLA(Data_X, Data_Y, Cycle))
print("Average Update: ", sum(TrackUpdateCount)/TotalIter)

#Plotting...
UpperBound = max(TrackUpdateCount)
LowerBound = min(TrackUpdateCount)
x_axis = list(range(LowerBound, UpperBound+1))
y_axis = [0]*(UpperBound-LowerBound+1)
for i in range(LowerBound, UpperBound+1):
    y_axis[i-LowerBound] = TrackUpdateCount.count(i)
plt.bar(x_axis, y_axis)
plt.xlabel("Number of updates")
plt.ylabel("Frequency")
plt.show()
I wrote some code a while ago that processes spectra, reading data from text files and performing calculations on it. I started with code that just does everything line by line without any functions, and despite being long, it finishes running in 2.11 seconds (according to %%timeit). Below is that original code, labeled as such.
However, I wanted to put my code into functions instead, to allow for easier readability and usage with different models in the future. Even though I'm using all the same steps as I did before (but this time inside my functions), it is so much slower. This code is also below. Now, I have to wait for about 15-20 minutes to get the same outputs. Why is it so much slower, and is there any way I can make it significantly faster but still use functions?
Original Code:
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate

filename = 'bpass_spectra.txt'
extinctionfile = 'ExtinctionLawPoints.txt' # from R_V = 4.0
pointslist = []
datalist = []
speclist = []

# Constants
Msun = 1.98892e30 # solar mass [kg]
h = 4.1357e-15    # Planck's constant [eV s]
c = float(3e8)    # speed of light [m/s]

# Read spectra file
f = open(filename, 'r')
rawspectra = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawspectra[0])
del rawspectra[0]
for i in range(len(rawspectra)):
    newlist = rawspectra[i].split(' ')
    datalist.append(newlist)

# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()
for i in range(len(rawpoints)):
    newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
    pointslist.append(newlst)
pointslist = pointslist[3:]
lambdalist = [float(item[0]) for item in pointslist]
k_abslist = [float(item[4]) for item in pointslist]
xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)

# Create new lists
Elist = [float(item[0]) for item in datalist]
speclambdalist = [h*c*1e9/E for E in Elist]
z1list = [float(item[1]) for item in datalist]
speclist.extend(z1list)
met = met[0]
klist = [None]*len(speclist)
Loutlist = [None]*len(speclist)
Tlist = [None]*len(speclist)

# Define parameters
b = 2.0
R = 1.0
z = 1.0
Mgas = 1.0   # mass of gas, input
Mhalo = 2e41 # mass of dark matter halo, known

if float(met) > 0.0052:
    DGRlist = [50.0*np.exp(-2.21)*float(met)]*len(speclist)
elif float(met) <= 0.0052:
    DGRlist = [((50.0*float(met))**3.15)*np.exp(-0.96)]*len(speclist)
for i in range(len(speclist)):
    if float(Elist[i]) <= 4.1357e-3: # frequencies <= 10^12 Hz
        klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**b # extinction law [cm^2/g]
    elif float(Elist[i]) > 4.1357e-3: # frequencies > 10^12 Hz
        klist[i] = k_interp(Elist[i]) # interpolated function's value at Elist[i]
Mdustlist = [Mgas*DGR for DGR in DGRlist] # dust mass
Rhalo = 0.784*(0.27**2.0)*(0.7**(-2.0/3.0))*float(10.0/(1.0+z))*((Mhalo/(1e8*Msun))**(1.0/3.0))
Rdust = 0.018*Rhalo # [kpc]
for i in range(len(speclist)):
    Tlist[i] = 3*Mdustlist[i]*klist[i]/(4*np.pi*Rdust)
Linlist = [float(spectra)*R for spectra in speclist]

# Outgoing luminosity as function of wavelength
for i in range(len(Linlist)):
    Loutlist[i] = Linlist[i]*np.exp(-Tlist[i])

# Test the calculation
print "LIN ELEMENTS 0 AND 1000:", Linlist[0], Linlist[1000]
print "LOUT ELEMENTS 0 AND 1000:", Loutlist[0], Loutlist[1000]
New "function-ized" Code (much slower):
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate

# Required files and lists
filename = 'bpass_spectra.txt' # number of columns = 4
extinctionfile = 'ExtinctionLawPoints.txt' # R_V = 4.0
datalist = []
if filename == 'bpass_spectra.txt':
    filetype = 4
else:
    filetype = 1
if extinctionfile == 'ExtinctionLawPoints.txt':
    R_V = 4.0
else:
    R_V = 1.0 # to be determined

# Constants
M_sun = 1.98892e30 # solar mass [kg]
h = 4.1357e-15     # Planck's constant [eV s]
c = float(3e8)     # speed of light [m/s]

# Inputs
beta = 2.0
R = 1.0
z = 1.0
M_gas = 1.0
M_halo = 2e41

# Read spectra file
f = open(filename, 'r')
rawlines = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawlines[0])
del rawlines[0]
for i in range(len(rawlines)):
    newlist = rawlines[i].split(' ')
    datalist.append(newlist)

# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()

def interpolate(R_V, rawpoints, Elist, j):
    pointslist = []
    if R_V == 4.0:
        for i in range(len(rawpoints)):
            newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
            pointslist.append(newlst)
        pointslist = pointslist[3:]
        lambdalist = [float(item[0]) for item in pointslist]
        k_abslist = [float(item[4]) for item in pointslist]
        xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
        k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
    return k_interp(Elist[j])

# Dust extinction function
def dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met):
    speclist = []
    if filetype == 4:
        metallicity = float(met[0])
        Elist = [float(item[0]) for item in datalist]
        speclambdalist = [h*c*1e9/E for E in Elist]
        met1list = [float(item[1]) for item in datalist]
        speclist.extend(met1list)
    klist, Tlist = [None]*len(speclist), [None]*len(speclist)
    if metallicity > 0.0052:
        DGRlist = [50.0*np.exp(-2.21)*metallicity]*len(speclist) # dust to gas ratio
    elif metallicity <= 0.0052:
        DGRlist = [((50.0*metallicity)**3.15)*np.exp(-0.96)]*len(speclist)
    for i in range(len(speclist)):
        if Elist[i] <= 4.1357e-3: # frequencies <= 10^12 Hz
            klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**beta # extinction law [cm^2/g]
        elif Elist[i] > 4.1357e-3: # frequencies > 10^12 Hz
            klist[i] = interpolate(R_V, rawpoints, Elist, i) # interpolated function's value at Elist[i]
    Mdustlist = [M_gas*DGR for DGR in DGRlist] # dust mass
    R_halo = 0.784*(0.27**2.0)*(0.7**(-2.0/3.0))*float(10/(1+z))*((M_halo/(1e8*M_sun))**(1.0/3.0))
    R_dust = 0.018*R_halo # [kpc]
    # Optical depth calculation
    Tlist = [3*Mdustlist[i]*klist[i]/(4*np.pi*R_dust) for i in range(len(speclist))]
    # Ingoing and outgoing luminosities as functions of wavelength
    Linlist = [float(spectra)*R for spectra in speclist]
    Loutlist = [Linlist[i]*np.exp(-Tlist[i]) for i in range(len(speclist))]
    return speclambdalist, Linlist, Loutlist

print dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)
Even when I only have the function return Loutlist instead of the tuple of three lists, it's still extremely slow. Any ideas on why this is? Also, I'm going to want to return the tuple and then plot speclambdalist versus Linlist, and plot speclambdalist versus Loutlist on the same plot. But I'm under the impression that each time I call dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)[i], where i = 0, 1, or 2 (and I'll be doing this multiple times), it will have to run the whole function again. Is there any way to bypass these extra runs to further increase speed? Thank you!
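Would something like the following be the way to avoid the extra runs? It continues from the code above, calling the function once and reusing the unpacked results (the plotting lines are just a sketch of what I intend):

# call the function once and keep all three results
speclambdalist, Linlist, Loutlist = dust(interpolate, filetype, datalist,
                                         beta, R, z, M_gas, M_halo, met)

# reuse the stored lists for both plots instead of calling dust() again
plt.plot(speclambdalist, Linlist, label='L_in')
plt.plot(speclambdalist, Loutlist, label='L_out')
plt.legend()
plt.show()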
I'm currently trying to draw some edges in NetworkX. My nodes have two patch properties, position and status, which are used in a colonisation simulation algorithm. I've been trying to scale up my simulation, which has meant turning away from working out Euclidean distances between my nodes (and also away from code that works!).
I have a CSV holding the row-number indices of the nearest neighbours of each node; each index corresponds to a row of another CSV that contains the 3D coordinates of the nodes. For example, row 0 of the nearest-neighbour CSV might hold 0, 56, 76 in separate columns, meaning node 0 has nearest neighbours at nodes 56 and 76, which correspond to rows 0, 56 and 76 of the coordinate CSV.
I then need to draw edges between these nearest-neighbour nodes so my algorithm can work with them. So I have some pseudo-code:
import networkx as nx
import numpy as np
from sklearn.neighbors import BallTree
import csv
from itertools import izip_longest
import pandas as pd

density = 0.14 #Stellar density per cubic parsec
L = 100
Patches = int(0.056*density*L**3+15)
P_init = 0.0001 # Probability that a patch will be occupied at the beginning
Distance = 10

dat = np.random.uniform(low = -1, high = 1, size = (Patches,3)) * L
np.savetxt('nearand1.csv', dat, delimiter = ',')
nearand = np.genfromtxt('nearand1.csv', delimiter = ',', usecols=np.arange(0, 3))
tree = BallTree(nearand, leaf_size=2)
ind = tree.query_radius(nearand, r=10)
df = pd.DataFrame(ind)
df.to_csv('bobbington4.csv', sep='e', index=False, header=False)

xcoord = nearand[:,0]
ycoord = nearand[:,1]
zcoord = nearand[:,2]

bobbington = np.genfromtxt('bobbington4.csv', delimiter = ',', dtype = 'int')
bobbington0 = bobbington[:,0]
bobbington1 = bobbington[:,1]
bobbington2 = bobbington[:,2]
bobbington3 = bobbington[:,3]
bobbington4 = bobbington[:,4]
bobbington5 = bobbington[:,5]
bobbington6 = bobbington[:,6]
bobbington7 = bobbington[:,7]
bobbington8 = bobbington[:,8]
bobbington9 = bobbington[:,9]
bobbington10 = bobbington[:,10]
bobbington11 = bobbington[:,11]
bobbington12 = bobbington[:,12]
bobbington13 = bobbington[:,13]

class patch:
    def __init__(self, status=0, pos=(0,0,0)):
        self.status = status
        self.pos = pos

    def __str__(self):
        return(str(self.status))

G = nx.Graph()
for i in xrange(Patches):
    Stat = 1 if np.random.uniform() < P_init else 0
    Pos = (xcoord[i], ycoord[i], zcoord[i])
    G.add_node(patch(Stat, Pos))

for i in G.nodes():
    for j in G.nodes():
        if i.pos where i == bobbington0:
            if j.pos where j == bobbington1:
                G.add_edge(i, j)

pos = {}
for n in G.nodes():
    pos[n] = n.pos

occup = [n.status for n in G]
Time = [0]
Occupancy = [np.sum([n.status for n in G])/float(Patches)]
Here bobbington0 is just a column of node indices going from 0 -> 7854 and bobbington1 is the first nearest neighbour for each of those nodes. What is the best way to go about this? I'm struggling to find anything on this type of problem, but I'm probably wording things poorly.
Thanks in advance for any help you can give me.
I've got it. Not particularly elegant but it works.
for i in G.nodes():
    for j in G.nodes():
        diff1 = j.boba[0] - i.bubu
        if diff1 == 0:
            G.add_edge(i, j)
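A somewhat more direct sketch, assuming the nodes are keyed by their integer row index, and assuming each row of bobbington lists the neighbour indices of that row's node (with the node's own index in the first column):

G = nx.Graph()
# key each node by its row index and store the patch data as node attributes
for i in xrange(Patches):
    stat = 1 if np.random.uniform() < P_init else 0
    G.add_node(i, status=stat, pos=(xcoord[i], ycoord[i], zcoord[i]))

# connect every node to its listed nearest neighbours
for i, neighbours in enumerate(bobbington):
    for j in neighbours[1:]:
        G.add_edge(i, j)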
I need to shift a 2D array field, i.e. I have a "previous_data" array which I access through shifted indices to create my "new_data" array.
I can do this in a nonpythonic (and slow) loop, but would very much appreciate some help in finding a pythonic (and faster) solution!
Any help and hints are very much appreciated!
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import mpl

def nonpythonic():
    #this works, but is slow (for large arrays)
    new_data = np.zeros((ny,nx))
    for j in xrange(ny):
        for i in xrange(nx):
            #go through each item, check if it is within the bounds
            #and assign the data to the new_data array
            i_new = ix[j,i]
            j_new = iy[j,i]
            if ((i_new>=0) and (i_new<nx) and (j_new>=0) and (j_new<ny)):
                new_data[j,i]=previous_data[j_new,i_new]
    ef, axar = plt.subplots(1,2)
    im = axar[0].pcolor(previous_data, vmin=0,vmax=2)
    ef.colorbar(im, ax=axar[0], shrink=0.9)
    im = axar[1].pcolor(new_data, vmin=0,vmax=2)
    ef.colorbar(im, ax=axar[1], shrink=0.9)
    plt.show()

def pythonic():
    #tried a few things here, but none are working
    #-tried assigning NaNs to indices (ix,iy) which are out of bounds, but NaN's don't work for indices
    #-tried masked arrays, but they also don't work as indices
    #-tried boolean arrays, but ended in shape mismatches
    #just as in the nonworking code below
    ind_y_good = np.where(iy>=0) and np.where(iy<ny)
    ind_x_good = np.where(ix>=0) and np.where(ix<nx)
    new_data = np.zeros((ny,nx))
    new_data[ind_y_good,ind_x_good] = previous_data[iy[ind_y_good],ix[ind_x_good]]

#some 2D array:
nx = 20
ny = 30
#array indices:
iy, ix = np.indices((ny,nx))
#modify indices (shift):
iy = iy + 1
ix = ix - 4
#create some out of range indices (which might happen in my real scenario)
iy[0,2:7] = -9999
ix[0:3,-1] = 6666
#some previous data which is the basis for the new_data:
previous_data = np.ones((ny,nx))
previous_data[2:8,10:20] = 2

nonpythonic()
pythonic()
This is the result of the working (nonpythonic) code above:
I implemented a version of pythonic that replicates nonpythonic with some masking and index fiddling - see below. By the way I think the "new" indices should be the ones corresponding to the new array, rather than the old ones, but I've left it as in your existing function.
The main thing to realise is that in your attempt in the question, your conditions
ind_y_good = np.where(iy>=0) and np.where(iy<ny)
ind_x_good = np.where(ix>=0) and np.where(ix<nx)
must be combined, since we must always have pairs of x and y indices. i.e. if the x index is invalid, then so is the y.
Finally, if the indices really are all shifted by a constant factor, you can make this even simpler by using NumPy's roll function and taking a slice of the indices corresponding to the valid area (a sketch of that follows the full example below).
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import mpl

def nonpythonic(previous_data, ix, iy, nx, ny):
    #this works, but is slow (for large arrays)
    new_data = np.zeros((ny,nx))
    for j in xrange(ny):
        for i in xrange(nx):
            #go through each item, check if it is within the bounds
            #and assign the data to the new_data array
            i_new = ix[j,i]
            j_new = iy[j,i]
            if ((i_new>=0) and (i_new<nx) and (j_new>=0) and (j_new<ny)):
                new_data[j,i]=previous_data[j_new,i_new]
    return new_data

def pythonic(previous_data, ix, iy):
    ny, nx = previous_data.shape
    iy_old, ix_old = np.indices(previous_data.shape)
    # note you must apply the same condition to both
    # index arrays
    valid = (iy >= 0) & (iy < ny) & (ix >= 0) & (ix < nx)
    new_data = np.zeros((ny,nx))
    new_data[iy_old[valid], ix_old[valid]] = previous_data[iy[valid], ix[valid]]
    return new_data

def main():
    #some 2D array:
    nx = 20
    ny = 30
    #array indices:
    iy, ix = np.indices((ny,nx))
    #modify indices (shift):
    iy = iy + 1
    ix = ix - 4
    #create some out of range indices (which might happen in my real scenario)
    iy[0,2:7] = -9999
    ix[0:3,-1] = 6666
    #some previous data which is the basis for the new_data:
    previous_data = np.ones((ny,nx))
    previous_data[2:8,10:20] = 2
    data_nonpythonic = nonpythonic(previous_data, ix, iy, nx, ny)
    data_pythonic = pythonic(previous_data, ix, iy)
    new_data = data_nonpythonic
    ef, axar = plt.subplots(1,2)
    im = axar[0].pcolor(previous_data, vmin=0,vmax=2)
    ef.colorbar(im, ax=axar[0], shrink=0.9)
    im = axar[1].pcolor(new_data, vmin=0,vmax=2)
    ef.colorbar(im, ax=axar[1], shrink=0.9)
    plt.show()
    print(np.allclose(data_nonpythonic, data_pythonic))

if __name__ == "__main__":
    main()
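And a minimal sketch of the np.roll idea for the constant-shift case (using the same shift of +1 in y and -4 in x as the example data, and ignoring the artificially injected out-of-range indices):

import numpy as np

# same toy data as in main()
previous_data = np.ones((30, 20))
previous_data[2:8, 10:20] = 2

# new_data[j, i] = previous_data[j + 1, i - 4]  ->  roll by -1 along y and +4 along x
rolled = np.roll(np.roll(previous_data, -1, axis=0), 4, axis=1)

# np.roll wraps around, so zero out the rows/columns that wrapped from the far edges
new_data = rolled.copy()
new_data[-1, :] = 0   # j + 1 ran past the bottom edge
new_data[:, :4] = 0   # i - 4 ran past the left edge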