I'm running some measurements with a Keithley 2450 SourceMeter using this code:
def res_trace(n=None, max_v=None, min_v=None, data_points=None, r_crit=None,
              ilim=None):
    beep(164, 0.5)
    # check values and provide defaults
    n = n or 0
    data_points = data_points or 100
    max_v = max_v or 0.5
    min_v = min_v or -0.5
    r_crit = r_crit or 1e+7
    ilim = ilim or 'MAXimum'
    v = np.linspace(min_v, max_v, num=data_points)
    i = []
    res_run = []
    # reset keithley so we can use it without any prior settings
    reset()
    # set up I measurement systems
    keith.write(':SENSe:FUNCtion "CURR"')
    keith.write(':SENSe:CURRent:RANGe:AUTO 1')
    keith.write(':SENSe:CURRent:UNIT AMP')
    keith.write(':SENSe:CURRent:NPLCycles DEFault')
    keith.write(':SENSe:COUNt DEFault')
    keith.write(':SENSe:CURRent:OCOM ON')
    # set up V source
    keith.write(':SOURce:FUNCtion VOLT')
    keith.write(':SOURce:VOLTage:RANGe ' + str(max_v))
    keith.write(':SOURce:VOLTage:ILIMit ' + ilim)
    # turn the output on
    keith_output('on')
    # sweep the voltage and measure the current at each step
    for j in v:
        keith.write(':SOURce:VOLT ' + str(j))
        itemp = float(keith.query(':MEASure:CURRent?'))
        i.append(itemp)
    # turn the output off
    keith_output('off')
    # plot
    plt.figure()
    plt.title('Res trace # ' + str(n))
    plt.plot(v, i, c='m')
    plt.xlabel('V')
    plt.ylabel('I')
    plt.grid()
    plt.show()
I'm currently running the script, and it takes a few seconds for the Keithley to take measurements and return values. I'd like a way to live-plot the data within the loop as it's being collected, but I have no idea how to go about this and want the process to be as simple as possible. Any suggestions?
Thanks
You can use Jupyterplot to create a real-time plot like this:
from jupyterplot import ProgressPlot
import numpy as np
pp = ProgressPlot()
for i in range(1000):
    pp.update(np.sin(i / 100))
pp.finalize()
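If the sweep runs in a Jupyter notebook, the same idea can go inside the measurement loop itself. Here is a minimal sketch (untested against the instrument) that assumes the keith handle and the v array from res_trace above, and uses ProgressPlot's custom x-value mode:
from jupyterplot import ProgressPlot

# x_iterator=False lets us supply the swept voltage as the x value
pp = ProgressPlot(x_iterator=False, x_label='V', line_names=['I'])
i = []
for j in v:
    keith.write(':SOURce:VOLT ' + str(j))
    itemp = float(keith.query(':MEASure:CURRent?'))
    i.append(itemp)
    pp.update(float(j), itemp)  # redraw with each new point
pp.finalize()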
I have this script using Cantera. I want to save data to CSV for both parts of the script: the first, which evaluates final temperature vs. autoignition delay time, and the second, which evaluates the NTC behavior. In the first part, the example suggests uncommenting # timeHistory.to_csv("time_history.csv"), but it doesn't work. I think I need to create a DataFrame, because it's not well defined (I suppose). On top of that, I also see this output: <cantera.composite.SolutionArray object at 0x7f4badca0fd0>.
How can I solve this, and how can I create the two CSVs for this script?
Thank you very much
import pandas as pd
import numpy as np
import time
import cantera as ct
print('Running Cantera version: ' + ct.__version__)
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.autolayout'] = True
plt.style.use('ggplot')
plt.style.use('seaborn-pastel')
gas = ct.Solution('Seiser.cti')
# Define the reactor temperature and pressure
reactor_temperature = 1000 # Kelvin
reactor_pressure = 101325 # Pascals
gas.TP = reactor_temperature, reactor_pressure
# Define the fuel, oxidizer and set the stoichiometry
gas.set_equivalence_ratio(phi=1.0, fuel="nc7h16", oxidizer={"o2": 1.0, "n2": 3.76})
# Create a batch reactor object and add it to a reactor network
# In this example, the batch reactor will be the only reactor
# in the network
r = ct.IdealGasReactor(contents=gas, name="Batch Reactor")
reactor_network = ct.ReactorNet([r])
# use the above list to create a DataFrame
time_history = ct.SolutionArray(gas, extra="t")
def ignition_delay(states, species):
    """
    This function computes the ignition delay from the occurrence of the
    peak in species' concentration.
    """
    i_ign = states(species).Y.argmax()
    return states.t[i_ign]
reference_species = "oh"
# Tic
t0 = time.time()
# This is a starting estimate. If you do not get an ignition within this time, increase it
estimated_ignition_delay_time = 0.1
t = 0
counter = 1
while t < estimated_ignition_delay_time:
    t = reactor_network.step()
    if not counter % 10:
        # We will save only every 10th value. Otherwise, this takes too long
        # Note that the species concentrations are mass fractions
        time_history.append(r.thermo.state, t=t)
    counter += 1
# We will use the 'oh' species to compute the ignition delay
tau = ignition_delay(time_history, reference_species)
# Toc
t1 = time.time()
print(f"Computed Ignition Delay: {tau:.3e} seconds. Took {t1-t0:3.2f}s to compute")
# If you want to save all the data - mole fractions, temperature, pressure, etc. -
# uncomment the next line
time_history.to_csv("time_history_TEST.csv")
plt.figure()
plt.plot(time_history.t, time_history(reference_species).Y, "-o")
plt.xlabel("Time (s)")
plt.ylabel("$Y_{OH}$")
plt.xlim([0,0.05])
plt.arrow(0, 0.008, tau, 0, width=0.0001, head_width=0.0005,
          head_length=0.001, length_includes_head=True, color="r", shape="full")
plt.annotate(r"$Ignition Delay: \tau_{ign}$", xy=(0,0), xytext=(0.01, 0.0082), fontsize=16);
# Make a list of all the temperatures we would like to run simulations at
T = np.hstack((np.arange(1800, 900, -100), np.arange(975, 475, -25)))
estimated_ignition_delay_times = np.ones_like(T, dtype=float)
# Make time adjustments for the highest and lowest temperatures. This we do empirically
estimated_ignition_delay_times[:6] = 6 * [0.1]
estimated_ignition_delay_times[-4:-2] = 10
estimated_ignition_delay_times[-2:] = 100
# Now create a SolutionArray out of these
ignition_delays = ct.SolutionArray(gas, shape=T.shape, extra={"tau": estimated_ignition_delay_times})
ignition_delays.set_equivalence_ratio(1.0, fuel="nc7h16", oxidizer={"o2": 1.0, "n2": 3.76})
ignition_delays.TP = T, reactor_pressure
for i, state in enumerate(ignition_delays):
    # Set up the gas and reactor
    gas.TPX = state.TPX
    r = ct.IdealGasReactor(contents=gas, name="Batch Reactor")
    reactor_network = ct.ReactorNet([r])
    reference_species_history = []
    time_history = []
    t0 = time.time()
    t = 0
    while t < estimated_ignition_delay_times[i]:
        t = reactor_network.step()
        time_history.append(t)
        reference_species_history.append(gas[reference_species].X[0])
    i_ign = np.array(reference_species_history).argmax()
    tau = time_history[i_ign]
    t1 = time.time()
    print('Computed Ignition Delay: {:.3e} seconds for T={}K. Took {:3.2f}s to compute'.format(tau, state.T, t1-t0))
    ignition_delays.tau[i] = tau
fig = plt.figure()
ax = fig.add_subplot(111)
ax.semilogy(1000/ignition_delays.T, ignition_delays.tau, 'o-')
ax.set_ylabel('Ignition Delay (s)')
ax.set_xlabel(r'$\frac{1000}{T (K)}$', fontsize=18)
# Add a second axis on top to plot the temperature for better readability
ax2 = ax.twiny()
ticks = ax.get_xticks()
ax2.set_xticks(ticks)
ax2.set_xticklabels((1000/ticks).round(1))
ax2.set_xlim(ax.get_xlim())
ax2.set_xlabel(r'Temperature: $T(K)$');
I modified the first part of the script. I removed time_history as a ct.SolutionArray(gas, extra="t"), because it made it hard to build a usable DataFrame for saving the data. Now I use pandas to save to CSV, but while it creates the CSV file with the column headers, it doesn't fill in any rows. Moreover, I see this error:
Traceback (most recent call last):
  File "test.py", line 77, in <module>
    tau = ignition_delay(tHyBatch_base, reference_species)
  File "test.py", line 50, in ignition_delay
    i_ign = states(species).Y.argmax()
TypeError: 'DataFrame' object is not callable
import pandas as pd
import numpy as np
import time
import csv
import cantera as ct
print('Running Cantera version: ' + ct.__version__)
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.autolayout'] = True
plt.style.use('ggplot')
plt.style.use('seaborn-pastel')
gas = ct.Solution('Seiser.cti')
# Define the reactor temperature and pressure
reactor_temperature = 1000 # Kelvin
reactor_pressure = 101325 # Pascals
gas.TP = reactor_temperature, reactor_pressure
# Define the fuel, oxidizer and set the stoichiometry
gas.set_equivalence_ratio(phi=1.0, fuel="nc7h16", oxidizer={"o2": 1.0, "n2": 3.76})
# Create a batch reactor object and add it to a reactor network
# In this example, the batch reactor will be the only reactor
# in the network
r = ct.IdealGasReactor(contents=gas, name="Batch Reactor")
reactor_network = ct.ReactorNet([r])
# Now compile a list of all variables for which we will store data
columnNames = [r.component_name(item) for item in range(r.n_vars)]
columnNames = ['pressure'] + columnNames
tHyBatch_base=pd.DataFrame(columns=columnNames)
tHyBatch_base.index.name = 'time'
def ignition_delay(states, species):
    """
    This function computes the ignition delay from the occurrence of the
    peak in species' concentration.
    """
    i_ign = states(species).Y.argmax()
    return states.t[i_ign]
reference_species = "oh"
# Tic
t0 = time.time()
# This is a starting estimate. If you do not get an ignition within this time, increase it
estimated_ignition_delay_time = 0.1
t = 0
counter = 1
while t < estimated_ignition_delay_time:
    t = reactor_network.step()
    if not counter % 10:
        # We will save only every 10th value. Otherwise, this takes too long
        # Note that the species concentrations are mass fractions
        state = np.hstack([r.thermo.state])
        # Update the dataframe
        tHyBatch_base.append(pd.Series(state, index=tHyBatch_base.columns[:len(state)]), ignore_index=True)
    counter += 1
tHyBatch_base.to_csv("TESTCSV.csv")
# We will use the 'oh' species to compute the ignition delay
tau = ignition_delay(tHyBatch_base, reference_species)
# Toc
t1 = time.time()
print(f"Computed Ignition Delay: {tau:.3e} seconds. Took {t1-t0:3.2f}s to compute")
Can someone help? Thanks to anyone willing to give me an answer to what seems to be an intrinsic problem with how I'm using pandas.
You only need to change the command
timeHistory.to_csv("time_history.csv")
to:
time_history.write_csv('time_history.csv')
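As an aside on the pandas version of the script: DataFrame.append returns a new DataFrame rather than modifying the one it is called on, which is consistent with the empty CSV described above. A minimal sketch of the fix, reusing the names from that script (note that DataFrame.append was later removed in pandas 2.0 in favor of pd.concat):
# DataFrame.append is not in-place; the result must be reassigned
row = pd.Series(state, index=tHyBatch_base.columns[:len(state)])
tHyBatch_base = tHyBatch_base.append(row, ignore_index=True)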
I am trying to create a 2D colored bar chart using the code below.
import numpy as np
import matplotlib.pyplot as plt
import pickle
from graphviz import Digraph
from torch.autograd import Variable
import argparse
def make_dot(var):
    '''
    Visualization of the computation graph
    Taken from : https://github.com/szagoruyko/functional-zoo/blob/master/visualize.py
    '''
    node_attr = dict(style='filled',
                     shape='box',
                     align='left',
                     fontsize='12',
                     ranksep='0.1',
                     height='0.2')
    dot = Digraph(node_attr=node_attr, graph_attr=dict(size="12,12"))
    seen = set()

    def add_nodes(var):
        if var not in seen:
            if isinstance(var, Variable):
                value = '('+(', ').join(['%d' % v for v in var.size()])+')'
                dot.node(str(id(var)), str(value), fillcolor='lightblue')
            else:
                dot.node(str(id(var)), str(type(var).__name__))
            seen.add(var)
            if hasattr(var, 'previous_functions'):
                for u in var.previous_functions:
                    dot.edge(str(id(u[0])), str(id(var)))
                    add_nodes(u[0])

    add_nodes(var.creator)
    return dot

def plot_trajectories(true_trajs, pred_trajs, nodesPresent, obs_length, name, plot_directory, withBackground=False):
    '''
    Parameters
    ==========

    true_trajs : Numpy matrix of shape seq_length x numNodes x 2
                 Contains the true trajectories of the nodes
    pred_trajs : Numpy matrix of shape seq_length x numNodes x 2
                 Contains the predicted trajectories of the nodes
    nodesPresent : A list of lists, of size seq_length
                   Each list contains the nodeIDs present at that time-step
    obs_length : Length of observed trajectory
    name : Name of the plot
    withBackground : Include background or not
    '''
    traj_length, numNodes, _ = true_trajs.shape

    # Initialize figure
    plt.figure()

    # Load the background
    # im = plt.imread('plot/background.png')
    # if withBackground:
    #     implot = plt.imshow(im)
    # width_true = im.shape[0]
    # height_true = im.shape[1]
    # if withBackground:
    #     width = width_true
    #     height = height_true
    # else:
    width = 1
    height = 1

    traj_data = {}
    for tstep in range(traj_length):
        pred_pos = pred_trajs[tstep, :]
        true_pos = true_trajs[tstep, :]
        for ped in range(numNodes):
            if ped not in traj_data and tstep < obs_length:
                traj_data[ped] = [[], []]
            if ped in nodesPresent[tstep]:
                traj_data[ped][0].append(true_pos[ped, :])
                traj_data[ped][1].append(pred_pos[ped, :])

    for j in traj_data:
        c = np.random.rand(3, 1)
        true_traj_ped = traj_data[j][0]  # List of [x,y] elements
        pred_traj_ped = traj_data[j][1]

        true_x = [(p[0]+1)/2*height for p in true_traj_ped]
        true_y = [(p[1]+1)/2*width for p in true_traj_ped]
        pred_x = [(p[0]+1)/2*height for p in pred_traj_ped]
        pred_y = [(p[1]+1)/2*width for p in pred_traj_ped]

        plt.plot(true_x, true_y, color=c, linestyle='solid', marker='o')
        plt.plot(pred_x, pred_y, color=c, linestyle='dashed', marker='x')

    if not withBackground:
        plt.ylim((1, 0))
        plt.xlim((0, 1))

    # plt.show()
    if withBackground:
        plt.savefig('plot_with_background/'+name+'.png')
    else:
        plt.savefig(plot_directory+'/'+name+'.png')

    plt.gcf().clear()
    plt.close()

def main():
    parser = argparse.ArgumentParser()
    # Experiments
    parser.add_argument('--test_dataset', type=int, default=0,
                        help='test dataset index')
    # Parse the parameters
    args = parser.parse_args()

    # Save directory
    save_directory = 'save/'
    save_directory += str(args.test_dataset) + '/'
    plot_directory = 'plot/'

    f = open(save_directory+'/results.pkl', 'rb')
    results = pickle.load(f)

    # print "Enter 0 (or) 1 for without/with background"
    # withBackground = int(input())
    withBackground = 1

    for i in range(len(results)):
        print i
        name = 'sequence' + str(i)
        plot_trajectories(results[i][0], results[i][1], results[i][2], results[i][3], name, plot_directory, withBackground)

if __name__ == '__main__':
    main()
Now I am unable to debug the Invalid RGBA argument error because I don't understand what is causing it. I even tried using random colors instead, with colors = np.random.rand(91,91,4), and still the error persists.
I have checked Stack Overflow posts regarding Invalid RGBA argument (for example this, this, this and this) and none of them seems to answer my problem.
I want to know what could be causing this error. I am using the standard Anaconda distribution for Python on Ubuntu Mate 16.
Could it be that, due to recent updates in Python, the solution in the original Stack Overflow post has become obsolete?
Just replace
c = np.random.rand(3, 1)
with this:
c = np.random.rand(3)
It removes the error: matplotlib expects a sequence of shape (3,) or (4,) for a single color argument.
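For a quick standalone check of which array shapes matplotlib accepts as a single color (a sketch, not from the original answer):
import numpy as np
import matplotlib.pyplot as plt

c_ok = np.random.rand(3)      # a single RGB triple, shape (3,): accepted
c_bad = np.random.rand(3, 1)  # a 3x1 column: raises "Invalid RGBA argument"

plt.plot([0, 1], [0, 1], color=c_ok)
# plt.plot([0, 1], [1, 0], color=c_bad)  # uncomment to reproduce the error
plt.show()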
I wrote some code a while ago that processes spectra, using data from text files and performing calculations on them. I started with code that just does everything line by line without any functions, and despite being long, it finishes running in 2.11 seconds (according to %%timeit). Below is that original code, labeled as such.
However, I wanted to put my code into functions instead, to allow for easier readability and reuse with different models in the future. Even though I'm using all the same steps as before (just inside my functions now), it is much slower. That code is also below. Now I have to wait about 15-20 minutes to get the same outputs. Why is it so much slower, and is there any way I can make it significantly faster while still using functions?
Original Code:
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
filename = 'bpass_spectra.txt'
extinctionfile = 'ExtinctionLawPoints.txt' # from R_V = 4.0
pointslist = []
datalist = []
speclist = []
# Constants
Msun = 1.98892e30 # solar mass [kg]
h = 4.1357e-15 # Planck's constant [eV s]
c = float(3e8) # speed of light [m/s]
# Read spectra file
f = open(filename, 'r')
rawspectra = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawspectra[0])
del rawspectra[0]
for i in range(len(rawspectra)):
    newlist = rawspectra[i].split(' ')
    datalist.append(newlist)
# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()
for i in range(len(rawpoints)):
    newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
    pointslist.append(newlst)
pointslist = pointslist[3:]
lambdalist = [float(item[0]) for item in pointslist]
k_abslist = [float(item[4]) for item in pointslist]
xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
# Create new lists
Elist = [float(item[0]) for item in datalist]
speclambdalist = [h*c*1e9/E for E in Elist]
z1list = [float(item[1]) for item in datalist]
speclist.extend(z1list)
met = met[0]
klist = [None]*len(speclist)
Loutlist = [None]*len(speclist)
Tlist = [None]*len(speclist)
# Define parameters
b = 2.0
R = 1.0
z = 1.0
Mgas = 1.0 # mass of gas, input
Mhalo = 2e41 # mass of dark matter halo, known
if float(met) > 0.0052:
    DGRlist = [50.0*np.exp(-2.21)*float(met)]*len(speclist)
elif float(met) <= 0.0052:
    DGRlist = [((50.0*float(met))**3.15)*np.exp(-0.96)]*len(speclist)
for i in range(len(speclist)):
    if float(Elist[i]) <= 4.1357e-3:  # frequencies <= 10^12 Hz
        klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**b  # extinction law [cm^2/g]
    elif float(Elist[i]) > 4.1357e-3:  # frequencies > 10^12 Hz
        klist[i] = k_interp(Elist[i])  # interpolated function's value at Elist[i]
Mdustlist = [Mgas*DGR for DGR in DGRlist] # dust mass
Rhalo = 0.784*(0.27**2.0)*(0.7**(-2.0/3.0))*float(10.0/(1.0+z))*((Mhalo/(1e8*Msun))**(1.0/3.0))
Rdust = 0.018*Rhalo # [kpc]
for i in range(len(speclist)):
    Tlist[i] = 3*Mdustlist[i]*klist[i]/(4*np.pi*Rdust)
Linlist = [float(spectra)*R for spectra in speclist]
# Outgoing luminosity as function of wavelength
for i in range(len(Linlist)):
    Loutlist[i] = Linlist[i]*np.exp(-Tlist[i])
# Test the calculation
print "LIN ELEMENTS 0 AND 1000:", Linlist[0], Linlist[1000]
print "LOUT ELEMENTS 0 AND 1000:", Loutlist[0], Loutlist[1000]
New "function-ized" Code (much slower):
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
# Required files and lists
filename = 'bpass_spectra.txt' # number of columns = 4
extinctionfile = 'ExtinctionLawPoints.txt' # R_V = 4.0
datalist = []
if filename == 'bpass_spectra.txt':
    filetype = 4
else:
    filetype = 1
if extinctionfile == 'ExtinctionLawPoints.txt':
    R_V = 4.0
else:
    R_V = 1.0  # to be determined
# Constants
M_sun = 1.98892e30 # solar mass [kg]
h = 4.1357e-15 # Planck's constant [eV s]
c = float(3e8) # speed of light [m/s]
# Inputs
beta = 2.0
R = 1.0
z = 1.0
M_gas = 1.0
M_halo = 2e41
# Read spectra file
f = open(filename, 'r')
rawlines = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawlines[0])
del rawlines[0]
for i in range(len(rawlines)):
    newlist = rawlines[i].split(' ')
    datalist.append(newlist)
# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()
def interpolate(R_V, rawpoints, Elist, j):
    pointslist = []
    if R_V == 4.0:
        for i in range(len(rawpoints)):
            newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
            pointslist.append(newlst)
        pointslist = pointslist[3:]
        lambdalist = [float(item[0]) for item in pointslist]
        k_abslist = [float(item[4]) for item in pointslist]
        xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
        k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
    return k_interp(Elist[j])
# Dust extinction function
def dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met):
    speclist = []
    if filetype == 4:
        metallicity = float(met[0])
        Elist = [float(item[0]) for item in datalist]
        speclambdalist = [h*c*1e9/E for E in Elist]
        met1list = [float(item[1]) for item in datalist]
        speclist.extend(met1list)
        klist, Tlist = [None]*len(speclist), [None]*len(speclist)
        if metallicity > 0.0052:
            DGRlist = [50.0*np.exp(-2.21)*metallicity]*len(speclist)  # dust to gas ratio
        elif metallicity <= 0.0052:
            DGRlist = [((50.0*metallicity)**3.15)*np.exp(-0.96)]*len(speclist)
        for i in range(len(speclist)):
            if Elist[i] <= 4.1357e-3:  # frequencies <= 10^12 Hz
                klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**beta  # extinction law [cm^2/g]
            elif Elist[i] > 4.1357e-3:  # frequencies > 10^12 Hz
                klist[i] = interpolate(R_V, rawpoints, Elist, i)  # interpolated function's value at Elist[i]
        Mdustlist = [M_gas*DGR for DGR in DGRlist]  # dust mass
        R_halo = 0.784*(0.27**2.0)*(0.7**(-2.0/3.0))*float(10/(1+z))*((M_halo/(1e8*M_sun))**(1.0/3.0))
        R_dust = 0.018*R_halo  # [kpc]
        # Optical depth calculation
        Tlist = [3*Mdustlist[i]*klist[i]/(4*np.pi*R_dust) for i in range(len(speclist))]
        # Ingoing and outgoing luminosities as functions of wavelength
        Linlist = [float(spectra)*R for spectra in speclist]
        Loutlist = [Linlist[i]*np.exp(-Tlist[i]) for i in range(len(speclist))]
    return speclambdalist, Linlist, Loutlist
print dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)
Even when I only have the function return Loutlist instead of the tuple of three lists, it's still extremely slow. Any ideas on why this is? Also, I'm going to want to return the tuple and then plot speclambdalist versus Linlist, and speclambdalist versus Loutlist, on the same plot. But I'm under the impression that each time I call dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)[i], where i = 0, 1, or 2 (and I'll be doing this multiple times), it has to run the entire function again. Is there any way to bypass these extra runs to further increase speed? Thank you!
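Judging from the code above, a likely culprit for the slowdown is that interpolate() re-parses the extinction file and re-fits interp1d for every single element inside the loop, where the original script built k_interp once. Independently of that, the repeated-run concern goes away if the function is called once and its result unpacked. A minimal sketch, using the names from the script above:
# call dust() a single time and keep all three results,
# instead of re-running it as dust(...)[0], dust(...)[1], dust(...)[2]
speclambdalist, Linlist, Loutlist = dust(interpolate, filetype, datalist,
                                         beta, R, z, M_gas, M_halo, met)

plt.figure()
plt.plot(speclambdalist, Linlist, label='L_in')
plt.plot(speclambdalist, Loutlist, label='L_out')
plt.legend()
plt.show()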
I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline
def truncate(f, n):
    return math.floor(f * 10 ** n) / 10 ** n

def chooseCenter(data, centers):
    length = data.shape
    cent = []
    while len(cent) < centers:
        x = random.randrange(0, length[0])
        y = random.randrange(0, length[1])
        if data.iloc[x][y] not in cent:
            d = truncate(data.iloc[x][y], 2)
            cent.append(d)
    return cent

def distance(val, center):
    return math.sqrt((val - center)**2)

def getDistances(centers, data):
    length = data.shape
    dist = []
    for i in range(length[0]):
        for j in range(length[1]):
            y = []
            for k in range(len(centers)):
                val = distance(data.iloc[i][j], centers[k])
                y.append(truncate(val, 3))
            dist.append(y)
    return dist

def findClosest(data, dist):
    close = data.copy()
    length = close.shape
    indexes = []
    for i in range(len(dist)):
        pt = min(dist[i])
        idx = dist[i].index(pt)
        indexes.append(idx)
    #print(indexes)
    length = data.shape
    n = np.array(indexes)
    n = pd.DataFrame(np.reshape(n, (length[0], length[1])))
    # reshape this data frame into the same shape as the data
    # keep running the find closest until there is no change
    # try heatmap on this?
    # this should cluster it, but to make sure test it
    # might need to do some tweaking to this
    return n
    # for i in range(length[0]):
    #     for j in range(length[1]):
    #         print('dist[i]', dist[j])
    #         pt = min(dist[j])
    #         print(pt)
    #         idx = dist[j].index(pt)
    #         close.iloc[i][j] = int(idx)
    # return close

def computeNewCenter(data, close):
    d = dict()
    for i in range(len(close)):
        for j in range(len(close[0])):
            d[close.iloc[i][j]] = []
    for i in range(len(data)):
        for j in range(len(data[0])):
            if close.iloc[i][j] in d:
                d[close.iloc[i][j]].append(data.iloc[i][j])
    newCenters = []
    for key, value in d.items():
        m = np.mean(value)
        newCenters.append(truncate(m, 3))
    return newCenters
    # lst = [[] * numcenters]
    # for i in range(len(close)):
    #     for j in range(len(close[0])):
    #         if close.iloc[i][j]

def main():
    data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
    data = data.T
    #print(data)
    df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
    df = df.iloc[::-1]
    # print(df)
    # print(df.iloc[1][9])
    # print(df.iloc[0][1])
    # heatmap = plt.pcolor(df, cmap=plt.cm.bwr)
    # plt.colorbar(heatmap)
    c = chooseCenter(df, 3)
    print(c)
    #print(len(c))
    dist = getDistances(c, df)
    #print(dist)
    y = findClosest(df, dist)
    # q = []
    # for i in range(len(c)):
    #     q.append([])
    # #print(q)
    j = computeNewCenter(df, y)
    #print(j)
    length = df.shape
    oldFrame = pd.DataFrame(np.ndarray((length[0], length[1])))
    oldFrame = oldFrame.fillna(0)
    ct = 0
    while y.equals(oldFrame) == False:
        ct += 1
        oldFrame = y.copy()
        c = computeNewCenter(df, oldFrame)
        #print(c)
        dist = getDistances(c, df)
        #print(dist)
        y = findClosest(df, dist)
        #print(y)
    #plt.pcolor(df, cmap=plt.cm.bwr)
    l = []
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 1:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 2:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 0:
                l.append(df.iloc[i][j])
    l = np.ndarray((length[0], length[1]))
    l = pd.DataFrame(l)
    print(l)
    hm = plt.pcolor(l, cmap=plt.cm.bwr)
    plt.colorbar(hm)
    # print(y)
    # print(c)
    # print(ct)
    #plt.pcolor(y, cmap=plt.cm.bwr)

if __name__ == '__main__':
    main()
My line of thinking was this:
1. First, randomly choose the centers.
2. Then create a list of lists holding, for each point, its distance to each center.
3. Find the index of the minimum distance for each point.
4. Create a data frame of the same size as the data set and fill each element with the index of the center that point is closest to.
5. Recompute the centers by taking the mean of the points that share a center index.
6. Repeat this process until the index data frame no longer changes.
7. Create a new data frame, placing points that share a center index close together in the frame.
8. Then create the heatmap.
This did not seem to work, though. Am I on the right track or completely off? If I am on the right track, which parts would I need to change to fix the issue? If not, could you please point me in the right direction?
Here is a comparison of the maps (images linked in the original post): the first is the one my program generated, while the second is the way it is supposed to look.
I know my problem lies somewhere in the k-means clustering algorithm. My guess is that it is either in the reassignment stage, where points are reassigned to centroids and the new centroids are calculated, or in the stopping condition, in that the algorithm does not run long enough. In the back of my head, something also tells me that I am not doing this as efficiently as I could be and that I am missing something key. I have watched several videos on k-means clustering and understand it conceptually; I'm just having a hard time implementing it.
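For what it's worth, the assign/recompute cycle described in the numbered list above can be written with numpy broadcasting instead of per-element loops. This is a hypothetical rewrite of the element-wise (1-D) clustering, not the assignment's required structure:
import numpy as np

def kmeans_1d(values, k, iters=100, seed=0):
    """Cluster a flat array of scalars into k groups."""
    rng = np.random.RandomState(seed)
    centers = rng.choice(values, size=k, replace=False)
    for _ in range(iters):
        # distances of every value to every center, shape (n, k)
        dist = np.abs(values[:, None] - centers[None, :])
        labels = dist.argmin(axis=1)
        # assumes no cluster goes empty on this data
        new_centers = np.array([values[labels == j].mean() for j in range(k)])
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return labels, centers

# labels.reshape(df.shape) would then play the role of the index frame
# that findClosest() builds above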
I'm trying to write some Bayesian probit code using data augmentation. I can get it to work if I loop over the rows of the output matrix, but I'd like to vectorize it and do it all in one shot (presumably that's faster).
import numpy as np
from numpy import random
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, truncnorm
##################################
### Create some simulated data ###
num_leg = 50
num_bills = 20
a = np.random.uniform(-1,1,num_bills).reshape(num_bills, 1)
b = np.random.uniform(-2,2,num_bills).reshape(num_bills, 1)
x = np.random.standard_normal(num_leg).reshape(num_leg, 1)
ystar_base = a + np.dot(b,x.T)
epsilon = np.random.standard_normal(num_leg * num_bills).reshape(num_bills, num_leg)
ystar = ystar_base + epsilon
y = 1*(ystar >0)
### Initialize some stuff I need ###
avec = [0]*num_bills # These are bill parameters
bvec = [0]*num_bills
betavec = [np.matrix(zip(avec,bvec))]
xvec = [0]*num_leg # these are legislator parameters
x_ones = np.ones(num_leg)
def init_y(mat):  # initialize a latent y matrix
    if mat == 1:
        return truncnorm.rvs(0, 10000)
    else:
        return truncnorm.rvs(-10000, 0)
vectorize_y = np.vectorize(init_y)
latent_y = np.matrix(vectorize_y(y))
burn = 500 # How long to run the MCMC
runs = 500
### define the functions ###
def sample_params(xnow, ynow):  # This is the function I'd like to vectorize
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(ynow) == list:
        ynow = np.array(ynow)
    ynow = ynow.T  # reshape(ynow.shape[0], 1)
    sigma = np.linalg.inv(np.dot(xnow.T, xnow))  ### This is the line that produces an error ###
    xy = np.dot(xnow.T, ynow)
    mu = np.dot(sigma, xy)  # this is just (x'x)inv x'y
    return np.random.multivariate_normal(np.array(mu).flatten(), sigma)
vecparams = np.vectorize(sample_params)
def get_mu(xnow, bnow):  # getting the updated mean to draw the latent ys
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(bnow) == list:
        bnow = np.array(bnow)
    mu = np.dot(xnow, bnow.T)
    mu = np.matrix(mu)
    return mu
def sample_y(mu, ynow):  # generate latent y matrix
    if ynow == 1:
        a, b = (0 - mu), (10000 - mu)
    else:
        a, b = (-10000 - mu), (0 - mu)
    return truncnorm.rvs(a, b)
vector_sample = np.vectorize(sample_y) # I'd like to be able to do something like this
### Here's the MCMC loop with the internal loop over rows(bills)
for i in range(burn + runs):
    this_beta = []
    this_x = []
    this_y = []
    for j in range(num_bills):  # I'd like to get rid of this loop
        ex = zip(x_ones, x)
        newbeta = sample_params(ex, latent_y[j])
        this_beta.append(newbeta)
    # ex = np.array(zip(x_ones, x))
    # this_beta = vecparams(ex, latent_y[:,])  # and call the vectorized function here
    betavec.append(this_beta)
    # Note, I can vectorize the latent outputs easily enough here
    mean = get_mu(ex, betavec[-1])
    latent_y = np.matrix(vector_sample(mean, np.matrix(y).T).T.reshape(latent_y.shape[0], latent_y.shape[1]))
### Now a bit of code to check to see if I've recovered what I want ###
test_beta = [zip(*(z)) for z in betavec[burn:]]
test_a = np.array([z[0] for z in test_beta])
test_b = np.array([z[1] for z in test_beta])
amean = test_a.sum(axis = 0)/float(runs)
bmean = test_b.sum(axis = 0)/float(runs)
print 'a mean'
print np.corrcoef([amean, np.array(a)])
print
print 'b mean'
print np.corrcoef([bmean, np.array(b)])
If I comment out the loop and use the commented-out lines just above it, I get the following error at the line I indicated earlier (the one that defines sigma):
LinAlgError: 0-dimensional array given. Array must be at least two-dimensional
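A note on the error, inferred from how np.vectorize works: it applies the wrapped function element by element (and makes a trial call with the first scalar elements to determine the output type), so inside sample_params the arguments arrive as 0-d scalars and np.linalg.inv is handed a 0-dimensional result. A minimal reproduction, separate from the post's code:
import numpy as np

def f(xnow, ynow):
    # under np.vectorize, xnow and ynow are 0-d scalars here,
    # so np.dot(xnow.T, xnow) is 0-dimensional
    return np.linalg.inv(np.dot(xnow.T, xnow))

vf = np.vectorize(f)
X = np.ones((5, 2))
Y = np.ones((5, 1))
# vf(X, Y)  # raises LinAlgError: 0-dimensional array given.
# Vectorizing row-wise needs np.vectorize's signature argument
# (available in newer numpy) or an explicit loop over rows.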