get correlation p-value with deep graph - python

I am using deepgraph in Python to compute correlation coefficients for large matrices. The output is a multi-index data frame:

          corr
s t
0 1  -0.006066
  2   0.094063
  3  -0.025529
  4   0.074080
  5   0.035490
  6   0.005221
  7   0.032064
I want to add a column with corresponding p-values.
The original code and input example come from https://deepgraph.readthedocs.io/en/latest/tutorials/pairwise_correlations.html. The code between the rows of hash marks is my approach to getting the p-values.
I want to merge the separate edge lists later on.
#!/bin/python

import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
import deepgraph as dg
from numpy.random import RandomState
from scipy.stats import pearsonr, spearmanr

prng = RandomState(0)
n_features = int(5e3)
n_samples = int(1e2)
X = prng.randint(100, size=(n_features, n_samples)).astype(np.float64)

# Spearman's correlation coefficients
X = X.argsort(axis=1).argsort(axis=1)

# whiten variables for fast parallel computation later on
X = (X - X.mean(axis=1, keepdims=True)) / X.std(axis=1, keepdims=True)

# save in binary format
np.save('samples', X)

# parameters (change these to control RAM usage)
step_size = 1e5
n_processes = 100

# load samples as memory-map
X = np.load('samples.npy', mmap_mode='r')

# create node table that stores references to the mem-mapped samples
v = pd.DataFrame({'index': range(X.shape[0])})

# connector function to compute pairwise pearson correlations
def corr(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    corr = np.einsum('ij,ij->i', features_s, features_t) / n_samples
    return corr

#################################
def p_Val(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    p = spearmanr(features_s, features_t)[1]
    return p
#################################

# index array for parallelization
pos_array = np.array(np.linspace(0, n_features*(n_features-1)//2, n_processes), dtype=int)

# parallel computation
def create_ei(i):
    from_pos = pos_array[i]
    to_pos = pos_array[i+1]

    # initiate DeepGraph
    g = dg.DeepGraph(v)
    # create edges
    g.create_edges(connectors=corr, step_size=step_size,
                   from_pos=from_pos, to_pos=to_pos)
    # store edge table
    g.e.to_pickle('tmp/correlations/{}_corr.pickle'.format(str(i).zfill(3)))

    #################################
    gp = dg.DeepGraph(v)
    # create edges
    gp.create_edges(connectors=p_Val, step_size=step_size,
                    from_pos=from_pos, to_pos=to_pos)
    # store edge table
    gp.e.to_pickle('tmp/correlations/{}_pval.pickle'.format(str(i).zfill(3)))
    #################################

# computation
if __name__ == '__main__':
    os.makedirs("tmp/correlations", exist_ok=True)
    indices = np.arange(0, n_processes - 1)
    p = Pool()
    for _ in p.imap_unordered(create_ei, indices):
        pass

    # print correlation values
    files = os.listdir('tmp/correlations/')
    files.sort()
    for f in files:
        et = pd.read_pickle('tmp/correlations/{}'.format(f))
        print(et)
I get the following error:
Traceback (most recent call last):
  File "/lib/python3.9/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "pairwise_corr.py", line 64, in create_ei
    gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
  File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 616, in create_edges
    self.e = _matrix_iterator(
  File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 4875, in _matrix_iterator
    ei = _select_and_return(vi, sources_k, targets_k, ft_feature,
  File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 5339, in _select_and_return
    ei = pd.DataFrame({col: data[col] for col in coldtypedic})
  File "/lib/python3.9/site-packages/pandas/core/frame.py", line 614, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
  File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 464, in dict_to_mgr
    return arrays_to_mgr(
  File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 124, in arrays_to_mgr
    arrays = _homogenize(arrays, index, dtype)
  File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 589, in _homogenize
    val = sanitize_array(
  File "/lib/python3.9/site-packages/pandas/core/construction.py", line 576, in sanitize_array
    subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
  File "/lib/python3.9/site-packages/pandas/core/construction.py", line 627, in _sanitize_ndim
    raise ValueError("Data must be 1-dimensional")
ValueError: Data must be 1-dimensional
Any suggestions?
Thanks!

I was able to solve it by computing the p-values row by row, so that the connector returns a 1-D array (spearmanr, given two 2-D blocks, returns a 2-D matrix of p-values, which is what triggered the ValueError):
def p_Val(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    p = [pearsonr(features_s[i, :], features_t[i, :])[1] for i in range(len(features_s))]
    p_val = np.asarray(p)
    return p_val
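A possible refinement (a sketch, not part of the original solution): because the rows of X are rank-transformed and whitened, the einsum in corr already yields Spearman's coefficient, and its two-sided p-value can be computed in a vectorized way via the t-approximation t = r*sqrt((n-2)/(1-r**2)) with n-2 degrees of freedom. A single connector can then return both columns, which also avoids having to merge two separate edge lists afterwards:

from scipy.stats import t as t_dist

def corr_p(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    # correlation of the ranked, whitened rows (i.e. Spearman's rho)
    corr = np.einsum('ij,ij->i', features_s, features_t) / n_samples
    # two-sided p-value from the t-approximation; the clip avoids
    # division by zero when |r| == 1
    tstat = corr * np.sqrt((n_samples - 2) / np.clip(1 - corr**2, 1e-12, None))
    p_val = 2 * t_dist.sf(np.abs(tstat), n_samples - 2)
    return corr, p_val

DeepGraph names edge-table columns after the connector's return variables, so create_edges(connectors=corr_p, ...) should produce a single edge table with both a corr and a p_val column.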

Related

IndexError: boolean index did not match indexed array along dimension 1; dimension is 21 but corresponding boolean dimension is 121

I'm trying to implement an optimization algorithm. The dataset shape is (522, 22). Here is the code (I didn't include the entire program):
file_ = r"D:\New folder (5)\BPSO-and-ANN-for-sofware-fault-predicition-master\kc2.csv"
df = pd.read_csv(file_)
y = df['defects'].values
X = df.drop('defects', axis=1).values

def evaluation(feature_possibilities):
    feature_possibilities = np.round(feature_possibilities)
    feature_possibilities = feature_possibilities > np.float32(0.5)
    selectedX = X[:, feature_possibilities]
    s = svm.SVC(kernel="poly", C=1)
    loocv = LeaveOneOut()
    evaluation = cross_val_score(s, selectedX, y, cv=loocv)
    return evaluation.mean()
But there is an error.
Traceback (most recent call last):
  File "C:\Users\sa\Desktop\Feature-Selection-with-Firefly-Algorithm-master\FF-SVM.py", line 142, in <module>
    Best = Algorithm.Run()
  File "C:\Users\sa\Desktop\Feature-Selection-with-Firefly-Algorithm-master\FF-SVM.py", line 102, in Run
    self.Fitness[i] = self.Fun(self.Fireflies[i])
  File "C:\Users\sa\Desktop\Feature-Selection-with-Firefly-Algorithm-master\FF-SVM.py", line 132, in evaluation
    selectedX = X[:, feature_possibilities]
IndexError: boolean index did not match indexed array along dimension 1; dimension is 21 but corresponding boolean dimension is 121
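For reference, here is a minimal sketch (not part of the original post) of the rule the traceback is enforcing: a boolean mask used to index an axis must have exactly one entry per element of that axis. Here X has 21 feature columns (22 minus the dropped 'defects' column), while the firefly position vector has 121 entries:

import numpy as np

X = np.zeros((522, 21))              # 522 samples, 21 feature columns
good_mask = np.ones(21, dtype=bool)  # one flag per column
X[:, good_mask]                      # works: mask length matches axis 1
bad_mask = np.ones(121, dtype=bool)  # 121 flags for 21 columns
# X[:, bad_mask]  # would raise the IndexError from the traceback

The likely fix is to initialize the fireflies with dimensionality X.shape[1] (21 here) rather than 121, so that evaluation() receives one possibility per feature.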

Librosa failing to plot mfcc generated

I was able to generate MFCCs from system-captured audio and plot them, but it stopped working after some refactoring and after configuring TensorFlow with CUDA. I use librosa to generate the MFCC, matplotlib.pyplot with librosa.display to plot it, and sounddevice to capture sound from Stereo Mix on Windows. The current configuration can create and plot MFCCs from sample .wav files, but with system-captured sound it fails to plot, because running MFCC generates a 3-D array instead of a 2-D one. Here is the code that generates and plots:
N_MFCC = 40
N_MELS = 40
N_FFT = 512
HOP_LENGTH = 160
MIN_FREQ = 0
MAX_FREQ = None

def create_mfcc(record, sample_rate):
    features = librosa.feature.mfcc(record, sample_rate, n_fft=N_FFT, n_mfcc=N_MFCC,
                                    n_mels=N_MELS, hop_length=HOP_LENGTH,
                                    fmin=MIN_FREQ, fmax=MAX_FREQ, htk=False)
    return features

def plot_and_save_mfcc(mfcc_data, file_name, sample_rate):
    plt.figure(figsize=(10, 8))
    plt.title('Current audio MFCC', fontsize=18)
    plt.xlabel('Time [s]', fontsize=18)
    librosa_display.specshow(mfcc_data, sr=sample_rate)
    plt.savefig(file_name)
    plt.cla()
This generates this stack trace:

Traceback (most recent call last):
  File "main.py", line 68, in <module>
    main()
  File "main.py", line 63, in main
    start_listening_and_creating_mfcc()
  File "main.py", line 48, in start_listening_and_creating_mfcc
    plot_and_save_mfcc(mfcc_data, conf.DEFAULT_MFCC_IMAGE_NAME.format(image_count), conf.SAMPLE_RATE)
  File "main.py", line 38, in plot_and_save_mfcc
    librosa_display.specshow(mfcc_data, sr=sample_rate)
  File "anaconda3\lib\site-packages\librosa\util\decorators.py", line 88, in inner_f
    return f(*args, **kwargs)
  File "anaconda3\lib\site-packages\librosa\display.py", line 879, in specshow
    out = axes.pcolormesh(x_coords, y_coords, data, **kwargs)
  File "anaconda3\lib\site-packages\matplotlib\__init__.py", line 1361, in inner
    return func(ax, *map(sanitize_sequence, args), **kwargs)
  File "anaconda3\lib\site-packages\matplotlib\axes\_axes.py", line 6183, in pcolormesh
    X, Y, C, shading = self._pcolorargs('pcolormesh', *args,
  File "anaconda3\lib\site-packages\matplotlib\axes\_axes.py", line 5671, in _pcolorargs
    nrows, ncols = C.shape
ValueError: too many values to unpack (expected 2)
I tried debugging it and changing the MFCC configuration, but with no success. I also tried reconfiguring my environment, but that didn't help either.
EDIT: Here are the MFCC shapes. For the system audio:
(48000, 40, 1)
And for the .wav sample files:
(40, 122)
As mentioned, I left a function out of the question; here it is, together with the function that is used to load and create the MFCC for the .wav files:
def create_mfcc_from_file(file_path):
    (signal, sample_rate) = librosa.load(file_path)
    librosa_features = create_mfcc(signal, sample_rate)
    plot_and_save_mfcc(librosa_features, 'mfcc-librosa', sample_rate)

def start_listening_and_creating_mfcc():
    image_count = 0
    while True:
        my_recording = record_window()
        mfcc_data = create_mfcc(my_recording, conf.SAMPLE_RATE)
        plot_and_save_mfcc(mfcc_data, conf.DEFAULT_MFCC_IMAGE_NAME.format(image_count), conf.SAMPLE_RATE)
        wav.write(conf.DEFAULT_MFCC_IMAGE_NAME.format(image_count) + '.wav', conf.SAMPLE_RATE, my_recording)
        image_count += 1

def delta(feat, N):
    """Compute delta features from a feature vector sequence.

    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
    :param N: For each frame, calculate delta features based on preceding and following N frames
    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
    """
    if N < 1:
        raise ValueError('N must be an integer >0')
    NUMFRAMES = len(feat)
    denominator = 2 * sum([i**2 for i in range(1, N+1)])
    delta_feat = numpy.empty_like(feat)
    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')  # padded version of feat
    for t in range(NUMFRAMES):
        # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
        delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator
    return delta_feat
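A hedged guess at the cause (not confirmed in the post): sounddevice-style recordings have shape (n_samples, n_channels), so my_recording is presumably (48000, 1) rather than 1-D. librosa.feature.mfcc then treats the leading axis as 48000 one-sample channels and returns the 3-D (48000, 40, 1) array, whereas librosa.load returns a mono 1-D signal and yields the expected (40, 122). If that is what is happening, flattening the recording to 1-D before computing the MFCC should restore a 2-D result:

def start_listening_and_creating_mfcc():
    image_count = 0
    while True:
        my_recording = record_window()
        # recordings are (n_samples, n_channels); make the signal mono and 1-D
        my_recording = my_recording.flatten()
        mfcc_data = create_mfcc(my_recording, conf.SAMPLE_RATE)
        plot_and_save_mfcc(mfcc_data, conf.DEFAULT_MFCC_IMAGE_NAME.format(image_count), conf.SAMPLE_RATE)
        wav.write(conf.DEFAULT_MFCC_IMAGE_NAME.format(image_count) + '.wav', conf.SAMPLE_RATE, my_recording)
        image_count += 1

(For genuinely multi-channel input, selecting one channel with my_recording[:, 0] or averaging the channels would be the safer equivalent.)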

Issue using mpcalc.advection( ) to calculate advection of a scalar field

I'm attempting to follow this training example to calculate QG omega on NCEP/NCAR data, but I'm getting hung up on mpcalc.advection().
It appears as if my dx and dy variables have a different shape, but I'm directly following the routine in an online example that supposedly works.
import numpy as np
import xarray as xr
import metpy.calc as mc
import metpy.constants as mpconstants
from metpy.units import units

# CONSTANTS
# ---------
sigma = 2.0e-6 * units('m^2 Pa^-2 s^-2')
f0 = 1e-4 * units('s^-1')
Rd = mpconstants.Rd

path = './'
uf = 'uwnd.2018.nc'
vf = 'vwnd.2018.nc'
af = 'air.2018.nc'

ads = xr.open_dataset(path+af).metpy.parse_cf()
uds = xr.open_dataset(path+uf).metpy.parse_cf()
vds = xr.open_dataset(path+vf).metpy.parse_cf()

a700 = ads['air'].metpy.sel(
    level=700 * units.hPa,
    time='2018-01-04T12')
u700 = uds['uwnd'].metpy.sel(
    level=700 * units.hPa,
    time='2018-01-04T12')
v700 = vds['vwnd'].metpy.sel(
    level=700 * units.hPa,
    time='2018-01-04T12')

lats = ads['lat'].metpy.unit_array
lons = ads['lon'].metpy.unit_array
X, Y = np.meshgrid(lons, lats)
dx, dy = mc.lat_lon_grid_deltas(lons, lats)

avort = mc.absolute_vorticity(u700, v700,
                              dx, dy, lats[:, None])

print('Array shape:', avort.shape)
print('DX shape:', dx.shape)
print('DY shape:', dy.shape)
print('U700 shape:', u700.shape)
print('V700 shape:', v700.shape)

vortadv = mc.advection(avort, (u700, v700), (dx, dy)).to_base_units()
Here's the error message; it also looks like I may have a unit issue?
Found lat/lon values, assuming latitude_longitude for projection grid_mapping variable
Found lat/lon values, assuming latitude_longitude for projection grid_mapping variable
Found lat/lon values, assuming latitude_longitude for projection grid_mapping variable
/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/metpy/calc/basic.py:1033: UserWarning: Input over 1.5707963267948966 radians. Ensure proper units are given.
  'Ensure proper units are given.'.format(max_radians))
/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/pint/quantity.py:888: RuntimeWarning: invalid value encountered in true_divide
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)
Array shape: (73, 144)
DX shape: (73, 143)
DY shape: (72, 144)
U700 shape: (73, 144)
V700 shape: (73, 144)
Traceback (most recent call last):
  File "metpy.decomp.py", line 56, in <module>
    vortadv = mc.advection(avort, (u700,v700), (dx,dy)).to_base_units()
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/metpy/xarray.py", line 570, in wrapper
    return func(*args, **kwargs)
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/metpy/calc/kinematics.py", line 61, in wrapper
    ret = func(*args, **kwargs)
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/metpy/calc/kinematics.py", line 320, in advection
    wind = _stack(wind)
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/metpy/calc/kinematics.py", line 24, in _stack
    return concatenate([a[np.newaxis] if iterable(a) else a for a in arrs], axis=0)
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/metpy/calc/kinematics.py", line 24, in <listcomp>
    return concatenate([a[np.newaxis] if iterable(a) else a for a in arrs], axis=0)
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/dataarray.py", line 642, in __getitem__
    return self.isel(indexers=self._item_key_to_dict(key))
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/dataarray.py", line 1040, in isel
    indexers, drop=drop, missing_dims=missing_dims
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/dataset.py", line 2014, in _isel_fancy
    name, var, self.indexes[name], var_indexers
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/indexes.py", line 106, in isel_variable_and_index
    new_variable = variable.isel(indexers)
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/variable.py", line 1118, in isel
    return self[key]
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/variable.py", line 766, in __getitem__
    dims, indexer, new_order = self._broadcast_indexes(key)
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/variable.py", line 612, in _broadcast_indexes
    return self._broadcast_indexes_outer(key)
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/variable.py", line 688, in _broadcast_indexes_outer
    return dims, OuterIndexer(tuple(new_key)), None
  File "/work1/jsa/anconda3/envs/earth/lib/python3.7/site-packages/xarray/core/indexing.py", line 410, in __init__
    f"invalid indexer array, does not have integer dtype: {k!r}"
TypeError: invalid indexer array, does not have integer dtype: array(None, dtype=object)
Thanks in advance for any help!
So the problem is that in MetPy 1.0 the signature for advection changed, to be easier to use: advection(scalar, u, v). Also, since you're working with xarray data, MetPy 1.0 can handle all of the coordinate stuff for you; you don't even need to deal with dx and dy manually:
subset = dict(level=700 * units.hPa, time='2018-01-04T12')
a700 = ads['air'].metpy.sel(**subset)
u700 = uds['uwnd'].metpy.sel(**subset)
v700 = vds['vwnd'].metpy.sel(**subset)
avort = mc.absolute_vorticity(u700, v700)
vortadv = mc.advection(avort, u700, v700).to_base_units()
We need to update that training example, but right now I'd recommend looking at the gallery examples 500 hPa Geopotential Heights, Absolute Vorticity, and Winds and 500 hPa Vorticity Advection.

python fmin_slsqp - error with constraints

I am practicing with SciPy and I encountered an error when trying to use fmin_slsqp. I set up a problem in which I want to maximize an objective function, U, given a set of constraints.
I have two control variables, x[0,t] and x[1,t] and, as you can see, they are indexed by t (time periods). The objective function is:
def obj_fct(x, alpha, beta, Al):
    U = 0
    x[1,0] = x0
    for t in trange:
        U = U - beta**t * ((Al[t]*L)**(1-alpha) * x[1,t]**alpha - x[0,t])
    return U
The constraints are defined over these two variables and one of them links the variables from one period (t) to another (t-1).
def constr(x, alpha, beta, Al):
    return np.array([
        x[0,t],
        x[1,0] - x0,
        x[1,t] - x[0,t] - (1-delta)*x[1,t-1]
    ])
Finally, here is the use of fmin_slsqp:
sol = fmin_slsqp(obj_fct, x_init, f_eqcons=constr, args=(alpha,beta,Al))
Leaving aside the fact that there are better ways to solve such dynamic problems, my question is about the syntax. When running this simple code, I get the following error:
Traceback (most recent call last):
  File "xxx", line 34, in <module>
    sol = fmin_slsqp(obj_fct, x_init, f_eqcons=constr, args=(alpha,beta,Al))
  File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 207, in fmin_slsqp
    constraints=cons, **opts)
  File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 311, in _minimize_slsqp
    meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args'])) for c in cons['eq']]))
  File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 311, in <listcomp>
    meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args'])) for c in cons['eq']]))
  File "xxx", line 30, in constr
    x[0,t],
IndexError: too many indices for array
[Finished in 0.3s with exit code 1]
What am I doing wrong?
The initial part of the code, assigning values to the parameters, is:
from scipy.optimize import fmin_slsqp
import numpy as np
T = 30
beta = 0.96
L = 1
x0 = 1
gl = 0.02
alpha = 0.3
delta = 0.05
x_init = np.array([1,0.1])
A_l0 = 1000
Al = np.zeros((T+1,1))
Al[1] = A_l0
trange = np.arange(1,T+1,1, dtype='Int8') # does not include period zero
for t in trange: Al[t] = A_l0*(1 + gl)**(t-1)
The array x passed to your objective and constraint functions will be a one-dimensional array (just like your x_init is). You can't index a one-dimensional array with two indices, so expressions such as x[1,0] and x[0,t] will generate an error.
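One common way around this (a sketch, assuming two control variables over T+1 periods) is to optimize over a flat vector of length 2*(T+1) and reshape it inside the objective and constraint functions:

def obj_fct(x_flat, alpha, beta, Al):
    # row 0 holds x[0,t], row 1 holds x[1,t]; copy so the optimizer's own
    # vector is not mutated by the x[1,0] = x0 assignment below
    x = x_flat.reshape(2, T+1).copy()
    U = 0
    x[1,0] = x0
    for t in trange:
        U = U - beta**t * ((Al[t]*L)**(1-alpha) * x[1,t]**alpha - x[0,t])
    return U

x_init = np.ones(2*(T+1))  # one flat entry per control variable and period

The constraint function would reshape x_flat the same way and build one constraint value per period by looping over trange, rather than relying on the leftover t from the module-level loop as the current constr does.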

How to make bins and histograms

I need to make 200 evenly spaced bins and have my data sorted into them, so that I can make a histogram out of the data. Can someone help me write a script that creates 200 bins and sorts the data into them?
This is my current code:
#!/usr/bin/python

import operator
import matplotlib.pyplot as plt
import numpy as np

l = []
with open("testdata") as f:
    line = f.next()
    f.next()  # skip headers
    nat = int(line.split()[0])
    print nat
    for line in f:
        if line.strip():
            if line.strip():
                l.append(map(float, line.split()[1:]))

b = 0
a = 1
for b in range(53):
    for a in range(b+1, 54):
        vector1 = (l[b][0], l[b][1], l[b][2])
        vector2 = (l[a][0], l[a][1], l[a][2])
        x = vector1
        y = vector2
        vector3 = list(np.array(x) - np.array(y))
        dotProduct = reduce(operator.add, map(operator.mul, vector3, vector3))
        dp = dotProduct**.5
        print dp

#data = dp
#num_bins = 200 # <- number of bins for the histogram
#plt.hist(data, num_bins)
#plt.show()
Errors:

/usr/lib64/python2.6/site-packages/matplotlib/backends/backend_gtk.py:621: DeprecationWarning: Use the new widget gtk.Tooltip
  self.tooltips = gtk.Tooltips()
Traceback (most recent call last):
  File "vector_final", line 42, in <module>
    plt.hist(data, num_bins)
  File "/usr/lib64/python2.6/site-packages/matplotlib/pyplot.py", line 2008, in hist
    ret = ax.hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, **kwargs)
  File "/usr/lib64/python2.6/site-packages/matplotlib/axes.py", line 7098, in hist
    w = [None]*len(x)
TypeError: len() of unsized object
You are pretty close. The only thing you are missing is storing your data and passing it to the histogram function correctly.
#!/usr/bin/python

import operator
import matplotlib.pyplot as plt
import numpy as np

l = []
with open("testdata") as f:
    line = f.next()
    f.next()  # skip headers
    nat = int(line.split()[0])
    print nat
    for line in f:
        # store stripped line and only store if there is data on the line
        cleaned = line.strip()
        if cleaned:
            # convert to float and remove characters in first index
            l.append(map(float, cleaned.split()[1:]))

b = 0
a = 1
# create a list to store our calculations in
distances = []
num_vects = len(l)
for b in range(num_vects-1):
    for a in range(b+1, num_vects):
        vector1 = (l[b][0], l[b][1], l[b][2])
        vector2 = (l[a][0], l[a][1], l[a][2])
        x = vector1
        y = vector2
        vector3 = list(np.array(x) - np.array(y))
        dotProduct = reduce(operator.add, map(operator.mul, vector3, vector3))
        dp = dotProduct**.5
        # store individual data point into the list of calculated distances
        distances.append(dp)

# plot histogram
num_bins = 200  # <- number of bins for the histogram
# store useful data returned by the histogram function
(n, bins, patches) = plt.hist(distances, num_bins)
plt.show()
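As a side note (not part of the original answer), the nested distance loop can be replaced with scipy.spatial.distance.pdist, which returns the same condensed list of pairwise Euclidean distances, ready to be histogrammed:

import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist

# l is the list of [x, y, z, ...] rows parsed above; keep the first three columns
points = np.array([row[:3] for row in l])
distances = pdist(points)  # all pairwise Euclidean distances, one per pair
plt.hist(distances, 200)   # 200 evenly spaced bins
plt.show()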
