Generate synthetic time series data from existing sample data

Generate synthetic time series data from existing sample data - python

Are there any good library/tools in python for generating synthetic time series data from existing sample data? For example I have sales data from January-June and would like to generate synthetic time series data samples from July-December )(keeping time series factors intact, like trend, seasonality, etc).

Leaving the question about quality of such data aside, here is a simple approach you can use Gaussian distribution to generate synthetic data based-off a sample. Below is the critical part.
import numpy as np
x # original sample np.array of features
feature_means = np.mean(x, axis=1)
feature_std = np.std(x, axis=1)
random_normal_feature_values = np.random.normal(feature_means, feature_std)
Here is a fully functioning code I used,
def generate_synthetic_data(sample_dataset, window_mean, window_std, fixed_window=None, variance_range =1 , sythesize_ratio = 2, forced_reverse = False):
synthetic_data = pd.DataFrame(columns=sample_dataset.columns)
synthetic_data.insert(len(sample_dataset.columns), "synthesis_seq", [], True)
for k in range(sythesize_ratio):
if len(synthetic_data) >= len(sample_dataset) * sythesize_ratio:
break;
#this loop generates a set that resembles the entire dataset
country_synthetic = pd.DataFrame(columns=synthetic_data.columns)
if fixed_window != None:
input_sequence_len = fixed_window
else:
input_sequence_len = int(np.random.normal(window_mean, window_std))
#population data change
country_data_i = sample_dataset
if len(country_data_i) < input_sequence_len :
continue
feature_length = configuration['feature_length'] #number of features to be randomized
country_data_array = country_data_i.to_numpy()
country_data_array = country_data_array.T[:feature_length]
country_data_array = country_data_array.reshape(feature_length,len(country_data_i))
x = country_data_array[:feature_length].T
reversed = np.random.normal(0,1)>0
if reversed:
x = x[::-1]
sets =0
x_list = []
dict_x = dict()
for i in range(input_sequence_len):
array_len = ((len(x) -i) - ((len(x)-i)%input_sequence_len))+i
if array_len <= 0:
continue
sets = int( array_len/ input_sequence_len)
if sets <= 0:
continue
x_temp = x[i:array_len].T.reshape(sets,feature_length,input_sequence_len)
uniq_keys = np.array([i+(input_sequence_len*k) for k in range(sets)])
x_temp = x_temp.reshape(feature_length,sets,input_sequence_len)
arrays_split = np.hsplit(x_temp,sets)
dict_x.update(dict(zip(uniq_keys, arrays_split)))
temp_x_list = [dict_x[i].T for i in sorted(dict_x.keys())]
temp_x_list = np.array(temp_x_list).squeeze()
feature_means = np.mean(temp_x_list, axis=1)
feature_std = np.std(temp_x_list, axis=1) /variance_range
random_normal_feature_values = np.random.normal(feature_means, feature_std).T
random_normal_feature_values = np.round(random_normal_feature_values,0)
random_normal_feature_values[random_normal_feature_values < 0] = 0
if reversed:
random_normal_feature_values = random_normal_feature_values.T[::-1]
random_normal_feature_values = random_normal_feature_values.T
for i in range(len(random_normal_feature_values)):
country_synthetic[country_synthetic.columns[i]] = random_normal_feature_values[i]
country_synthetic['synthesis_seq'] = k
synthetic_data = synthetic_data.append(country_synthetic, ignore_index=True)
return synthetic_data
for i in range(1):
directory_name = '/synthetic_'+str(i)
mypath = source_path+ '/cleaned'+directory_name
if os.path.exists(mypath) == False:
os.mkdir(mypath)
data = generate_synthetic_data(original_data, window_mean = 0, window_std= 0, fixed_window=2 ,variance_range = 10**i, sythesize_ratio = 1)
synthetic_data.append(data)
#data.to_csv(mypath+'/synthetic_'+str(i)+'_dt31_05_.csv', index=False )
print('synth step : ', i, ' len : ', len(synthetic_data))
Good luck!

Related

New classification method in machine learning python code optimization

I've new method of classification. I've developed it in python. But code is very slow.
Please, help me to optimize code below.
from joblib import Parallel, delayed
from datetime import datetime
from sklearn.metrics.pairwise import euclidean_distances
import copy
import pandas as pd
import numpy as np
This function for finding hyperspheres in space
def find_sphere(ind, df, dist_sq, sidx):
objectCount = df.shape[0]
sphere = dict()
sphere['index'] = ind
sphere['relatives'] = []
sphere['class'] = df.at[ind, "Class"]
indexes = sidx[:, ind]
sphere['distances'] = dist_sq[ind, :]
for i in range(objectCount):
if df.at[ind, "Class"] != df.at[indexes[i], "Class"]:
sphere['relatives'] = indexes[:i]
sphere['radius'] = sphere['distances'][indexes[i]]
break
sphere['enemies'] = np.where(
sphere['distances'] == sphere['radius'])[0]
sphere['coverages'] = set()
for enemy in sphere['enemies']:
min_dist = dist_sq[enemy, sphere['relatives']].min()
sphere['coverages'].update(
sphere['relatives'][np.where(dist_sq[enemy, sphere['relatives']] == min_dist)[0]])
sphere['relatives'] = set(sphere['relatives'])
return sphere
This function is clustering objects in space
def find_groups(spheres):
coverages = {x for d in spheres for x in d['coverages']}
all_coverages = {x for d in spheres for x in d['coverages']}
notSeenObjects = copy.deepcopy(spheres)
groups = list()
while (len(coverages) > 0):
# print(f'\t getting one of {len(coverages)} coverage(s)...')
obj = coverages.pop()
group = {d['index'] for d in notSeenObjects if obj in d['relatives']}
if len(group) == 0:
continue
linkerCoverage = set()
while True:
linkerCoverage = linkerCoverage | {r for s in notSeenObjects if s['index'] in group
for r in s['relatives'] if r in all_coverages}
notSeenObjects = [
s for s in notSeenObjects if not s['index'] in group]
newObjects = {s['index'] for s in notSeenObjects if len(
s['relatives'].intersection(linkerCoverage)) > 0}
if len(newObjects) == 0:
break
group = group | newObjects
coverages = coverages - linkerCoverage
groups.append(group)
return groups
This function finding all standart objects. Means, new objects can be classified with this (standart) objects instead of seeing of all objects
def find_standart_objects(spheres, dists, groups):
standartObjects = [(s['index'], s['radius'], s['class']) for s in spheres]
for group in sorted(groups, key=len, reverse=True):
candidates = [s for s in standartObjects if s[0] in group]
for candidate in sorted(candidates, key=lambda x: x[1]):
isRightRecognition = True
for obj in spheres:
result = sorted([(dists[obj['index'], s[0]] / s[1], obj['class'], s[2])
for s in standartObjects if s[0] != candidate[0]], key=lambda x: x[0])
mindist = result[0][0]
for r in result:
if r[0] != mindist:
break
isRightRecognition = isRightRecognition and r[1] == r[2]
if not isRightRecognition:
print(f'Standart found {candidate[0]}')
break
if isRightRecognition:
standartObjects = [
s for s in standartObjects if s[0] != candidate[0]]
return standartObjects
This part is using that functions
df = pd.read_csv("Dry_Bean.txt", sep='\t') #https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip
objectCount = df.shape[0]
dist_sq = euclidean_distances(df.iloc[:, :-1]).round(15)
sidx = np.argsort(dist_sq, axis=0)
spheres = Parallel(n_jobs=6)(delayed(find_sphere)
(ind, df, dist_sq, sidx) for ind in range(0, objectCount))
groups = find_groups(spheres)
standartObjects = find_standart_objects(spheres, dist_sq, groups)
Bottle neck parts are find_standart_objects and find_sphere functions.

How to find Average directional movement for stocks using Pandas?

I have a dataframe of OHLCV data. I would like to know if anyone knows any tutorial or any way of finding ADX(Average directional movement ) using pandas?
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import datetime as dt
import numpy as nm
start=dt.datetime.today()-dt.timedelta(59)
end=dt.datetime.today()
df=pd.DataFrame(yf.download("MSFT", start=start, end=end))
The average directional index, or ADX, is the primary technical indicator among the five indicators that make up a technical trading system developed by J. Welles Wilder, Jr. and is calculated using the other indicators that make up the trading system. The ADX is primarily used as an indicator of momentum, or trend strength, but the total ADX system is also used as a directional indicator.
Directional movement is calculated by comparing the difference between two consecutive lows with the difference between their respective highs.
For the excel calculation of ADX this is a really good video:
https://www.youtube.com/watch?v=LKDJQLrXedg&t=387s

I was playing with this a little bit and found something that can help you with the issue:
def ADX(data: pd.DataFrame, period: int):
"""
Computes the ADX indicator.
"""
df = data.copy()
alpha = 1/period
# TR
df['H-L'] = df['High'] - df['Low']
df['H-C'] = np.abs(df['High'] - df['Close'].shift(1))
df['L-C'] = np.abs(df['Low'] - df['Close'].shift(1))
df['TR'] = df[['H-L', 'H-C', 'L-C']].max(axis=1)
del df['H-L'], df['H-C'], df['L-C']
# ATR
df['ATR'] = df['TR'].ewm(alpha=alpha, adjust=False).mean()
# +-DX
df['H-pH'] = df['High'] - df['High'].shift(1)
df['pL-L'] = df['Low'].shift(1) - df['Low']
df['+DX'] = np.where(
(df['H-pH'] > df['pL-L']) & (df['H-pH']>0),
df['H-pH'],
0.0
)
df['-DX'] = np.where(
(df['H-pH'] < df['pL-L']) & (df['pL-L']>0),
df['pL-L'],
0.0
)
del df['H-pH'], df['pL-L']
# +- DMI
df['S+DM'] = df['+DX'].ewm(alpha=alpha, adjust=False).mean()
df['S-DM'] = df['-DX'].ewm(alpha=alpha, adjust=False).mean()
df['+DMI'] = (df['S+DM']/df['ATR'])*100
df['-DMI'] = (df['S-DM']/df['ATR'])*100
del df['S+DM'], df['S-DM']
# ADX
df['DX'] = (np.abs(df['+DMI'] - df['-DMI'])/(df['+DMI'] + df['-DMI']))*100
df['ADX'] = df['DX'].ewm(alpha=alpha, adjust=False).mean()
del df['DX'], df['ATR'], df['TR'], df['-DX'], df['+DX'], df['+DMI'], df['-DMI']
return df
At the beginning the values aren't correct (as always with the EWM approach) but after several computations it converges to the correct value.

Math was taken from here.
def ADX(df):
def getCDM(df):
dmpos = df["High"][-1] - df["High"][-2]
dmneg = df["Low"][-2] - df["Low"][-1]
if dmpos > dmneg:
return dmpos
else:
return dmneg
def getDMnTR(df):
DMpos = []
DMneg = []
TRarr = []
n = round(len(df)/14)
idx = n
while n <= (len(df)):
dmpos = df["High"][n-1] - df["High"][n-2]
dmneg = df["Low"][n-2] - df["Low"][n-1]
DMpos.append(dmpos)
DMneg.append(dmneg)
a1 = df["High"][n-1] - df["High"][n-2]
a2 = df["High"][n-1] - df["Close"][n-2]
a3 = df["Low"][n-1] - df["Close"][n-2]
TRarr.append(max(a1,a2,a3))
n = idx + n
return DMpos, DMneg, TRarr
def getDI(df):
DMpos, DMneg, TR = getDMnTR(df)
CDM = getCDM(df)
POSsmooth = (sum(DMpos) - sum(DMpos)/len(DMpos) + CDM)
NEGsmooth = (sum(DMneg) - sum(DMneg)/len(DMneg) + CDM)
DIpos = (POSsmooth / (sum(TR)/len(TR))) *100
DIneg = (NEGsmooth / (sum(TR)/len(TR))) *100
return DIpos, DIneg
def getADX(df):
DIpos, DIneg = getDI(df)
dx = (abs(DIpos- DIneg) / abs(DIpos + DIneg)) * 100
ADX = dx/14
return ADX
return(getADX(df))
print(ADX(df))

This gives you the exact numbers as Tradingview and Thinkorswim.
import numpy as np
def ema(arr, periods=14, weight=1, init=None):
leading_na = np.where(~np.isnan(arr))[0][0]
arr = arr[leading_na:]
alpha = weight / (periods + (weight-1))
alpha_rev = 1 - alpha
n = arr.shape[0]
pows = alpha_rev**(np.arange(n+1))
out1 = np.array([])
if 0 in pows:
out1 = ema(arr[:int(len(arr)/2)], periods)
arr = arr[int(len(arr)/2) - 1:]
init = out1[-1]
n = arr.shape[0]
pows = alpha_rev**(np.arange(n+1))
scale_arr = 1/pows[:-1]
if init:
offset = init * pows[1:]
else:
offset = arr[0]*pows[1:]
pw0 = alpha*alpha_rev**(n-1)
mult = arr*pw0*scale_arr
cumsums = mult.cumsum()
out = offset + cumsums*scale_arr[::-1]
out = out[1:] if len(out1) > 0 else out
out = np.concatenate([out1, out])
out[:periods] = np.nan
out = np.concatenate(([np.nan]*leading_na, out))
return out
def atr(highs, lows, closes, periods=14, ema_weight=1):
hi = np.array(highs)
lo = np.array(lows)
c = np.array(closes)
tr = np.vstack([np.abs(hi[1:]-c[:-1]),
np.abs(lo[1:]-c[:-1]),
(hi-lo)[1:]]).max(axis=0)
atr = ema(tr, periods=periods, weight=ema_weight)
atr = np.concatenate([[np.nan], atr])
return atr
def adx(highs, lows, closes, periods=14):
highs = np.array(highs)
lows = np.array(lows)
closes = np.array(closes)
up = highs[1:] - highs[:-1]
down = lows[:-1] - lows[1:]
up_idx = up > down
down_idx = down > up
updm = np.zeros(len(up))
updm[up_idx] = up[up_idx]
updm[updm < 0] = 0
downdm = np.zeros(len(down))
downdm[down_idx] = down[down_idx]
downdm[downdm < 0] = 0
_atr = atr(highs, lows, closes, periods)[1:]
updi = 100 * ema(updm, periods) / _atr
downdi = 100 * ema(downdm, periods) / _atr
zeros = (updi + downdi == 0)
downdi[zeros] = .0000001
adx = 100 * np.abs(updi - downdi) / (updi + downdi)
adx = ema(np.concatenate([[np.nan], adx]), periods)
return adx

Splittig data in python dataframe and getting the array values automatically

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('D:\ history/segment.csv')
data = pd.DataFrame(data)
data = data.sort_values(['Prob_score'], ascending=[False])
one = len(data)
actualpaid_overall = len(data.loc[data['paidstatus'] == 1])
data_split = np.array_split(data, 10)
data1 = data_split[0]
actualpaid_ten = len(data1.loc[data1['paidstatus'] == 1])
percent_ten = actualpaid_ten/actualpaid_overall
data2 = data_split[1]
actualpaid_twenty = len(data2.loc[data2['paidstatus'] == 1])
percent_twenty = (actualpaid_twenty/actualpaid_overall) + percent_ten
data3 = data_split[2]
actualpaid_thirty = len(data3.loc[data3['paidstatus'] == 1])
percent_thirty = (actualpaid_thirty/actualpaid_overall) + percent_twenty
data4 = data_split[3]
actualpaid_forty = len(data4.loc[data4['paidstatus'] == 1])
percent_forty = (actualpaid_forty/actualpaid_overall) + percent_thirty
data5 = data_split[4]
actualpaid_fifty = len(data5.loc[data5['paidstatus'] == 1])
percent_fifty = (actualpaid_fifty/actualpaid_overall) + percent_forty
data6 = data_split[5]
actualpaid_sixty = len(data6.loc[data6['paidstatus'] == 1])
percent_sixty = (actualpaid_sixty/actualpaid_overall) + percent_fifty
data7 = data_split[6]
actualpaid_seventy = len(data7.loc[data7['paidstatus'] == 1])
percent_seventy = (actualpaid_seventy/actualpaid_overall) + percent_sixty
data8 = data_split[7]
actualpaid_eighty = len(data8.loc[data8['paidstatus'] == 1])
percent_eighty = (actualpaid_eighty/actualpaid_overall) + percent_seventy
data9 = data_split[8]
actualpaid_ninenty = len(data9.loc[data9['paidstatus'] == 1])
percent_ninenty = (actualpaid_ninenty/actualpaid_overall) + percent_eighty
data10 = data_split[9]
actualpaid_hundred = len(data10.loc[data10['paidstatus'] == 1])
percent_hundred = (actualpaid_hundred/actualpaid_overall) + percent_ninenty
array_x = [10,20,30,40,50,60,70,80,90,100]
array_y = [ percent_ten, percent_twenty, percent_thirty, percent_forty,percent_fifty, percent_sixty, percent_seventy, percent_eighty, percent_ninenty, percent_hundred]
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
The above is my python code i have splitted my dataframe into 10 equal sections and plotted the graph but I'm not satisfied with this i have two concerns:
array_x = [10,20,30,40,50,60,70,80,90,100] in this line of code i have manually taken the x values, is there any possible way to process automatically as i have taken split(data,10) it should show 10 array values
As we can see the whole data1,2,3,4...10 is being repeated again and again is there a solution to write this in a function or loop.
Any help with codes will be appreciated. Thanks

I believe you need list comprehension and for count is possible use simplier way - sum of boolean mask, True values are processes like 1, then convert list to numpy array and use numpy.cumsum:
data = pd.read_csv('D:\ history/segment.csv')
data = data.sort_values('Prob_score', ascending=False)
one = len(data)
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(x) for x in data_split]
y = [(x['paidstatus'] == 1).sum()/actualpaid_overall for x in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
Sample:
np.random.seed(2019)
N = 1000
data = pd.DataFrame({'paidstatus':np.random.randint(3, size=N),
'Prob_score':np.random.randint(100, size=N)})
#print (data)
data = data.sort_values(['Prob_score'], ascending=[False])
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(x) for x in data_split]
y = [(x['paidstatus'] == 1).sum()/actualpaid_overall for x in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
print (array_x)
[ 100 200 300 400 500 600 700 800 900 1000]
print (array_y)
[0.09118541 0.18844985 0.27963526 0.38601824 0.49848024 0.61702128
0.72036474 0.81155015 0.9331307 1. ]

FCBF Python feature selection technique

I want to use FCBF technique from github https://github.com/shiralkarprashant/FCBF
The problem i faced is that i am working on Python 3 ad the module is implemented for python 2 users . I got the following error that describes that name 'xrange' is not defined because i work on python3
i think to solve the issue just by changing range by xrange
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from sklearn.grid_search import GridSearchCV
classifiers = [('DecisionTree', DecisionTreeClassifier(), {'max_depth' : [5, 10, 15]}),
('LogisticRegression', LogisticRegression(), {'C' : [0.1, 1, 10]})]
n_features = dataCAD.shape[1]
npieces = get_i(n_features)
The module code contains just one xrange occurence i tried to change it by range but it does not solve the problem:
# -*- coding: utf-8 -*-
import numpy as np
def count_vals(x):
vals = np.unique(x)
occ = np.zeros(shape = vals.shape)
for i in range(vals.size):
occ[i] = np.sum(x == vals[i])
return occ
def entropy(x):
n = float(x.shape[0])
ocurrence = count_vals(x)
px = ocurrence / n
return -1* np.sum(px*np.log2(px))
def symmetricalUncertain(x,y):
n = float(y.shape[0])
vals = np.unique(y)
# Computing Entropy for the feature x.
Hx = entropy(x)
# Computing Entropy for the feature y.
Hy = entropy(y)
#Computing Joint entropy between x and y.
partial = np.zeros(shape = (vals.shape[0]))
for i in range(vals.shape[0]):
partial[i] = entropy(x[y == vals[i]])
partial[np.isnan(partial)==1] = 0
py = count_vals(y).astype(dtype = 'float64') / n
Hxy = np.sum(py[py > 0]*partial)
IG = Hx-Hxy
return 2*IG/(Hx+Hy)
def suGroup(x, n):
m = x.shape[0]
x = np.reshape(x, (n,m/n)).T
m = x.shape[1]
SU_matrix = np.zeros(shape = (m,m))
for j in range(m-1):
x2 = x[:,j+1::]
y = x[:,j]
temp = np.apply_along_axis(symmetricalUncertain, 0, x2, y)
for k in range(temp.shape[0]):
SU_matrix[j,j+1::] = temp
SU_matrix[j+1::,j] = temp
return 1/float(m-1)*np.sum(SU_matrix, axis = 1)
def isprime(a):
return all(a % i for i in xrange(2, a))
"""
get
"""
def get_i(a):
if isprime(a):
a -= 1
return filter(lambda x: a % x == 0, range(2,a))
"""
FCBF - Fast Correlation Based Filter
L. Yu and H. Liu. Feature Selection for High‐Dimensional Data: A Fast Correlation‐Based Filter Solution.
In Proceedings of The Twentieth International Conference on Machine Leaning (ICML‐03), 856‐863.
Washington, D.C., August 21‐24, 2003.
"""
class FCBF:
idx_sel = []
def __init__(self, th = 0.01):
'''
Parameters
---------------
th = The initial threshold
'''
self.th = th
def fit(self, x, y):
'''
This function executes FCBF algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.idx_sel = []
"""
First Stage: Computing the SU for each feature with the response.
"""
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
SU_list = SU_vec[SU_vec > self.th]
SU_list[::-1].sort()
m = x[:,SU_vec > self.th].shape
x_sorted = np.zeros(shape = m)
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = 0
x_sorted[:,i] = x[:,ind].copy()
self.idx_sel.append(ind)
"""
Second Stage: Identify relationships between feature to remove redundancy.
"""
j = 0
while True:
"""
Stopping Criteria:The search finishes
"""
if j >= x_sorted.shape[1]: break
y = x_sorted[:,j].copy()
x_list = x_sorted[:,j+1:].copy()
if x_list.shape[1] == 0: break
SU_list_2 = SU_list[j+1:]
SU_x = np.apply_along_axis(symmetricalUncertain, 0,
x_list, y)
comp_SU = SU_x >= SU_list_2
to_remove = np.where(comp_SU)[0] + j + 1
if to_remove.size > 0:
x_sorted = np.delete(x_sorted, to_remove, axis = 1)
SU_list = np.delete(SU_list, to_remove, axis = 0)
to_remove.sort()
for r in reversed(to_remove):
self.idx_sel.remove(self.idx_sel[r])
j = j + 1
def fit_transform(self, x, y):
'''
This function fits the feature selection
algorithm and returns the resulting subset.
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.fit(x, y)
return x[:,self.idx_sel]
def transform(self, x):
'''
This function applies the selection
to the vector x.
Parameters
---------------
x = dataset [NxM]
'''
return x[:, self.idx_sel]
"""
FCBF# - Fast Correlation Based Filter
B. Senliol, G. Gulgezen, et al. Fast Correlation Based Filter (FCBF) with a Different Search Strategy.
In Computer and Information Sciences (ISCIS ‘08) 23rd International Symposium on, pages 1‐4.
Istanbul, October 27‐29, 2008.
"""
class FCBFK(FCBF):
idx_sel = []
def __init__(self, k = 10):
'''
Parameters
---------------
k = Number of features to include in the
subset.
'''
self.k = k
def fit(self, x, y):
'''
This function executes FCBFK algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.idx_sel = []
"""
First Stage: Computing the SU for each feature with the response.
"""
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
SU_list = SU_vec[SU_vec > 0]
SU_list[::-1].sort()
m = x[:,SU_vec > 0].shape
x_sorted = np.zeros(shape = m)
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = 0
x_sorted[:,i] = x[:,ind].copy()
self.idx_sel.append(ind)
"""
Second Stage: Identify relationships between features to remove redundancy with stopping
criteria (features in x_best == k).
"""
j = 0
while True:
y = x_sorted[:,j].copy()
SU_list_2 = SU_list[j+1:]
x_list = x_sorted[:,j+1:].copy()
"""
Stopping Criteria:The search finishes
"""
if x_list.shape[1] == 0: break
SU_x = np.apply_along_axis(symmetricalUncertain, 0,
x_list, y)
comp_SU = SU_x >= SU_list_2
to_remove = np.where(comp_SU)[0] + j + 1
if to_remove.size > 0 and x.shape[1] > self.k:
for i in reversed(to_remove):
x_sorted = np.delete(x_sorted, i, axis = 1)
SU_list = np.delete(SU_list, i, axis = 0)
self.idx_sel.remove(self.idx_sel[i])
if x_sorted.shape[1] == self.k: break
if x_list.shape[1] == 1 or x_sorted.shape[1] == self.k:
break
j = j + 1
if len(self.idx_sel) > self.k:
self.idx_sel = self.idx_sel[:self.k]
"""
FCBFiP - Fast Correlation Based Filter in Pieces
"""
class FCBFiP(FCBF):
idx_sel = []
def __init__(self, k = 10, npieces = 2):
'''
Parameters
---------------
k = Number of features to include in the
subset.
npieces = Number of pieces to divide the
feature space.
'''
self.k = k
self.npieces = npieces
def fit(self, x, y):
'''
This function executes FCBF algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
"""
First Stage: Computing the SU for each feature with the response. We sort the
features. When we have a prime number of features we remove the last one from the
sorted features list.
"""
m = x.shape
nfeaturesPieces = int(m[1] / float(self.npieces))
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
x_sorted = np.zeros(shape = m, dtype = 'float64')
idx_sorted = np.zeros(shape = m[1], dtype = 'int64')
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = -1
idx_sorted[i]= ind
x_sorted[:,i] = x[:,ind].copy()
if isprime(m[1]):
x_sorted = np.delete(x_sorted, m[1]-1, axis = 1 )
ind_prime = idx_sorted[m[1]-1]
idx_sorted = np.delete(idx_sorted, m[1]-1)
#m = x_sorted.shape
"""
Second Stage: Identify relationships between features into its vecinity
to remove redundancy with stopping criteria (features in x_best == k).
"""
x_2d = np.reshape(x_sorted.T, (self.npieces, nfeaturesPieces*m[0])).T
SU_x = np.apply_along_axis(suGroup, 0, x_2d, nfeaturesPieces)
SU_x = np.reshape(SU_x.T, (self.npieces*nfeaturesPieces,))
idx_sorted2 = np.zeros(shape = idx_sorted.shape, dtype = 'int64')
SU_x[np.isnan(SU_x)] = 1
for i in range(idx_sorted.shape[0]):
ind = np.argmin(SU_x)
idx_sorted2[i] = idx_sorted[ind]
SU_x[ind] = 10
"""
Scoring step
"""
self.scores = np.zeros(shape = m[1], dtype = 'int64')
for i in range(m[1]):
if i in idx_sorted:
self.scores[i] = np.argwhere(i == idx_sorted) + np.argwhere(i == idx_sorted2)
if isprime(m[1]):
self.scores[ind_prime] = 2*m[1]
self.set_k(self.k)
def set_k(self, k):
self.k = k
scores_temp = -1*self.scores
self.idx_sel = np.zeros(shape = self.k, dtype = 'int64')
for i in range(self.k):
ind = np.argmax(scores_temp)
scores_temp[ind] = -100000000
self.idx_sel[i] = ind

Try using 2to3 package for python to automatically convert the files. Worked for me!
https://docs.python.org/2/library/2to3.html

ValueError: x and y must have the same first dimension, but have different shapes

import urllib.request
from math import sqrt, fabs, exp
import matplotlib.pyplot as plot
from sklearn.linear_model import enet_path
from sklearn.metrics import roc_auc_score, roc_curve
import numpy
target_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'
data = urllib.request.urlopen(target_url)
xList = []
for line in data:
#split on comma
row = line.strip().split(",".encode(encoding='utf-8'))
xList.append(row)
xNum = []
labels = []
for row in xList:
lastCol = row.pop()
if lastCol == b'M':
labels.append(1.0)
else:
labels.append(0.0)
attrRow = [float(elt) for elt in row]
xNum.append(attrRow)
nrow = len(xNum)
ncol = len(xNum[1])
alpha = 1.0
xMeans = []
xSD = []
for i in range(ncol):
col = [xNum[j][i] for j in range(nrow)]
mean = sum(col)/nrow
xMeans.append(mean)
colDiff = [(xNum[j][i] - mean) for j in range(nrow)]
sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)])
stdDev = sqrt(sumSq/nrow)
xSD.append(stdDev)
xNormalized = []
for i in range(nrow):
rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)]
xNormalized.append(rowNormalized)
meanLabel = sum(labels)/nrow
sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range (nrow)])/nrow)
labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)]
nxval = 10
for ixval in range(nxval):
idxTest = [a for a in range (nrow) if a%nxval == ixval]
idxTrain = [a for a in range(nrow) if a%nxval != ixval]
xTrain = numpy.array([xNormalized[r] for r in idxTrain])
xTest = numpy.array([xNormalized[r] for r in idxTest])
labelTrain = numpy.array([labelNormalized[r] for r in idxTrain])
labelTest = numpy.array([labelNormalized[r] for r in idxTest])
alphas, coefs, _ = enet_path(xTrain, labelTrain, l1_ratio = 0.8, fit_intercept=False, return_models=False)
if ixval == 0:
pred = numpy.dot(xTest, coefs)
yOut = labelTest
else:
yTemp = numpy.array(yOut)
yOut = numpy.concatenate((yTemp, labelTest), axis = 0)
predTemp = numpy.array(pred)
pred = numpy.concatenate((predTemp, numpy.dot(xTest, coefs)), axis = 0)
misClassRate = []
_,nPred = pred.shape
for iPred in range(1, nPred):
predList = list(pred[:, iPred])
errCnt = 0.0
for irow in range(nrow):
if (predList[irow] < 0.0) and (yOut[irow] >= 0.0):
errCnt += 1.0
elif (predList[irow] >= 0.0) and (yOut[irow] < 0.0):
errCnt += 1.0
misClassRate.append(errCnt/nrow)
minError = min(misClassRate)
idxMin = misClassRate.index(minError)
plotAlphas = numpy.array(alphas[1:len(alphas)])
misClassRate_np = numpy.array(misClassRate)
plot.figure()
plot.plot(plotAlphas, misClassRate_np, label='Misclassification Error Across Folds', linewidth=2)
plot.axvline(plotAlphas[idxMin], linestyle='--', label='CV Estimate of Best alpha')
plot.legend()
plot.semilogx()
ax = plot.gca()
ax.invert_xaxis()
plot.xlabel('alpha')
plot.ylabel('Misclassification Error')
plot.axis('tight')
plot.show()
When I executed the code above, it returns: ValueError: x and y must have same first dimension, but have shapes (99,) and (1,).
It seems the problem is due to unequal length in x and y.
Then I checked both plotAlphas and misClassRate_np, they show the same length. Also, both of them has been changed to array but still fail to fix the problem. Can't figure out what's happening.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Generate synthetic time series data from existing sample data - python

Related

New classification method in machine learning python code optimization

How to find Average directional movement for stocks using Pandas?

Splittig data in python dataframe and getting the array values automatically

FCBF Python feature selection technique

ValueError: x and y must have the same first dimension, but have different shapes

Categories

Resources