I'm trying to filter my CIFAR-100 ndarray by class index, here is my code:
def get_cifar100(folder, class_idx):
train_fname = os.path.join(folder, 'train')
test_fname = os.path.join(folder, 'test')
data_dict = unpickle(train_fname)
train_data = data_dict['data']
train_fine_labels = data_dict['fine_labels']
train_coarse_labels = data_dict['coarse_labels']
# Filtering process
filt_tdata = numpy.empty((0))
for i, v in enumerate(train_coarse_labels):
if v == class_idx:
filt_tdata = numpy.append(filt_tdata, train_data[i])
data_dict = unpickle(test_fname)
test_data = data_dict['data']
test_fine_labels = data_dict['fine_labels']
test_coarse_labels = data_dict['coarse_labels']
bm = unpickle(os.path.join(folder, 'meta'))
clabel_names = bm['coarse_label_names']
flabel_names = bm['fine_label_names']
return data_dict, filt_tdata, numpy.array(train_coarse_labels), numpy.array(train_fine_labels), test_data, numpy.array(test_coarse_labels), numpy.array(test_fine_labels), clabel_names, flabel_names
datapath = "./data/cifar-100-python"
data_dict, tr_data100, tr_clabels100, tr_flabels100, te_data100, te_clabels100, te_flabels100, clabel_names100, flabel_names100 = get_cifar100(datapath, 4)
I want to filter train_data based on class_idx = 4 (train_coarse_labels). The size of original array is 50000 and it should be 5000 on filtered. But, I got more than its original size (7 million ++). What's wrong with my function? Thanks.
I am trying to add data that I am reading from a series of JSON files to a Numpy array (or whatever data collection would work best). My idea, is that I want to sort a collection of episodes of a tv show by episode title.
The problem I have encountered, is actually creating the collection from the data.
The intent, is that I want to be able to have a collection of the items found within the for loop [a,b,c,d]; for each episode of the show.
Is a Numpy array the best way to go about making this collection, or should I use something else?
season1 = open('THEJSONFILES\seasonone.json', 'r')
season_array = np.array(['episodeTitle','seasonNum', 'episodeNum', 'plotContents'])
def ReadTheDarnJsonFile(jsonTitle):
seasondata = jsonTitle.read()
seasonobj = j.loads(seasondata)
list = (seasonobj['episodes'])
for i in range(len(list)):
a = str(list[i].get('title'))
b = str(list[i].get('seasonNumber'))
c = str(list[i].get('episodeNumber'))
d = str(list[i].get('plot'))
print(a, b, c, d)
# np.append(season_array, [a,b,c,d]) this is not correct
2 notes. First I would avoid using list as a variable name because it is a keyword in python. Second I would recommend using a custom class for your data for maximum readability.
season1 = open('THEJSONFILES\seasonone.json', 'r')
season_array = np.array(['episodeTitle','seasonNum', 'episodeNum', 'plotContents'])
class episode:
def __init__(self,title,seasonNumber,episodeNumber,plot):
self.title = title
self.seasonNumber = seasonNumber
self.episodeNumber = episodeNumber
self.plot = plot
def summary(self):
print("Season "+str(self.seasonNumber)+" Episode "+str(self.episodeNumber))
def ReadTheDarnJsonFile(jsonTitle):
seasondata = jsonTitle.read()
seasonobj = j.loads(seasondata)
episodes = (seasonobj['episodes'])
season_array = []
for i in range(len(episodes)):
a = str(list[i].get('title'))
b = str(list[i].get('seasonNumber'))
c = str(list[i].get('episodeNumber'))
d = str(list[i].get('plot'))
season_array.append(episode(a,b,c,d)) this is not correct
return season_array
season_array = Read
for item in season_array:
Here is what I ended up doing.
import json as j
import pandas as pd
emptyArray = []
season1 = open('THEJSONFILES\seasonone.json', 'r')
season2 = open('THEJSONFILES\seasontwo.json', 'r')
season3 = open('THEJSONFILES\seasonthree.json', 'r')
season4 = open('THEJSONFILES\seasonfour.json', 'r')
season5 = open('THEJSONFILES\seasonfive.json', 'r')
season6 = open('THEJSONFILES\seasonsix.json', 'r')
season7 = open('THEJSONFILES\seasonseven.json', 'r')
columnData = ["episodeTitle", "seasonIndex", "episodeIndex", "plot", "imageURL"]
finalDf = pd.DataFrame
def ReadTheDarnJsonFile(jsonTitle):
df = pd.DataFrame(columns = columnData)
seasonData = jsonTitle.read()
seasonObj = j.loads(seasonData)
currentSeasonList = (seasonObj['episodes'])
for i in range(len(currentSeasonList)):
tempTitle = str(currentSeasonList[i].get('title'))
tempSN = str(currentSeasonList[i].get('seasonNumber'))
tempEN = str(currentSeasonList[i].get('episodeNumber'))
tempPlot = str(currentSeasonList[i].get('plot'))
tempImage = str(currentSeasonList[i].get('image'))
dataObj = pd.Series([tempTitle, tempSN, tempEN, tempPlot, tempImage], index=(df.columns))
df.loc[i] = dataObj
finalDf = pd.concat(emptyArray)
holyOutput = finalDf.sort_values(by=['episodeTitle'])
I was making my automatic stock strategy yield calculation program with Python. Here's my code:
import FinanceDataReader as fdr
import numpy as np
# ...(more modules for python)
pd.options.display.float_format = '{:.5f}'.format
file_list = os.listdir('/home/sejahui/projects/stock_data_excel')
for i in range(20):
odd = file_list[i]
data = pd.read_excel('/home/sejahui/projects/stock_data_excel/'+str(odd))
def calMACD(data, short=5, long=25, signal=9):
data['MVA_25']=data['Close'].ewm(span=long, adjust=False).mean()
data['MVA_5']=data['Close'].ewm(span=short, adjust=False).mean()
data['MACD']=data['Close'].ewm(span=short, adjust=False).mean() - data['Close'].ewm(span=long, adjust=False).mean()
data['Signal']=data['MACD'].ewm(span=signal, adjust=False).mean( )
#data['Buy_sign']=(data['MACD']-data['Signal']) >=600
data['Buy_sign']=np.where(data['MACD']-data['Signal'] >=451, 'Buy' , 'Sell' )
#data['Target_1']=np.where(data['Buy_sign']=='Buy', (data['Change'])+1,1)
#data['Target_2']=np.where(data['Buy_sign']=='Sell', (data['Change'])+1,1)
#data['Real_world']= 1000000*data['Target_1']
#data['Real_world_2']= 1000000*data['Target_2']
#data['Condition'] = np.where(data['Real_world']<1000000, data['Real_world']-data['Real_world'].shift(-2),1)
##data['Condition_2'] = np.where(data['Real_world']<1000000, data['Target_1'].shift(-2),1)
#data['Moneyflow'] =
#plt.plot(data['Date'], data['Real_world'])
#data[data.Buy_sign !='Sell']
data['Target_1']=np.where(data['Buy_sign']=='Buy', data['Change'],1)
data['Target_2']=np.where(data['Buy_sign']=='Sell', data ['Change'],1)
data['Yield']=np.where(data['Buy_sign']=='Sell', data['Target_1']/data['Target_2'],1 )
data['??????'] = data['Result'] - data['Result_2']
return data
Adjusted = calMACD(data)
Adjusted.drop(['Change'], axis=1, inplace = True)
Filtered = Adjusted[Adjusted.Buy_sign!='Sell'].copy()
#Filtered = (Adjusted.Buy_sign =='Buy') #(Adjusted.Condition = 1.0)
#Master = Adjusted.loc[Adjusted,['Date','Buy_sign','Target_1','Real_world',]]
def backtester(Filtered):
Filtered['Change'] = ((Filtered['Close'] - Filtered['Close'].shift(1)) / Filtered['Close'].shift(1))+1
#data['Target_1']=np.where(data['Buy_sign']=='Buy', (data['Change'])+1,1)
Filtered['Real_world'] = 1000000*Filtered['Change']
#Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Real_world'].shift(-2)-Filtered['Real_world'],1)
Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Change'].shift(-2),1)
#Filtered['Target_1'] = np.where(Filtered['Buy_sign']=='Buy', (Filtered['Change'])+1,1)
#Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Real_world'].shift(-2)-Filtered['Real_world'],1)
return Filtered
s = backtester(Filtered)
e = s[s.Condition!=1.00000]
x = e.dropna()
y = x['Condition']
list_1 = []
write_wb = Workbook()
write_ws = write_wb.create_sheet('MACD&Signal gap data sheet')
write_ws = write_wb.active
write_ws['A1'] = 'Name'
write_ws['B1'] = 'Profit'
except StatisticsError as e:
print ('Sell is empty':',odd)
d = (geometric_mean(y)*1000000*12)
Here's the part where I'm troubling with:
s = backtester(Filtered)
e = s[s.Condition!=1.00000]
x = e.dropna()
y = x['Condition']
list_1 = []
except StatisticsError as e:
print ('Sell is empty':',odd)
d = (geometric_mean(y)*1000000*12)
When I initiate the code where I am having problems, list only saves the last result of 'try, except, else' function. My intention was saving all the results. What change should I give to save all the results?
Here's the output of the list:
Your problem is that you are using insert instead of append and the main difference that insert takes a second argument for the position that you want to insert your element at and when none is provided it is 0 by default so you are consistently inserting at the same index resulting in a list with only the last element at the first position.
To fix that simply use append instead.
d = (geometric_mean(y)*1000000*12)
You want to use append, not insert. see Python Data Structures
Change list_1.insert(d) to list_1.append(d)
The insert is defaulting to index 0 and just updating it each time.
Edit: Just noticed your answer is in the question title.
I have a set of about 200 images that I want to cluster into groups of images with similar features. I'm using Resnet50 to extract feature vectors from images and with the help of Faiss Kmeans I'm trying to cluster them into groups.
I have defined a class for Faiss KMeans as given on the link here.
class FaissKMeans:
def __init__(self, n_clusters=8, n_init=10, max_iter=300):
self.n_clusters = n_clusters
self.n_init = n_init
self.max_iter = max_iter
self.kmeans = None
self.cluster_centers_ = None
self.inertia_ = None
def fit(self, X, y):
self.kmeans = faiss.Kmeans(d=X.shape[1],
self.cluster_centers_ = self.kmeans.centroids
self.inertia_ = self.kmeans.obj[-1]
def predict(self, X):
return self.kmeans.index.search(X.astype(np.float32), 1)[1]
I'm storing the images and their vectors in a dictionary as key-value pairs.
#function to extract image vector
def extract_features(file, model):
img = load_img(file,target_size=(224,224))
img = np.array(img)
reshaped_img = img.reshape(1,224,224,3)
imgx = preprocess_input(reshaped_img)
features = model.predict(imgx,use_multiprocessing=True)
return features
#append the images in a folder to list "products"
products = []
with os.scandir(mypath) as files:
for file in files:
if file.name.endswith('.jpg'):
#load ResNet50 model
model = ResNet50()
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)
#save image and image vector to dictionary "feature_dict" as key value pair
feature_dict = {}
p = pkl_path
for product in products:
feat = extract_features(product,model)
feature_dict[product] = feat
with open(p,'wb') as file:
#convert dictionary to a numpy array
filenames = np.array(list(feature_dict.keys()))
feat = np.array(list(feature_dict.values()))
feat = feat.reshape(-1,2048)
I'm using the package "kneed" to determine the number of clusters
#determine the number of clusters
length = len(filenames)
lim = 25
sse = []
list_k = list(range(1, lim))
for k in list_k:
km = KMeans(n_clusters=k,random_state=22, n_jobs=-1)
labels= km.fit_predict(feat)
elbow = kneedle.elbow #number of clusters
Now I'm trying to cluster the images into different groups using faiss Kmeans but I'm getting the error of AttributeError: 'Kmeans' object has no attribute 'fit' on kmeans.fit(feat)
kmeans = faiss.Kmeans(d=feat.shape[0] ,k=elbow, niter=200)
When I try to use kmeans.train(feat) which I found on the link, I get the error AssertionError
I have images of [64,512,5] stored in *.npy files which I convert into *.tfrecords files.
I have verified that the reading of said records corresponds correctly with what is present in the *.npy files. However, when I perform some operation on the parser, like adding 1 to each pixel of the image, the result is not the expected one. The result should be 65*512*5 = 163840 but it is 163839.99980013957 (not always the same)
I have tried to perform different operations like tf.subtract, but the results are the same.
Could someone tell me what is wrong?
import re
import ast
import sys, select
import random as rn
from glob import glob
from tqdm import tqdm
from datetime import datetime
from configparser import SafeConfigParser
import numpy as np
import numpy.ma as ma
import scipy.misc
import os.path
from os import mkdir, stat
from os.path import exists, dirname, abspath
from os.path import join as dir_join
import tensorflow as tf
''' File hierarchy
_code_dir = dirname(abspath(__file__))
_python_dir = dirname(_code_dir)
_model_dir = dirname(_python_dir)
_project_dir = dirname(_model_dir)
_ml_dir = dirname(_project_dir)
_srv_dir = dirname(_ml_dir)
_root_datasets_dir = dir_join(_srv_dir,'machine_learning','data_sets/ssd_prepared')
_config_dir = dir_join(_python_dir, 'config')
'''Data sets directories
THIS_DATA_SET_DIR = 'Sph_50m' #WARNING: Global variable also used in helper.py
_data_dir = dir_join(_root_datasets_dir, THIS_DATA_SET_DIR)
_data_set_dir = dir_join(_data_dir,'ImageSet')
_data_npy_dir = dir_join(_data_dir,'data')
_data_tfRecord_dir = dir_join(_data_dir,'tfRecord')
''' Configuration parser
cfg_parser = SafeConfigParser()
''' Private variables
_batch_size = cfg_parser.getint(section='train', option='batch_size')
_max_epoch = cfg_parser.getint(section='train', option='max_epoch')
_standarize = cfg_parser.getboolean(section='train', option='standarize_input')
_input_shape = ast.literal_eval(cfg_parser.get(section='data_shape', option='input_shape'))
_label_channel = cfg_parser.getint(section='data_shape', option='label_channel')
_track_channel = cfg_parser.getint(section='data_shape', option='track_channel')
_mask_channel = cfg_parser.getint(section='data_shape', option='mask_channel')
_data_train = cfg_parser.get(section='data_set', option='data_train')
_data_val = cfg_parser.get(section='data_set', option='data_val')
_data_test = cfg_parser.get(section='data_set', option='data_test')
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=value.reshape(-1)))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _floats_feature(value):
return tf.train.Feature(float_list=tf.train.FloatList(value=value.reshape(-1)))
def numpy_to_TFRecord():
if not exists(_data_tfRecord_dir): mkdir(_data_tfRecord_dir)
for dataset in [_data_train, _data_val, _data_test]:
tfRecord_folder = dir_join(_data_tfRecord_dir, dataset)
if not exists(tfRecord_folder): mkdir(tfRecord_folder)
#Retrieve list of files
file_ = open(dir_join(_data_set_dir, dataset+'.txt'), 'r')
for x in file_.readlines():
file_nat = x.strip()+'.npy'
filename = dir_join(_data_npy_dir, file_nat)
assert exists(filename), "{} doesn't exist".format(filename)
totaltfRecordSize = 0
numFile = 0
for projection_dir in tqdm(projections_dir, ncols= 100, desc = 'TFRecord {}'.format(dataset)):
scanName = projection_dir.split('/')[-1].split('.')[0]
if totaltfRecordSize > 100*(10**6) or totaltfRecordSize == 0:
# address to save the TFRecords file
train_filename = dir_join(tfRecord_folder, \
str(numFile) + '_' + dataset +'.tfrecords')
# open the TFRecords file
writer = tf.python_io.TFRecordWriter(train_filename)
numFile += 1
totaltfRecordSize = 0
# Load the image
projection = np.load(projection_dir)
image = projection[:,:,:_label_channel]
label = projection[:,:,_label_channel].astype(int)
mask = projection[:,:,_mask_channel].astype(int)
track = projection[:,:,_track_channel].astype(int)
# Create a feature
feature = {'image': _floats_feature(image),
'label': _int64_feature(label),
'mask' : _int64_feature(mask),
'track': _int64_feature(track),
'scanName': _bytes_feature(tf.compat.as_bytes(scanName))}
# Create an example protocol buffer
example = tf.train.Example(features=tf.train.Features(feature=feature))
# Serialize to string and write on the file
fileSize = stat(train_filename).st_size
totaltfRecordSize += fileSize
def readTFRecord():
# Transforms a scalar string `example_proto` into a pair of a scalar string and
# a scalar integer, representing an image and its label, respectively.
image_dim = _input_shape[0] * _input_shape[1] * _label_channel
label_dim = _input_shape[0] * _input_shape[1]
mean = np.load(dir_join(_data_dir,'mean.npy'))
std = np.load(dir_join(_data_dir,'std.npy'))
mean_tf = tf.convert_to_tensor(mean, dtype=tf.float32, name='mean')
std_tf = tf.convert_to_tensor(std, dtype=tf.float32, name='std')
with tf.variable_scope('TFRecord'):
def _parse_function(example_proto):
with tf.variable_scope('parser'):
features = {'image': tf.FixedLenFeature([image_dim], tf.float32),
'label': tf.FixedLenFeature([label_dim], tf.int64),
'mask' : tf.FixedLenFeature([label_dim], tf.int64),
'track': tf.FixedLenFeature([label_dim], tf.int64),
'scanName': tf.FixedLenFeature([], tf.string)}
parsed_features = tf.parse_single_example(example_proto, features)
# Reshape image data into the original shape
image = tf.reshape(parsed_features['image'], [_input_shape[0], _input_shape[1], _label_channel], name='image')
label = tf.reshape(parsed_features['label'], _input_shape, name='lable_reshape')
mask = tf.reshape(parsed_features['mask'], _input_shape, name='mask_reshape')
track = tf.reshape(parsed_features['track'], _input_shape, name='track_reshape')
scanName = parsed_features['scanName']
image = image + tf.constant(1., dtype=tf.float32)
return image, label, mask, track, scanName
training_filenames = glob(dir_join(_data_tfRecord_dir, _data_train, '*.tfrecords'))
validation_filenames = glob(dir_join(_data_tfRecord_dir, _data_val, '*.tfrecords'))
filenames = tf.placeholder(tf.string, shape=[None], name='filenames')
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(_parse_function, num_parallel_calls=20) # Parse the record into tensors.
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(_batch_size, drop_remainder=True)
dataset = dataset.prefetch(buffer_size=10)
iterator = dataset.make_initializable_iterator()
next = iterator.get_next()
sess = tf.Session()
while True:
sess.run(iterator.initializer, feed_dict={filenames: training_filenames})
img, _, _, _, scanX = sess.run(next)
for i, scan in enumerate(scanX):
projection = np.load(dir_join(_data_npy_dir, scan.decode("utf-8") + '.npy'))
imagenp = projection[:,:,:_label_channel]
if np.abs(np.sum(img[i,...] - imagenp)) > 0.:
print(np.sum(img[i,...] - imagenp))
except tf.errors.OutOfRangeError:
return training_filenames, validation_filenames, filenames, iterator
if __name__ == '__main__':
The test I'm doing in the previous code is to convert the *.npy files to *.tfrecords. Then, I compare the *.trecords with the *.npy. The value should be 0 if both images were identical.
img, _, _, _, scanX = sess.run(next)
for i, scan in enumerate(scanX):
projection = np.load(dir_join(_data_npy_dir, scan.decode("utf-8") + '.npy'))
imagenp = projection[:,:,:_label_channel]
print(np.sum(img[i,...] - imagenp))
If the data is not preprocessed, these images are the same, however, if we perform some kind of transformation, the results do not match what was expected. In this case we are adding 1 to each pixel of the image, so the total difference should be 64 * 512 * 5.
image = image + tf.constant(1., dtype=tf.float32)
I would like to solve this error, since so far I have not been able to obtain the results obtained by my neural network using feed_dict instead of Tensorflow Dataset API, and this is the only point where I can observe a difference in the input data.
My first post here.
So I'm loading data into a variable called f1_data, then passing it to pm.removeDC() function to do some signal processing, and keeping the result into the same variable. But then, I want to replace only the column 8, with the original f1_data that I called raw_data and I can't figure it out why it doesn't work. Here are the functions. Help anyone?
inside file pm.py
def removeDC(data):
# define the filter
butter_order = 2
hp_cutoff_Hz = 1.0
b, a = signal.butter(butter_order, hp_cutoff_Hz/(fs_Hz / 2.0), 'highpass')
for i in range(1,9):
data[:,i] = signal.lfilter(b, a, data[:,i], 0)
return (data)
def get_epoch1(data, t_sec, epoch, f_tup, col):
#f_tup = (f_wdir, f_name, f_columns, out_save, out_dir, out_number, fig_width)
f_name = f_tup[1]
fig_width = f_tup[6]
epoch_boolvector = (t_sec >= epoch[0][0]) & (t_sec <= epoch[0][1])
epoch_timescale = t_sec[epoch_boolvector]
epoch_data = data[epoch_boolvector]
plt.figure(figsize=(fig_width,8), dpi=96)
plt.plot(epoch_timescale, epoch_data[:,col]);
plt.xlim(epoch_timescale[0], epoch_timescale[-1])
return (epoch_boolvector, epoch_timescale, epoch_data)
inside main file
#load the whole data
(f1_data, f1_data_indices, f1_timescale) = pm.load_data(f1_wdir, f1_name)
raw_data = f1_data[:] #create copy of f1_data
(f1ep1_boolvector, f1ep1_timescale, f1ep1_data) = pm.get_epoch1(f1_data, f1_timescale, f1_epochs[1], f1_tup, 8)
#--- filter data to remove DC (1Hz)
f1_data = pm.removeDC(f1_data)
# replace only channel 8 with original data
f1_data[:,8] = raw_data[:,8]
(f1ep2_boolvector, f1ep2_timescale, f1ep2_data) = pm.get_epoch1(f1_data, f1_timescale, f1_epochs[1], f1_tup, 8)
The solution is import copy and use copy.deepcopy function.
For further info check this link:
When I have raw_data = f1_data[:] I get, after pm.removeDC():
raw_data is f1_data: False
(raw_data == f1_data).all(): True
But when I have raw_data = copy.deepcopy(f1_data) I get, after pm.removeDC():
raw_data is f1_data: False
(raw_data == f1_data).all(): False