Azure ML output from pipeline - python

I am trying to construct a pipeline in Microsoft Azure that has (for now) a simple Python script as its input.
The problem is that I cannot find my output.
In my Notebooks section I have created the following two files:
1) script called "test.ipynb"
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset, Datastore
import pandas as pd
import numpy as np
import datetime
import math
#Upload datasets
subscription_id = 'myid'
resource_group = 'myrg'
workspace_name = 'mywn'
workspace = Workspace(subscription_id, resource_group, workspace_name)
dataset_zre = Dataset.get_by_name(workspace, name='file1')
dataset_SLA = Dataset.get_by_name(workspace, name='file2')
df_zre = dataset_zre.to_pandas_dataframe()
df_SLA = dataset_SLA.to_pandas_dataframe()
result = pd.concat([df_SLA,df_zre], sort=True)
result.to_csv(path_or_buf="/mnt/azmnt/code/Users/aniello.spiezia/outputs/output.csv",index=False)
def_data_store = workspace.get_default_datastore()
def_data_store.upload(src_dir = '/mnt/azmnt/code/Users/aniello.spiezia/outputs', target_path = '/mnt/azmnt/code/Users/aniello.spiezia/outputs', overwrite = True)
print("\nFinished!")
#End of the file
2) pipeline code called "pipeline.ipynb"
import os
import pandas as pd
import json
import azureml.core
from azureml.core import Workspace, Run, Experiment, Datastore
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.runconfig import CondaDependencies, RunConfiguration
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.telemetry import set_diagnostics_collection
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData, StepSequence
print("SDK Version:", azureml.core.VERSION)
###############################
ws = Workspace.from_config()
print('Workspace name: ' + ws.name,
'Subscription id: ' + ws.subscription_id,
'Resource group: ' + ws.resource_group, sep = '\n')
experiment_name = 'aml-pipeline-cicd' # choose a name for experiment
project_folder = '.' # project folder
experiment = Experiment(ws, experiment_name)
print("Location:", ws.location)
set_diagnostics_collection(send_diagnostics=True)
###############################
cd = CondaDependencies.create(pip_packages=["azureml-sdk==1.0.17", "azureml-train-automl==1.0.17", "pyculiarity", "pytictoc", "cryptography==2.5", "pandas"])
amlcompute_run_config = RunConfiguration(framework = "python", conda_dependencies = cd)
amlcompute_run_config.environment.docker.enabled = False
amlcompute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
amlcompute_run_config.environment.spark.precache_packages = False
###############################
aml_compute_target = "aml-compute"
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
    print("found existing compute target.")
except:
    print("creating new compute target")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = "STANDARD_D2_V2",
                                                                idle_seconds_before_scaledown=1800,
                                                                min_nodes = 0,
                                                                max_nodes = 4)
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
print("Azure Machine Learning Compute attached")
###############################
def_data_store = ws.get_default_datastore()
def_blob_store = Datastore(ws, "workspaceblobstore")
print("Blobstore's name: {}".format(def_blob_store.name))
# Naming the intermediate data as anomaly data and assigning it to a variable
output_data = PipelineData("output_data", datastore = def_blob_store)
print("output_data object created")
step = PythonScriptStep(name = "test",
script_name = "test.ipynb",
compute_target = aml_compute,
source_directory = project_folder,
allow_reuse = True,
runconfig = amlcompute_run_config)
print("Step created.")
###############################
steps = [step]
print("Step lists created")
pipeline = Pipeline(workspace = ws, steps = steps)
print ("Pipeline is built")
pipeline.validate()
print("Pipeline validation complete")
pipeline_run = experiment.submit(pipeline)
print("Pipeline is submitted for execution")
pipeline_run.wait_for_completion(show_output = False)
print("Pipeline run completed")
###############################
def_data_store.download(target_path = '.',
prefix = 'outputs',
show_progress = True,
overwrite = True)
model_fname = 'output.csv'
model_path = os.path.join("outputs", model_fname)
pipeline_run.upload_file(name = model_path, path_or_stream = model_path)
print('Uploaded the model {} to experiment {}'.format(model_fname, pipeline_run.experiment.name))
And this gives me the following error:
Pipeline run completed
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-22-a8a523969bb3> in <module>
111
112 # Upload the model file explicitly into artifacts (for CI/CD)
--> 113 pipeline_run.upload_file(name = model_path, path_or_stream = model_path)
114 print('Uploaded the model {} to experiment {}'.format(model_fname, pipeline_run.experiment.name))
115
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/core/run.py in wrapped(self, *args, **kwargs)
47 "therefore, the {} cannot upload files, or log file backed metrics.".format(
48 self, self.__class__.__name__))
---> 49 return func(self, *args, **kwargs)
50 return wrapped
51
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/core/run.py in upload_file(self, name, path_or_stream)
1749 :rtype: azure.storage.blob.models.ResourceProperties
1750 """
-> 1751 return self._client.artifacts.upload_artifact(path_or_stream, RUN_ORIGIN, self._container, name)
1752
1753 #_check_for_data_container_id
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/_restclient/artifacts_client.py in upload_artifact(self, artifact, *args, **kwargs)
108 if isinstance(artifact, str):
109 self._logger.debug("Uploading path artifact")
--> 110 return self.upload_artifact_from_path(artifact, *args, **kwargs)
111 elif isinstance(artifact, IOBase):
112 self._logger.debug("Uploading io artifact")
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/_restclient/artifacts_client.py in upload_artifact_from_path(self, path, *args, **kwargs)
100 path = os.path.normpath(path)
101 path = os.path.abspath(path)
--> 102 with open(path, "rb") as stream:
103 return self.upload_artifact_from_stream(stream, *args, **kwargs)
104
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/azmnt/code/Users/aniello.spiezia/outputs/output.csv'
Do you know what the problem could be?
In particular, I am interested in saving the output file called "output.csv" somewhere.

The best way for you to do this depends a bit on how you want to process the output.csv file after the run has completed. But in general, you can just write your CSV to the ./outputs folder:
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset, Datastore
import os
import pandas as pd
import numpy as np
import datetime
import math
#Upload datasets
subscription_id = 'myid'
resource_group = 'myrg'
workspace_name = 'mywn'
workspace = Workspace(subscription_id, resource_group, workspace_name)
dataset_zre = Dataset.get_by_name(workspace, name='file1')
dataset_SLA = Dataset.get_by_name(workspace, name='file2')
df_zre = dataset_zre.to_pandas_dataframe()
df_SLA = dataset_SLA.to_pandas_dataframe()
result = pd.concat([df_SLA,df_zre], sort=True)
if not os.path.isdir('outputs'):
    os.mkdir('outputs')
result.to_csv('outputs/output.csv', index=False)
print("\nFinished!")
#End of the file
After the run has completed, AzureML will upload the contents of the outputs directory to the run history, so there is no need to call datastore.upload().
Afterwards, you can see the file in http://ml.azure.com when you navigate to the run's outputs (in my case, a model.pt file shows up there).
See here for some information on the ./outputs and ./logs folders: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-save-write-experiment-files#where-to-write-files
If you actually want to create another DataSet as a result of your Run, please see this post here: Azure Machine Learning Service - dataset API question

In Daniel's example above, you would need to download the output from the run rather than from the datastore in your pipeline.ipynb code. Instead of calling def_data_store.download(), you would download the file from the run (or its step run), for example with download_file().
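A minimal sketch of that, assuming the step wrote ./outputs/output.csv as in the first answer (the file name just mirrors that example):
# Hedged sketch: fetch the CSV back out of run history after the pipeline finishes.
# get_steps(), get_file_names() and download_file() are standard Run/PipelineRun methods.
for step_run in pipeline_run.get_steps():
    if 'outputs/output.csv' in step_run.get_file_names():
        step_run.download_file('outputs/output.csv', output_file_path='output.csv')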
Another option is to output your data using PipelineData. PipelineData represents a named piece of output of a pipeline step, and is useful if you want to connect multiple steps together with inputs and outputs. With PipelineData, you would need to pass the PipelineData object into PythonScriptStep when you declare your step (as part of arguments=[] and outputs=[]), and then have your script read the output path from the command-line arguments.
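As a rough sketch of that approach (reusing the names from the pipeline code above; the --output_dir argument and the test.py script name are illustrative, not from the question):
# Hedged sketch: declare a PipelineData output and hand its path to the step.
output_data = PipelineData("output_data", datastore = def_blob_store)
step = PythonScriptStep(name = "test",
                        script_name = "test.py",            # a plain .py script rather than a notebook
                        arguments = ["--output_dir", output_data],
                        outputs = [output_data],
                        compute_target = aml_compute,
                        source_directory = project_folder,
                        runconfig = amlcompute_run_config)
# Inside test.py, parse the path and write the CSV there, e.g.:
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--output_dir")
#   args = parser.parse_args()
#   os.makedirs(args.output_dir, exist_ok=True)
#   result.to_csv(os.path.join(args.output_dir, "output.csv"), index=False)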
This notebook has examples of using PipelineData within a pipeline and downloading the outputs: https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-data-dependency-steps.ipynb
And this blog post has details about how to handle this within your script (parsing the command-line arguments, creating the output directory, and writing the output file): https://blog.x5ff.xyz/blog/ai-azureml-python-data-pipelines/

Related

Can't generate XGBoost training report in SageMaker, only profiler_report

I am trying to generate the XGBoost training report to see feature importances, however the following code only generates the profiler report.
from sagemaker import get_execution_role
import boto3
import os
import sagemaker
import numpy as np
import pandas as pd
from sagemaker.predictor import csv_serializer
from sagemaker.debugger import Rule, rule_configs
# Define IAM role
rules=[
Rule.sagemaker(rule_configs.create_xgboost_report())
]
role = get_execution_role()
prefix = 'sagemaker/models'
my_region = boto3.session.Session().region_name
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")
bucket_name = 'binary-base'
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('../Data/Base_Model_Data_No_Labels/train.csv')
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'validation/val.csv')).upload_file('../Data/Base_Model_Data_No_Labels/val.csv')
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('../Data/Base_Model_Data/test.csv')
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,
role,
volume_size =5,
instance_count=1,
instance_type='ml.m4.xlarge',
output_path='s3://{}/{}/output'.format(bucket_name, prefix, 'xgboost_model'),
sagemaker_session=sess,
rules=rules)
xgb.set_hyperparameters(objective='binary:logistic',
num_round=100,
scale_pos_weight=8.5)
xgb.fit({'train': s3_input_train, "validation": s3_input_val}, wait=True)
When checking the output path via:
rule_output_path = xgb.output_path + "/" + xgb.latest_training_job.job_name + "/rule-output"
! aws s3 ls {rule_output_path} --recursive
We only see the profiler report generated.
What am I doing wrong or missing? I wish to generate the XGBoost training report to see its feature importances.

Running python script with multiple values of command line arguments

I have a python script for pre-processing audio, and it takes frame length, frame step and FFT length as command-line arguments. I am able to run the code with single values of these arguments. I wanted to know if there is a way to run the python script with multiple values of the arguments. For example, get the output for FFT lengths of 128, 256 and 512 instead of just one value.
The code for pre-processing is as follows:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.io import wavfile
import os
import time
import pickle
import random
import argparse
import configlib
from configlib import config as C
import mfccwithpaddingandcmd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow import keras
from tensorflow.python.keras import Sequential
from tensorflow.keras.layers import Dense,Conv2D,MaxPooling2D,Flatten,Dropout,BatchNormalization,LSTM,Lambda,Reshape,Bidirectional,GRU
from tensorflow.keras.callbacks import TensorBoard
start = time.time()
classes = ['blinds','fan','light','music','tv']
#dire = r"/mnt/beegfs/home/gehani/test_speech_command/"
parser = configlib.add_parser("Preprocessing config")
parser.add_argument("-dir","--dire", metavar="", help="Directory for the audio files")
def pp():
    data_list=[] #To save paths of all the audio files.....all audio files in list format in data_list
    #data_list-->folder-->files in folder
    for index,label in enumerate(classes):
        class_list=[]
        if label=='silence': #creating silence folder and storing 1sec noise audio files
            silence_path = os.path.join(C["dire"],'silence')
            if not os.path.exists(silence_path):
                os.mkdir(silence_path)
            silence_stride = 2000
            #sample_rate = 16000
            folder = os.path.join(C["dire"],'_background_noise_') #all silence are kept in the background_noise folder
            for file_ in os.listdir(folder):
                if '.wav' in file_:
                    load_path = os.path.join(folder,file_)
                    sample_rate,y = wavfile.read(load_path)
                    for i in range(0,len(y)-sample_rate,silence_stride):
                        file_path = "silence/{}_{}.wav".format(file_[:-4],i)
                        y_slice = y[i:i+sample_rate]
                        wavfile.write(os.path.join(C["dire"],file_path),sample_rate,y_slice)
                        class_list.append(file_path)
        else:
            folder = os.path.join(C["dire"],label)
            for file_ in os.listdir(folder):
                file_path = '{}/{}'.format(label,file_) #Ex: up/c9b653a0_nohash_2.wav
                class_list.append(file_path)
        random.shuffle(class_list) #To shuffle files
        data_list.append(class_list) #if not a silence file then just append to the datalist
    X = []
    Y = []
    preemphasis = 0.985
    print("Feature Extraction Started")
    for i,class_list in enumerate(data_list): #datalist = all files, class list = folder name in datalist, sample = path to the audio file in that particular class list
        for j,samples in enumerate(class_list): #samples are of the form classes_name/audio file
            if(samples.endswith('.wav')):
                sample_rate,audio = wavfile.read(os.path.join(C["dire"],samples))
                if(audio.size<sample_rate):
                    audio = np.pad(audio,(sample_rate-audio.size,0),mode="constant")
                #print("****")
                #print(sample_rate)
                #print(preemphasis)
                #print(audio.shape)
                coeff = mfccwithpaddingandcmd.mfcc(audio,sample_rate,preemphasis) # 0.985 = preemphasis
                #print("****")
                #print(coeff)
                #print("****")
                X.append(coeff)
                #print(X)
                if(samples.split('/')[0] in classes):
                    Y.append(samples.split('/')[0])
                elif(samples.split('/')[0]=='_background_noise_'):
                    Y.append('silence')
    #print(len(X))
    #print(len(Y))
    #X= coefficient array and Y = name of the class
    A = np.zeros((len(X),X[0].shape[0],X[0][0].shape[0]),dtype='object')
    for i in range(0,len(X)):
        A[i] = np.array(X[i]) #Converting list X into array A
    end1 = time.time()
    print("Time taken for feature extraction:{}sec".format(end1-start))
    MLB = MultiLabelBinarizer() # one hot encoding for converting labels into binary form
    MLB.fit(pd.Series(Y).fillna("missing").str.split(', '))
    Y_MLB = MLB.transform(pd.Series(Y).fillna("missing").str.split(', '))
    MLB.classes_ #Same like classes array
    print(Y_MLB.shape)
    pickle_out = open("A_all.pickle","wb") #Writes array A to a file A.pickle
    pickle.dump(A, pickle_out) #pickle is the file containing the extracted features
    pickle_out.close()
    pickle_out = open("Y_all.pickle","wb")
    pickle.dump(Y_MLB, pickle_out)
    pickle_out.close()
    pickle_in = open("Y_all.pickle","rb")
    Y = pickle.load(pickle_in)
    X = tf.keras.utils.normalize(X)
    X_train,X_valtest,Y_train,Y_valtest = train_test_split(X,Y,test_size=0.2,random_state=37)
    X_val,X_test,Y_val,Y_test = train_test_split(X_valtest,Y_valtest,test_size=0.5,random_state=37)
    print(X_train.shape,X_val.shape,X_test.shape,Y_train.shape,Y_val.shape,Y_test.shape)
if __name__ == "__main__":
    configlib.parse(save_fname="last_arguments.txt")
    print("Running with configuration:")
    configlib.print_config()
    pp()
The code for MFCC is as follows:
import tensorflow as tf
import scipy.io.wavfile as wav
import numpy as np
import matplotlib.pyplot as plt
import pickle
import argparse
import configlib
from configlib import config as C
# Configuration arguments
parser = configlib.add_parser("MFCC config")
parser.add_argument("-fl","--frame_length", type=int, default=400, metavar="", help="Frame Length")
parser.add_argument("-fs","--frame_step", type=int, default=160, metavar="", help="Frame Step")
parser.add_argument("-fft","--fft_length", type=int, default=512, metavar="", help="FFT length")
#args = parser.parse_args()
def Preemphasis(signal,pre_emp):
    return np.append(signal[0],signal[1:]-pre_emp*signal[:-1])

def Paddinggg(framelength,framestep,samplerate):
    frameStart = np.arange(0,samplerate,framestep)
    frameEnd = frameStart + framelength
    padding = min(frameEnd[(frameEnd > samplerate)]) - samplerate
    return padding

def mfcc(audio,sample_rate,pre_emp):
    audio = np.pad(audio,(Paddinggg(C["frame_length"],C["frame_step"],sample_rate),0),mode='reflect')
    audio = audio.astype('float32')
    #Normalization
    audio = tf.keras.utils.normalize(audio)
    #Preemphasis
    audio = Preemphasis(audio,pre_emp)
    stfts = tf.signal.stft(audio,C["frame_length"],C["frame_step"],C["fft_length"],window_fn=tf.signal.hann_window)
    spectrograms = tf.abs(stfts)
    num_spectrogram_bins = stfts.shape[-1]
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 0.0, sample_rate/2.0, 32
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,upper_edge_hertz)
    mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))
    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    # Compute MFCCs from log_mel_spectrograms and take the first 13.
    return log_mel_spectrograms

print("End")
And the code for configlib is as follows:
from typing import Dict, Any
import logging
import pprint
import sys
import argparse
# Logging for config library
logger = logging.getLogger(__name__)
# Our global parser that we will collect arguments into
parser = argparse.ArgumentParser(description=__doc__, fromfile_prefix_chars="#")
# Global configuration dictionary that will contain parsed arguments
# It is also this variable that modules use to access parsed arguments
config:Dict[str, Any] = {}
def add_parser(title: str, description: str = ""):
    """Create a new context for arguments and return a handle."""
    return parser.add_argument_group(title, description)

def parse(save_fname: str = "") -> Dict[str, Any]:
    """Parse given arguments."""
    config.update(vars(parser.parse_args()))
    logging.info("Parsed %i arguments.", len(config))
    # Optionally save passed arguments
    if save_fname:
        with open(save_fname, "w") as fout:
            fout.write("\n".join(sys.argv[1:]))
        logging.info("Saving arguments to %s.", save_fname)
    return config

def print_config():
    """Print the current config to stdout."""
    pprint.pprint(config)
I use the following command to run my python file:
python3.7 preprocessingwithpaddingandcmd.py -fl 1103 -fs 88 -fft 512 -dir /mnt/beegfs/home/gehani/appliances_audio_one_channel
Should I be writing a shell script, or does Python have some option for this?
EDIT 1
I tried using
parser.add_argument('-fft', '--fft_length', type=int, default=[], nargs=3)
for getting fft length from the command line and used the command
run preprocessingwithpaddingandcmd -dir filepath -fl 1765 -fs 1102 -fft 512 218 64
to run it. But, it gives me this error: ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Can anyone please help?
I found you can do it with MFCC feature extraction, for example with python_speech_features.
You can create your own MFCC feature extraction, or you can just limit the window length and the number of cepstra, which is enough for simple tasks, unless you need logarithmic scales, where you can use a target matrix (convolution) or something else.
The logarithm comes in when you use the FFT or an alternative derivation, but MFCC is only the extraction step; a sample is shown below.
[ Sample ]:
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import tensorflow as tf
import matplotlib.pyplot as plt
(rate,sig) = wav.read("F:\\temp\\Python\\Speech\\temple_of_love-sisters_of_mercy.wav")
mfcc_feat = mfcc(signal=sig, samplerate=rate, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True)
fbank_feat = logfbank(sig,rate)
plt.plot( mfcc_feat[50:42000,0] )
plt.xlabel("sample")
plt.show()
plt.close()
input('...')

Google Vision API problem with batch annotations

I wanted to use the Cloud Vision API to detect labels in ca. 40K photographs and download the results as CSV files. I uploaded the photos into cloud storage and used the following code, but an error occurred. I asked a person who uses Python in his job, but he cannot deal with this error. Can you help me fix it?
TypeError: Invalid constructor input for BatchAnnotateImagesRequest: [{'image': source {
image_uri: "gs://bucket/image-path.jpg"
}
, 'features': [{'type': <Type.LABEL_DETECTION: 4>}]}]
The code I used:
from google.cloud import vision
from google.cloud import storage
from google.cloud.vision_v1 import ImageAnnotatorClient
from google.cloud.vision_v1 import types
import os
import json
import numpy as np
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]='C://file-path.json'
#(created in step 1)
# Get GCS bucket
storage_client = storage.Client()
bucket = storage_client.bucket('bucket_name')
image_paths = []
for blob in list(bucket.list_blobs()):
    image_paths.append("gs://bucket_name/"+blob.name)
# We can send a maximum of 16 images per request.
start = 0
end = 16
label_output = []
for i in range(int(np.floor(len(image_paths)/16))+1):
    requests = []
    client = vision.ImageAnnotatorClient()
    for image_path in image_paths[start:end]:
        image = types.Image()
        image.source.image_uri = image_path
        requests.append({'image': image,'features': [{'type': vision.Feature.Type.LABEL_DETECTION}]})
    response = client.batch_annotate_images(requests)
    for image_path, i in zip(image_paths[start:end], response.responses):
        labels = [{label.description: label.score} for label in i.label_annotations]
        labels = {k: v for d in labels for k, v in d.items()}
        filename = os.path.basename(image_path)
        l = {'filename': filename, 'labels': labels}
        label_output.append(l)
    start = start+16
    end = end+16
#export results to CSV file
for l in label_output:
    print('"' + l['filename'] + '";', end = '')
    for label in l["labels"]:
        print('"' + label + '";"' + str(l["labels"][label]) + '";', end = '')
    print("")
batch_annotate_images() is not receiving the contents of requests properly. To fix this, just assign your variable requests explicitly to the parameter requests of batch_annotate_images():
response = client.batch_annotate_images(requests=requests)
See batch_annotate_images() for reference. Also if you are planning to update your Vision API to 2.3.1, you might encounter errors on features: see this reference for the updated usage of its parameters.
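A minimal sketch of the fixed batching loop, reusing the variables from the question (note that on newer 2.x clients the feature key may need to be 'type_' rather than 'type'):
requests = []
for image_path in image_paths[start:end]:
    image = types.Image()
    image.source.image_uri = image_path
    requests.append({'image': image, 'features': [{'type': vision.Feature.Type.LABEL_DETECTION}]})
# Passing the list through the named parameter is what avoids the constructor TypeError.
response = client.batch_annotate_images(requests=requests)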

AttributeError: 'decode' when reading TIFF images

Here is part of the code I am attempting to run:
import numpy as np
import os
import tensorflow as tf
import imageio
import sys
#Create tensorflow dataset
dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
if not is_test:
    dataset = dataset.shuffle(num_of_samples)
    dataset = dataset.repeat(None)
dataset = dataset.map(self._parse_dataset)
if not is_test:
    batched_dataset = dataset.batch(self.batch_size, drop_remainder=True).prefetch(20)
else:
    batched_dataset = dataset.batch(self.test_batch_size)
#Create the iterator
return batched_dataset, num_of_samples, path_strings
def get_batch(self, subset="train"):
    batch_of_images = self.iterators[subset].get_next()
    return batch_of_images

def _read_tif(self, file_path):
    file_path = file_path.decode(sys.getdefaultencoding())
    try:
        im = imageio.imread(file_path)
    except:
        im = np.zeros((self.width, self.height, 3))
    if len(im.shape) != 3:
        im = np.repeat(im[:, :, np.newaxis], 3, axis=2)
    return im

def _read_image(self, file_path):
    return tf.py_function(func=self._read_tif, inp=[file_path], Tout=tf.uint8)
and I have the following error coming up:
File "C:\PROJECTS_RUNNING2\pipeline\data_loader\data_generator.py", line 131, in _read_tif
file_path = file_path.decode(sys.getdefaultencoding())
AttributeError: 'tensorflow.python.framework.ops.EagerTensor' object has no attribute 'decode'
The file_path is defined in the run.py and looks like this:
def main(config_file_path):
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    config = tf.ConfigProto(gpu_options=gpu_options)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.reset_default_graph()
    config = parse_config_file(config_file_path)
    #Create the experiment output folders, this is where the outputs will be saved
    output_folder_path = config["output_path"]
    output_path = create_output_folder(output_folder_path, config["experiment_name"])
    copyfile(config_file_path, os.path.join(output_path, "%s_parameters.json" % config["experiment_name"]))
    data_generator = DataGenerator(config)
Input and Output dataset file paths are correctly defined in the config file.
I am very much a beginner in coding, but I have to use this script for the analysis of my images and I am struggling to get it up and running. I'm using Python 3.7 and TensorFlow 1.14. Any help to resolve this error would be much appreciated!

Pyradiomics: Feature class glcm is not recognized. How to fix it?

I am making a project with a GUI for liver ultrasound diagnostics.
I use PyQt5 (5.12.1) for the GUI and sklearn (0.21.2) for the statistical models. The main texture features I get from pyradiomics (2.2.0).
When I run my project in PyCharm 2019.1, everything works completely fine.
But when I try to build my project as an .exe file with PyInstaller, I get some errors. I solved most of them (about missing libraries), but this one is left.
I got errors:
Feature class firstorder is not recognized
Feature class glcm is not recognized
Feature class glrlm is not recognized
Feature class ngtdm is not recognized
Feature class glszm is not recognized
and my model also gives an error (when I fit my new data with models that were already saved in .sav files from sklearn):
ValueError: operands could not be broadcast together with shapes (1,3)(96,)(1,3)
1) I tried to change from:
extractor.enableFeatureClassByName('glcm')
to:
extractor.enableFeatureClassByName(str('glcm'))
It did not help.
2) I also tried to build the project with different versions of pyradiomics: 2.1.1 and 2.2.0 give the same result (error).
import os
import pandas as pd
import numpy as np
import pickle
import sklearn
from radiomics import featureextractor
...
folderName = "tmp"
sl = "/"
image_path_to = os.getcwd() + "/data/nrrd/" + folderName + sl + name_image
label_path_to = os.getcwd() + "/data/nrrd/" + folderName + sl + name_label
# Instantiate the extractor
extractor = featureextractor.RadiomicsFeatureExtractor()
# Switch on only needed feature class
extractor.disableAllFeatures()
extractor.enableFeatureClassByName('firstorder') <<< There is a problem
extractor.enableFeatureClassByName('glcm')
extractor.enableFeatureClassByName('glrlm')
extractor.enableFeatureClassByName('ngtdm')
extractor.enableFeatureClassByName('gldm')
extractor.enableFeatureClassByName('glszm')
# result -> ordered dict
result = extractor.execute(image_path_to, label_path_to)
df = pd.DataFrame(result, index=[0])
...
# Load the model from disk
model_name = 'Multi-layer Perceptron'
poolParam = ["diagnosis_code", "isnorm"]
models = [0,5]
for param in poolParam:
    filename = 'data/result/model/' + model_name + ' ' + param + '.sav'
    file = open(filename, 'rb')
    loaded = pickle.load(file)
    print("Model <" + model_name + " " + param + "> was loaded")
    # Test the classifier
    y_pred = int(loaded.predict(data)) <<< There is a problem
