convert pytorch MAT model in github reposity to onnx - python

Unsupported: ONNX export of convolution for kernel of unknown shape. [Caused by the value 'x.47 defined in (%x.47 : Float(*, *, *, *, strides=[12168000, 67600, 260, 1], requires_grad=0, device=cpu) = onnx::Slice(%874, %875, %876, %877, %878), scope: torch_utils.persistence.persistent_class..Decorator::/torch_utils.persistence.persistent_class..Decorator::synthesis/torch_utils.persistence.persistent_class..Decorator::first_stage/torch_utils.persistence.persistent_class..Decorator::enc_conv.1/torch_utils.persistence.persistent_class..Decorator::conv # /Users/QSoft019/Documents/ai-image-research/MAT/torch_utils/ops/upfirdn2d.py:190:0
)' (type 'Tensor') in the TorchScript graph. The containing node has kind 'onnx::Slice'.]
github: https://github.com/fenglinglwb/mat
there is no error when running generate_image.py with pretrained file, but when converting to onnx, there are many warnings
finally, it stoped at line
assert isinstance(groups, int) and (groups >= 1)
in file MAT/torch_utils/ops/conv2d_resample.py
I had commented that line, but it still stopped at file venv/lib/python3.8/site-packages/torch/onnx/symbolic_opset9.py because weight_size (kernel_shape) variable was full of None value
I found that many integer variable -when converting to onnx- became tensors
this caused warnings, groups variable became a tensor, too
Am I in error at some where ?
My fuction:
def convert_torch_to_onnx_(onnx_path, image_path, model=None, torch_path=None):
"""
Coverts Pytorch model file to ONNX
:param torch_path: Torch model path to load
:param onnx_path: ONNX model path to save
:param image_path: Path to test image to use in export progress
"""
from datasets.mask_generator_512 import RandomMask
if torch_path is not None:
pytorch_model = get_torch_model(torch_path)
else:
pytorch_model = model
device = torch.device('cpu')
# image, _, torch_image = get_example_input(image_path)
image = read_image(image_path)
torch_image = (torch.from_numpy(image).float().to(device) / 127.5 - 1).unsqueeze(0)
label = torch.zeros([1, pytorch_model.c_dim], device=device)
resolution = 512
mask = RandomMask(resolution) # adjust the masking ratio by using 'hole_range'
mask = torch.from_numpy(mask).float().to(device).unsqueeze(0)
z = torch.from_numpy(np.random.randn(1, pytorch_model.z_dim)).to(device)
truncation_psi = 1
noise_mode = 'const'
torch.onnx.export(
pytorch_model,
(torch_image, mask, z, label, truncation_psi, noise_mode),
onnx_path,
verbose=True,
export_params=True,
# do_constant_folding=False,
# input_names=['input'],
opset_version=11,
# output_names=['output']
)
and generate_images function provided by author (default values of input variable were edited)
def generate_images(
# network_pkl: str = 'pretrained/CelebA-HQ_512.pkl',
network_pkl: str = '/Downloads/MAT/models/Places_512_FullData.pkl',
dpath: str = 'test_sets/CelebA-HQ/images',
# mpath=None,
mpath: str = 'test_sets/CelebA-HQ/masks',
resolution: int = 512,
truncation_psi: float = 1,
noise_mode: str = 'const',
outdir: str = 'samples',
model: bool = False,
):
"""
Generate images using pretrained network pickle.
"""
seed = 240 # pick up a random number
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
print(f'Loading data from: {dpath}')
img_list = sorted(glob.glob(dpath + '/*.png') + glob.glob(dpath + '/*.jpg'))
if mpath is not None:
print(f'Loading mask from: {mpath}')
mask_list = sorted(glob.glob(mpath + '/*.png') + glob.glob(mpath + '/*.jpg'))
assert len(img_list) == len(mask_list), 'illegal mapping'
print(f'Loading networks from: {network_pkl}')
device = torch.device('cpu')
# device = torch.device('cuda')
with dnnlib.util.open_url(network_pkl) as f:
G_saved = legacy.load_network_pkl(f)['G_ema'].to(device).eval().requires_grad_(False) # type: ignore
net_res = 512 if resolution > 512 else resolution
G = Generator(z_dim=512, c_dim=0, w_dim=512, img_resolution=net_res, img_channels=3).to(device).eval().requires_grad_(False)
copy_params_and_buffers(G_saved, G, require_all=True)
if model:
return G
os.makedirs(outdir, exist_ok=True)
# no Labels.
label = torch.zeros([1, G.c_dim], device=device)
if resolution != 512:
noise_mode = 'random'
with torch.no_grad():
for i, ipath in enumerate(img_list):
iname = os.path.basename(ipath).replace('.jpg', '.png')
print(f'Prcessing: {iname}')
image = read_image(ipath)
image = (torch.from_numpy(image).float().to(device) / 127.5 - 1).unsqueeze(0)
if mpath is not None:
mask = cv2.imread(mask_list[i], cv2.IMREAD_GRAYSCALE).astype(np.float32) / 255.0
mask = torch.from_numpy(mask).float().to(device).unsqueeze(0).unsqueeze(0)
else:
mask = RandomMask(resolution) # adjust the masking ratio by using 'hole_range'
mask = torch.from_numpy(mask).float().to(device).unsqueeze(0)
z = torch.from_numpy(np.random.randn(1, G.z_dim)).to(device)
output = G(image, mask, z, label, truncation_psi=truncation_psi, noise_mode=noise_mode)
output = (output.permute(0, 2, 3, 1) * 127.5 + 127.5).round().clamp(0, 255).to(torch.uint8)
output = output[0].cpu().numpy()

Related

How to load a TF Lite Model into Python from a file

I've followed the End-to-End image classification tutorial for tensorflow lite and have created and saved my model as '/path/to/model.tflite'.
What I haven't been able to figure out is how to load it.
I'm looking for some kind of syntax that is similar to this:
from tflite_model_maker import image_classifier
from tflite_model_maker.image_classifier import DataLoader
model = image_classifier.Load('/path/to/model.tflite')
I'm sure I'm missing something obvious here. This is definitely not the first place I've looked at. This seems to be the best place for me to find what I need, but the syntax used confuses me.
What do I want to be able to do with the model?
test = DataLoader.from_folder('/path/to/testImages')
loss, accuracy = model.evaluate(test)
# A helper function that returns 'red'/'black' depending on if its two input
# parameter matches or not.
def get_label_color(val1, val2):
if val1 == val2:
return 'black'
else:
return 'red'
# Then plot 100 test images and their predicted labels.
# If a prediction result is different from the label provided label in "test"
# dataset, we will highlight it in red color.
test_data = data
plt.figure(figsize=(20, 20))
predicts = model.predict_top_k(test_data)
for i, (image, label) in enumerate(test_data.gen_dataset().unbatch().take(100)):
ax = plt.subplot(10, 10, i+1)
plt.xticks([])
plt.yticks([])
plt.grid(False)
plt.imshow(image.numpy(), cmap=plt.cm.gray)
predict_label = predicts[i][0][0]
color = get_label_color(predict_label,
test_data.index_to_label[label.numpy()])
ax.xaxis.label.set_color(color)
plt.xlabel('Predicted: %s' % predict_label)
plt.show()
From the syntax above it seems the model isn't just a file but is a type/class/method depending on what name is most suitable for python.
Feels like this should only take one line of code but I haven't been able to find it anywhere.
Managed to do a simple version of it. The images coming up as a stream doesn't work for me using cv2 with Windows as it does for the pi. So instead I created a webpage in the same directory as this script. This generates an image with the bounding box, using a specified tflite model. This is in no way ideal.
It uses a webcam to get the image and saves the image to the directory the script is run in. It then renames the file so it can be viewed by the webpage I setup to view it.
The majority of this code comes from the TFLite Object Detection Raspberry Pi sample.
import time, os
from PIL import Image
from tflite_support import metadata
import platform
from typing import List, NamedTuple
import json
import cv2 as cv2
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
Interpreter = tf.lite.Interpreter
load_delegate = tf.lite.experimental.load_delegate
class ObjectDetectorOptions(NamedTuple):
"""A config to initialize an object detector."""
enable_edgetpu: bool = False
"""Enable the model to run on EdgeTPU."""
label_allow_list: List[str] = None
"""The optional allow list of labels."""
label_deny_list: List[str] = None
"""The optional deny list of labels."""
max_results: int = -1
"""The maximum number of top-scored detection results to return."""
num_threads: int = 1
"""The number of CPU threads to be used."""
score_threshold: float = 0.0
"""The score threshold of detection results to return."""
class Rect(NamedTuple):
"""A rectangle in 2D space."""
left: float
top: float
right: float
bottom: float
class Category(NamedTuple):
"""A result of a classification task."""
label: str
score: float
index: int
class Detection(NamedTuple):
"""A detected object as the result of an ObjectDetector."""
bounding_box: Rect
categories: List[Category]
def edgetpu_lib_name():
"""Returns the library name of EdgeTPU in the current platform."""
return {
'Darwin': 'libedgetpu.1.dylib',
'Linux': 'libedgetpu.so.1',
'Windows': 'edgetpu.dll',
}.get(platform.system(), None)
class ObjectDetector:
"""A wrapper class for a TFLite object detection model."""
_OUTPUT_LOCATION_NAME = 'location'
_OUTPUT_CATEGORY_NAME = 'category'
_OUTPUT_SCORE_NAME = 'score'
_OUTPUT_NUMBER_NAME = 'number of detections'
def __init__(
self,
model_path: str,
options: ObjectDetectorOptions = ObjectDetectorOptions()
) -> None:
"""Initialize a TFLite object detection model.
Args:
model_path: Path to the TFLite model.
options: The config to initialize an object detector. (Optional)
Raises:
ValueError: If the TFLite model is invalid.
OSError: If the current OS isn't supported by EdgeTPU.
"""
# Load metadata from model.
displayer = metadata.MetadataDisplayer.with_model_file(model_path)
# Save model metadata for preprocessing later.
model_metadata = json.loads(displayer.get_metadata_json())
process_units = model_metadata['subgraph_metadata'][0]['input_tensor_metadata'][0]['process_units']
mean = 0.0
std = 1.0
for option in process_units:
if option['options_type'] == 'NormalizationOptions':
mean = option['options']['mean'][0]
std = option['options']['std'][0]
self._mean = mean
self._std = std
# Load label list from metadata.
file_name = displayer.get_packed_associated_file_list()[0]
label_map_file = displayer.get_associated_file_buffer(file_name).decode()
label_list = list(filter(lambda x: len(x) > 0, label_map_file.splitlines()))
self._label_list = label_list
# Initialize TFLite model.
if options.enable_edgetpu:
if edgetpu_lib_name() is None:
raise OSError("The current OS isn't supported by Coral EdgeTPU.")
interpreter = Interpreter(
model_path=model_path,
experimental_delegates=[load_delegate(edgetpu_lib_name())],
num_threads=options.num_threads)
else:
interpreter = Interpreter(
model_path=model_path, num_threads=options.num_threads)
interpreter.allocate_tensors()
input_detail = interpreter.get_input_details()[0]
# From TensorFlow 2.6, the order of the outputs become undefined.
# Therefore we need to sort the tensor indices of TFLite outputs and to know
# exactly the meaning of each output tensor. For example, if
# output indices are [601, 599, 598, 600], tensor names and indices aligned
# are:
# - location: 598
# - category: 599
# - score: 600
# - detection_count: 601
# because of the op's ports of TFLITE_DETECTION_POST_PROCESS
# (https://github.com/tensorflow/tensorflow/blob/a4fe268ea084e7d323133ed7b986e0ae259a2bc7/tensorflow/lite/kernels/detection_postprocess.cc#L47-L50).
sorted_output_indices = sorted(
[output['index'] for output in interpreter.get_output_details()])
self._output_indices = {
self._OUTPUT_LOCATION_NAME: sorted_output_indices[0],
self._OUTPUT_CATEGORY_NAME: sorted_output_indices[1],
self._OUTPUT_SCORE_NAME: sorted_output_indices[2],
self._OUTPUT_NUMBER_NAME: sorted_output_indices[3],
}
self._input_size = input_detail['shape'][2], input_detail['shape'][1]
self._is_quantized_input = input_detail['dtype'] == np.uint8
self._interpreter = interpreter
self._options = options
def detect(self, input_image: np.ndarray) -> List[Detection]:
"""Run detection on an input image.
Args:
input_image: A [height, width, 3] RGB image. Note that height and width
can be anything since the image will be immediately resized according
to the needs of the model within this function.
Returns:
A Person instance.
"""
image_height, image_width, _ = input_image.shape
input_tensor = self._preprocess(input_image)
self._set_input_tensor(input_tensor)
self._interpreter.invoke()
# Get all output details
boxes = self._get_output_tensor(self._OUTPUT_LOCATION_NAME)
classes = self._get_output_tensor(self._OUTPUT_CATEGORY_NAME)
scores = self._get_output_tensor(self._OUTPUT_SCORE_NAME)
count = int(self._get_output_tensor(self._OUTPUT_NUMBER_NAME))
return self._postprocess(boxes, classes, scores, count, image_width,
image_height)
def _preprocess(self, input_image: np.ndarray) -> np.ndarray:
"""Preprocess the input image as required by the TFLite model."""
# Resize the input
input_tensor = cv2.resize(input_image, self._input_size)
# Normalize the input if it's a float model (aka. not quantized)
if not self._is_quantized_input:
input_tensor = (np.float32(input_tensor) - self._mean) / self._std
# Add batch dimension
input_tensor = np.expand_dims(input_tensor, axis=0)
return input_tensor
def _set_input_tensor(self, image):
"""Sets the input tensor."""
tensor_index = self._interpreter.get_input_details()[0]['index']
input_tensor = self._interpreter.tensor(tensor_index)()[0]
input_tensor[:, :] = image
def _get_output_tensor(self, name):
"""Returns the output tensor at the given index."""
output_index = self._output_indices[name]
tensor = np.squeeze(self._interpreter.get_tensor(output_index))
return tensor
def _postprocess(self, boxes: np.ndarray, classes: np.ndarray,
scores: np.ndarray, count: int, image_width: int,
image_height: int) -> List[Detection]:
"""Post-process the output of TFLite model into a list of Detection objects.
Args:
boxes: Bounding boxes of detected objects from the TFLite model.
classes: Class index of the detected objects from the TFLite model.
scores: Confidence scores of the detected objects from the TFLite model.
count: Number of detected objects from the TFLite model.
image_width: Width of the input image.
image_height: Height of the input image.
Returns:
A list of Detection objects detected by the TFLite model.
"""
results = []
# Parse the model output into a list of Detection entities.
for i in range(count):
if scores[i] >= self._options.score_threshold:
y_min, x_min, y_max, x_max = boxes[i]
bounding_box = Rect(
top=int(y_min * image_height),
left=int(x_min * image_width),
bottom=int(y_max * image_height),
right=int(x_max * image_width))
class_id = int(classes[i])
category = Category(
score=scores[i],
label=self._label_list[class_id], # 0 is reserved for background
index=class_id)
result = Detection(bounding_box=bounding_box, categories=[category])
results.append(result)
# Sort detection results by score ascending
sorted_results = sorted(
results,
key=lambda detection: detection.categories[0].score,
reverse=True)
# Filter out detections in deny list
filtered_results = sorted_results
if self._options.label_deny_list is not None:
filtered_results = list(
filter(
lambda detection: detection.categories[0].label not in self.
_options.label_deny_list, filtered_results))
# Keep only detections in allow list
if self._options.label_allow_list is not None:
filtered_results = list(
filter(
lambda detection: detection.categories[0].label in self._options.
label_allow_list, filtered_results))
# Only return maximum of max_results detection.
if self._options.max_results > 0:
result_count = min(len(filtered_results), self._options.max_results)
filtered_results = filtered_results[:result_count]
return filtered_results
_MARGIN = 10 # pixels
_ROW_SIZE = 10 # pixels
_FONT_SIZE = 1
_FONT_THICKNESS = 1
_TEXT_COLOR = (0, 0, 255) # red
def visualize(
image: np.ndarray,
detections: List[Detection],
) -> np.ndarray:
"""Draws bounding boxes on the input image and return it.
Args:
image: The input RGB image.
detections: The list of all "Detection" entities to be visualize.
Returns:
Image with bounding boxes.
"""
for detection in detections:
# Draw bounding_box
start_point = detection.bounding_box.left, detection.bounding_box.top
end_point = detection.bounding_box.right, detection.bounding_box.bottom
cv2.rectangle(image, start_point, end_point, _TEXT_COLOR, 3)
# Draw label and score
category = detection.categories[0]
class_name = category.label
probability = round(category.score, 2)
result_text = class_name + ' (' + str(probability) + ')'
text_location = (_MARGIN + detection.bounding_box.left,
_MARGIN + _ROW_SIZE + detection.bounding_box.top)
cv2.putText(image, result_text, text_location, cv2.FONT_HERSHEY_PLAIN,
_FONT_SIZE, _TEXT_COLOR, _FONT_THICKNESS)
return image
# ---------------------------------- #
# This is where the custom code starts
# ---------------------------------- #
# Load the TFLite model
TFLITE_MODEL_PATH='object.tflite'
DETECTION_THRESHOLD = 0.5 # 50% threshold required before identifying
options = ObjectDetectorOptions(
num_threads=4,
score_threshold=DETECTION_THRESHOLD,
)
# Close camera if already open
try:
cap.release()
except:
print("",end="") # do nothing
detector = ObjectDetector(model_path=TFLITE_MODEL_PATH, options=options)
cap = cv2.VideoCapture(0) #webcam
counter = 0 # Store many times model has run
while cap.isOpened():
success, image = cap.read()
if not success:
sys.exit(
'ERROR: Unable to read from webcam. Please verify your webcam settings.'
)
image = cv2.flip(image, 1)
# Convert the image from BGR to RGB as required by the TFLite model.
rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#image.thumbnail((512, 512), Image.ANTIALIAS)
image_np = np.asarray(image)
# Run object detection estimation using the model.
detections = detector.detect(image_np)
# Draw keypoints and edges on input image
image_np = visualize(image_np, detections)
if counter == 10: # <- Change this to decide how many iterations
cap.release()
break
image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
plt.imsave('tmp.jpg',image_np) # Saves the image
os.replace("tmp.jpg", "web.jpg",) # Renames it for the webpage
counter += 1
print(counter)
cap.release()
Here's the HTML for the document placed in the same directory as the python file, I saved it as index.html and opened in the browser while running the python script above.
<!DOCTYPE html>
<html>
<head>
<title>Object Detection</title>
</head>
<body>
<h1>Object Detection</h1>
<p>This displays images saved during detection process</p>
<canvas id="x" width="700px" height="500px"></canvas>
<script>
var newImage = new Image();
newImage.src = "web.jpg";
var canvas = document.getElementById("x");
var context = canvas.getContext("2d");
newImage.onload = function() {
context.drawImage(newImage, 0, 0);
console.log("trigger")
setTimeout(timedRefresh, 1000);
};
function timedRefresh() {
// just change src attribute, will always trigger the onload callback
try {
newImage.src = ("web.jpg#" + new Date().getTime());
}catch(e){
console.log(e);
}
}
setTimeout(timedRefresh, 100);
</script>
</body>
</html>
It's incredibly slow, not ideal in many ways and probably breaks many good coding conventions. It was only used locally, would definitely not use this for a production environment nor recommend its use. Just needed a quick proof of concept and this worked for that.

Detectron model giving different results for different machines (constant seed)

My training script for the model:
seed = 42
import random
import os
import numpy as np
def seed_everything(seed):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True
seed_everything(seed)
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
from detectron2.data.catalog import Metadata
import os
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("experiment",)
cfg.DATASETS.TEST = ("test",)
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7
cfg.MODEL.DEVICE = "cuda"
cfg.SOLVER.IMS_PER_BATCH = 2
num_gpu = 1
bs = (num_gpu * 2)
cfg.SOLVER.BASE_LR = 0.02 * bs / 16
cfg.SOLVER.MAX_ITER = 7500
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()
My inference script on server-1 is:
import cv2
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7
cfg.SEED = 42
predictor = DefaultPredictor(cfg)
img = cv2.imread('filename.jpg')
outputs = predictor(img)
print(outputs["instances"])
pred_classes = outputs['instances'].pred_classes.tolist()
classes = ["Handwritten", "Logo", "Markings", "Signature"]
for pred_class in pred_classes:
print('*'*10)
print(classes[pred_class])
print('*'*10)
if any(classes[pred_class] == "Handwritten" for pred_class in pred_classes):
print(True)
else:
print(False)
My inference script on server-2 is:
class Handwritten:
"""
Detects a list of handwritten pages in a PDF chart.
Attributes
----------
path_of_model : str
Path where the trained model is stored.
path_of_weights : str
Path where the weights file is stored.
"""
def __init__(self, path_of_weights: str) -> None:
"""Initialize Handwritten class.
Parameters
----------
path_of_model : str
path_of_weights : str
"""
self.cfg = get_cfg()
self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7
self.cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4
self.cfg.MODEL.WEIGHTS = path_of_weights
self.cfg.MODEL.DEVICE = "cpu"
self.cfg.SEED = 42
self.predictor = DefaultPredictor(self.cfg)
self.metadata = Metadata()
self.metadata.set(
thing_classes=["Handwritten", "Logo", "Markings", "Signature"],
thing_dataset_id_to_contiguous_id={0: 0, 1: 1, 2: 2, 3: 3},
)
def __call__(self, img: Any) -> Any:
"""Return the predicted output classes for the image."""
self.outputs = self.predictor(img)
return self.outputs["instances"]
def detect_hw(self, image: Any) -> bool:
"""Detect handwritten dx entity in image and if present then classifies it as hw page.
Parameters
----------
image : Matrix
.Image matrix of a page.
Return
-------
True/False : bool
Boolean value that states if the page is handwritten or not.
"""
outputs = self.__call__(image)
pred_classes = outputs.pred_classes.tolist()
classes = ["Handwritten", "Logo", "Markings", "Signature"]
if any(classes[pred_class] == "Handwritten" for pred_class in pred_classes):
return True
else:
return False
app = FastAPI()
path_of_weights = "model/model_final.pth"
model = Handwritten(path_of_weights)
#app.post("/cv/predict", status_code=200)
def predict(
page_no: int = Form(...), dimensions: list = Form(...), image: UploadFile = File(...)
) -> Dict[str, int]:
"""Predicts if image is handwritten page or not.
Parameters
----------
page_no : Page number of the given input page
dimensions : Height and width of the page
image : Image of the page as bytestream
"""
image_bytes = image.file.read()
decoded_image = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), -1)
height, width = int(dimensions[0]), int(dimensions[1])
prediction_time = time.time()
pg_image = cv2.resize(decoded_image, (height, width))
try:
# Check if page is handwritten
hw_result = model.detect_hw(pg_image)
# If handwritten, consider for output
if hw_result:
hw_pages = page_no
else:
hw_pages = -99
prediction_info = {
"hw_pages": hw_pages,
"prediction_time": prediction_time,
}
#_logger.info(f"prediction info: {prediction_info}")
except HTTPError as e:
do something
return {"hw_pages": hw_pages}
While the model keeps giving good results on server-1, it is somehow being very erratic in server-2. The weights and the seed is the same. Somehow, I am unable to understand this change in behavior in both of these scenarios.
The model is trained on server-1
Server-1 is g4dn.2xlarge. Server-2 is g4dn.xlarge
Is there something wrong which I am doing?

Training stuck at Epoch 3 PyTorch

I am training a custom Encoder-Decoder network but the training gets stuck at Epoch 3. Nothing happens for about 2 hours. I will share the Dataset class and the DataLoader object. The version if CUDA and GPU can be seen in the pic below.
Training stuck here:
nvidia-smi output looks like this:
The __getitem__ method of the dataset class looks like this:
def __init__(self,
images_dir,
annots_dir,
train=True,
img_size=(512, 1536),
stride=4,
model='custom',
transforms=None):
"""
:param root: dataset directory
:param filenames: filenames inside the root directory
:param labels: Object Detection Labels
super(CustomDataset).__init__()
self.images_dir = images_dir
self.annots_dir = annots_dir
self.train = train
self.image_size = img_size
self.stride = stride
self.transforms = transforms
self.model = model
# Load the image and annotation files from the dataset
# self.image_files, self.annot_files = self._load_image_and_annot_files()
self.image_files = [os.path.join(self.images_dir, idx) for idx in os.listdir(self.images_dir)]
self.annot_files = [os.path.join(self.annots_dir, idx) for idx in os.listdir(self.annots_dir)]
def __getitem__(self, index):
"""
:param index: index...0 to N
:return: tensor_image and tensor_label
"""
# Image filename from _load_image_files()
# Load Image with _read_matrix() and label
curr_image_filename = self.image_files[index]
curr_annot_filename = self.annot_files[index]
# curr_image_filename = self.image_files[index]
# curr_annot_filename = self.annot_files[index]
np_image = self._read_matrix(raw_img=curr_image_filename)
np_image_normalized = np.squeeze(self._normalize_raw_img(np_image))
# label = self.labels[index]
boxes, classes, depths, tgts = self._load_annotations(curr_annot_filename)
# Normalize bounding boxes: range [0, 1]
targets_normalized = self._normalize_bbox(np_image_normalized, tgts)
# image and the corresponding label should be a tensor
torch_image = torch.from_numpy(np_image).reshape(1, 512, 1536).float() # dtype: torch.float64
torch_boxes = torch.from_numpy(boxes).type(torch.FloatTensor)
torch_depths = torch.from_numpy(depths)
if self.model == 'fasterrcnn':
# For FasterRCNN: As COCO format
area = (torch_boxes[:, 3] - torch_boxes[:, 1]) * (torch_boxes[:, 2] - torch_boxes[:, 0])
iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)
image_id = torch.Tensor([index])
torch_classes = torch.from_numpy(classes)
target = {'boxes': torch_boxes, 'labels': torch_classes.long(),
'area': area, 'iscrowd': iscrowd, 'image_id': image_id}
return torch_image, target
elif self.model == 'custom':
if self.train:
if self.transforms:
try:
tr = self.transforms()
transform_image, transform_boxes, labels = tr.__call__(np_image, tgts, tgts[:, :4], tgts[:, 4:])
transform_targets = np.hstack((np.array(transform_boxes), labels))
gt_tensor = gt_creator(img_size=self.image_size,
stride=self.stride,
num_classes=8,
label_lists=transform_targets)
return torch.from_numpy(transform_image).float(), gt_tensor
except IndexError:
pass
else:
gt_tensor = gt_creator(img_size=self.image_size,
stride=self.stride,
num_classes=8,
label_lists=targets_normalized)
return torch_image, gt_tensor
else:
return torch_image, targets_normalized
And in the train.py script the DataLoader object is:
train_loader = torch.utils.data.DataLoader(dataset=dataset,
shuffle=True,
batch_size=1,
num_workers=0,
collate_fn=detection_collate,
pin_memory=True)
Why does the training get stuck? Is there an issue with the __getitem__ method? Or the DataLoader?
Thank You.
This happens because torch doesnt restart your dataset, if your data runs out it stops and waits for more input so cycling has to be done manually.
I used something along the lines of
from itertools import cycle
class Dataloader():
#init and whatever
self.__iter__():
return cycle(get_sample()) # get_sample is your current getitem

Optimize pytorch data loader for reading small patches in full HD images

I'm training my neural network using PyTorch framework. The data is full HD images (1920x1080). But in each iteration, I just need to crop out a random 256x256 patch from these images. My network is relatively small (5 conv layers), and hence the bottleneck is being caused by loading the data. I've provided my current code below. Is there any way to optimize loading the data and speed up the training?
Code:
from pathlib import Path
import numpy
import skimage.io
import torch.utils.data as data
import Imath
import OpenEXR
class Ours(data.Dataset):
"""
Loads patches of resolution 256x256. Patches are selected such that they contain atleast 1 unknown pixel
"""
def __init__(self, data_dirpath, split_name, patch_size):
super(Ours, self).__init__()
self.dataroot = Path(data_dirpath) / split_name
self.video_names = []
for video_path in sorted(self.dataroot.iterdir()):
for i in range(4):
for j in range(11):
view_num = i * 12 + j
self.video_names.append((video_path.stem, view_num))
self.patch_size = patch_size
return
def __getitem__(self, index):
video_name, view_num = self.video_names[index]
patch_start_pt = (numpy.random.randint(1080), numpy.random.randint(1920))
frame1_path = self.dataroot / video_name / f'render/rgb/{view_num + 1:04}.png'
frame2_path = self.dataroot / video_name / f'render/rgb/{view_num + 2:04}.png'
depth_path = self.dataroot / video_name / f'render/depth/{view_num + 1:04}.exr'
mask_path = self.dataroot / video_name / f'render/masks/{view_num + 1:04}.png'
frame1 = self.get_image(frame1_path, patch_start_pt)
frame2 = self.get_image(frame2_path, patch_start_pt)
mask = self.get_mask(mask_path, patch_start_pt)
depth = self.get_depth(depth_path, patch_start_pt, mask)
data_dict = {
'frame1': frame1,
'frame2': frame2,
'mask': mask,
'depth': depth,
}
return data_dict
def __len__(self):
return len(self.video_names)
#staticmethod
def get_mask(path: Path, patch_start_point: tuple):
h, w = patch_start_point
mask = skimage.io.imread(path.as_posix())[h:h + self.patch_size, w:w + self.patch_size][None]
return mask
def get_image(self, path: Path, patch_start_point: tuple):
h, w = patch_start_point
image = skimage.io.imread(path.as_posix())
image = image[h:h + self.patch_size, w:w + self.patch_size, :3]
image = image.astype(numpy.float32) / 255 * 2 - 1
image_cf = numpy.moveaxis(image, [0, 1, 2], [1, 2, 0])
return image_cf
def get_depth(self, path: Path, patch_start_point: tuple, mask: numpy.ndarray):
h, w = patch_start_point
exrfile = OpenEXR.InputFile(path.as_posix())
raw_bytes = exrfile.channel('B', Imath.PixelType(Imath.PixelType.FLOAT))
depth_vector = numpy.frombuffer(raw_bytes, dtype=numpy.float32)
height = exrfile.header()['displayWindow'].max.y + 1 - exrfile.header()['displayWindow'].min.y
width = exrfile.header()['displayWindow'].max.x + 1 - exrfile.header()['displayWindow'].min.x
depth = numpy.reshape(depth_vector, (height, width))
depth = depth[h:h + self.patch_size, w:w + self.patch_size]
depth = depth[None]
depth = depth.astype(numpy.float32)
depth = depth * mask
return depth
Finally, I'm creating a DataLoader as follows:
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
What I've tried so far:
I've searched if it is possible to read a part of the image. Unfortunately, I didn't get any leads. Looks like python libraries read the full image.
I'm planning to read more patches from a single image so that I will need to read fewer images. But in PyTorch framework, the get_item() function has to return a single sample, not a batch. So, in each get_item() I can read only a patch.
I'm planning to circumvent this as follows: Read 4 patches in get_item() and return patches of shape (4,3,256,256) instead of (3,256,256). Later when I read a batch using dataloader, I'll get a batch of shape (BS,4,3,256,256) instead of (BS,3,256,256). I can then concatenate the data along dim=1 to convert (BS,4,3,256,256) to (BS*4,3,256,256). Thus I can reduce batch_size (BS) by 4 and hopefully this will speed up data loading by 4 times.
Are there any other options? I'm open to all kind of suggestions. Thanks!

How to use torchvision.transforms for data augmentation of segmentation task in Pytorch?

I am a little bit confused about the data augmentation performed in PyTorch.
Because we are dealing with segmentation tasks, we need data and mask for the same data augmentation, but some of them are random, such as random rotation.
Keras provides a random seed guarantee that data and mask do the same operation, as shown in the following code:
data_gen_args = dict(featurewise_center=True,
featurewise_std_normalization=True,
rotation_range=25,
horizontal_flip=True,
vertical_flip=True)
image_datagen = ImageDataGenerator(**data_gen_args)
mask_datagen = ImageDataGenerator(**data_gen_args)
seed = 1
image_generator = image_datagen.flow(train_data, seed=seed, batch_size=1)
mask_generator = mask_datagen.flow(train_label, seed=seed, batch_size=1)
train_generator = zip(image_generator, mask_generator)
I didn't find a similar description in the official Pytorch documentation, so I don't know how to ensure that data and mask can be processed synchronously.
Pytorch does provide such a function, but I want to apply it to a custom Dataloader.
For example:
def __getitem__(self, index):
img = np.zeros((self.im_ht, self.im_wd, channel_size))
mask = np.zeros((self.im_ht, self.im_wd, channel_size))
temp_img = np.load(Image_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
temp_label = np.load(Label_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
for i in range(channel_size):
img[:,:,i] = temp_img[self.count[index] + i]
mask[:,:,i] = temp_label[self.count[index] + i]
if self.transforms:
img = np.uint8(img)
mask = np.uint8(mask)
img = self.transforms(img)
mask = self.transforms(mask)
return img, mask
In this case, img and mask will be transformed separately, because some operations such as random rotation are random, so the correspondence between mask and image may be changed. In other words, the image may have rotated but the mask did not do this.
EDIT 1
I used the method in augmentations.py, but I got an error::
Traceback (most recent call last):
File "test_transform.py", line 87, in <module>
for batch_idx, image, mask in enumerate(train_loader):
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 314, in __next__
batch = self.collate_fn([self.dataset[i] for i in indices])
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 314, in <listcomp>
batch = self.collate_fn([self.dataset[i] for i in indices])
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataset.py", line 103, in __getitem__
return self.dataset[self.indices[idx]]
File "/home/dirk/home/data/dirk/segmentation_unet_pytorch/data.py", line 164, in __getitem__
img, mask = self.transforms(img, mask)
File "/home/dirk/home/data/dirk/segmentation_unet_pytorch/augmentations.py", line 17, in __call__
img, mask = a(img, mask)
TypeError: __call__() takes 2 positional arguments but 3 were given
This is my code for __getitem__():
data_transforms = {
'train': Compose([
RandomHorizontallyFlip(),
RandomRotate(degree=25),
transforms.ToTensor()
]),
}
train_set = DatasetUnetForTestTransform(fold=args.fold, random_index=args.random_index,transforms=data_transforms['train'])
# __getitem__ in class DatasetUnetForTestTransform
def __getitem__(self, index):
img = np.zeros((self.im_ht, self.im_wd, channel_size))
mask = np.zeros((self.im_ht, self.im_wd, channel_size))
temp_img = np.load(Label_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
temp_label = np.load(Label_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
temp_img, temp_label = crop_data_label_from_0(temp_img, temp_label)
for i in range(channel_size):
img[:,:,i] = temp_img[self.count[index] + i]
mask[:,:,i] = temp_label[self.count[index] + i]
if self.transforms:
img = T.ToPILImage()(np.uint8(img))
mask = T.ToPILImage()(np.uint8(mask))
img, mask = self.transforms(img, mask)
img = T.ToTensor()(img).copy()
mask = T.ToTensor()(mask).copy()
return img, mask
EDIT 2
I found that after ToTensor, the dice between the same labels becomes 255 instead of 1, how to fix it?
# Dice computation
def DSC_computation(label, pred):
pred_sum = pred.sum()
label_sum = label.sum()
inter_sum = np.logical_and(pred, label).sum()
return 2 * float(inter_sum) / (pred_sum + label_sum)
Feel free to ask if more code is needed to explain the problem.
Transforms which require input parameters like RandomCrop has a get_param method which would return the parameters for that particular transformation. This can be then applied to both the image and mask using the functional interface of transforms:
from torchvision import transforms
import torchvision.transforms.functional as F
i, j, h, w = transforms.RandomCrop.get_params(input, (100, 100))
input = F.crop(input, i, j, h, w)
target = F.crop(target, i, j, h, w)
Sample available here:
https://github.com/pytorch/vision/releases/tag/v0.2.0
Complete example available here for VOC & COCO:
https://github.com/pytorch/vision/blob/master/references/segmentation/transforms.py
https://github.com/pytorch/vision/blob/master/references/segmentation/train.py
Regarding the error,
ToTensor() was not overridden to handle additional mask argument, so it cannot be in data_transforms. Moreover, __getitem__ does ToTensor of both img and mask before returning them.
data_transforms = {
'train': Compose([
RandomHorizontallyFlip(),
RandomRotate(degree=25),
#transforms.ToTensor() => remove this line
]),
}
torchvision also provides similar functions [document].
Here is a simple example,
import torchvision
from torchvision import transforms
trans = transforms.Compose([transforms.CenterCrop((178, 178)),
transforms.Resize(128),
transforms.RandomRotation(20),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
dset = torchvision.datasets.MNIST(data_root, transforms=trans)
EDIT
A brief example when customizing your own CelebA dataset. Note that, to apply transformations, you need call transform list in __getitem__.
class CelebADataset(Dataset):
def __init__(self, root, transforms=None, num=None):
super(CelebADataset, self).__init__()
self.img_root = os.path.join(root, 'img_align_celeba')
self.attr_root = os.path.join(root, 'Anno/list_attr_celeba.txt')
self.transforms = transforms
df = pd.read_csv(self.attr_root, sep='\s+', header=1, index_col=0)
#print(df.columns.tolist())
if num is None:
self.labels = df.values
self.img_name = df.index.values
else:
self.labels = df.values[:num]
self.img_name = df.index.values[:num]
def __getitem__(self, index):
img = Image.open(os.path.join(self.img_root, self.img_name[index]))
# only use blond_hair, eyeglass, male, smile
indices = [9, 15, 20, 31]
label = np.take(self.labels[index], indices)
label[label==-1] = 0
if self.transforms is not None:
img = self.transforms(img)
return np.asarray(img), label
def __len__(self):
return len(self.labels)
EDIT 2
I probably miss something at the first glance. The main point of your problem is how to apply "the same" data preprocessing to img and labels. To my understanding, there is no available Pytorch built-in function. So, what I did before is to implement the augmentation by myself.
class RandomRotate(object):
def __init__(self, degree):
self.degree = degree
def __call__(self, img, mask):
rotate_degree = random.random() * 2 * self.degree - self.degree
return img.rotate(rotate_degree, Image.BILINEAR),
mask.rotate(rotate_degree, Image.NEAREST)
Note that the input should be PIL format. See this for more information.
Another idea is to stack your image and mask along the channel dimensions and then transform them together. Obviously this only works for geometric-type transforms and you need to use the same dtype for both. I use something like this:
# Apply these to image and mask
affine_transforms = transforms.Compose([
transforms.RandomAffine(degrees=180),
...
])
# Apply these to image only
image_transforms = transforms.Compose([
transforms.GaussianBlur(),
...
])
# Loader...
def __getitem__(self, index: int):
# Get the image and mask, here shape=(HxW) for both
image = self.images[index]
mask = self.masks[index]
# Stack the image and mask together so they get the same geometric transformations
stacked = torch.cat([image, mask], dim=0) # shape=(2xHxW)
stacked = self.affine_transforms(stacked)
# Split them back up again
image, mask = torch.chunk(stacked, chunks=2, dim=0)
# Image transforms are only applied to the image
image = self.image_transforms(image)
return image, mask

Categories

Resources