How to load a custom YOLOv7 trained model - python

How do I load a custom YOLOv7 model?
This is how I know to load a YOLOv5 model:
model = torch.hub.load('ultralytics/yolov5', 'custom', path='yolov5/runs/train/exp15/weights/last.pt', force_reload=True)
I saw videos online and they suggested using this:
!python detect.py --weights runs/train/yolov7x-custom/weights/best.pt --conf 0.5 --img-size 640 --source final_test_v1.mp4
But I want the model loaded like a normal PyTorch model, so that it gives me the bounding box coordinates wherever it finds objects.
This is how I did it in YOLOv5:
from models.experimental import attempt_load

yolov5_weight_file = r'weights/rider_helmet_number_medium.pt'  # ... may need full path
model = attempt_load(yolov5_weight_file, map_location=device)

def object_detection(frame):
    img = torch.from_numpy(frame)
    img = img.permute(2, 0, 1).float().to(device)  # permute to the required (C, H, W) shape
    img /= 255.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)

    pred = model(img, augment=False)[0]
    pred = non_max_suppression(pred, conf_set, 0.20)  # prediction, conf, iou
    # print(pred)

    detection_result = []
    for i, det in enumerate(pred):
        if len(det):
            for d in det:  # d = (x1, y1, x2, y2, conf, cls)
                x1 = int(d[0].item())
                y1 = int(d[1].item())
                x2 = int(d[2].item())
                y2 = int(d[3].item())
                conf = round(d[4].item(), 2)
                c = int(d[5].item())
                detected_name = names[c]

                # print(f'Detected: {detected_name} conf: {conf} bbox: x1:{x1} y1:{y1} x2:{x2} y2:{y2}')
                detection_result.append([x1, y1, x2, y2, conf, c])

                frame = cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 1)  # box
                if c != 1:  # if it is not the head bbox, write the label with putText
                    frame = cv2.putText(frame, f'{names[c]} {str(conf)}', (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

    return (frame, detection_result)

Make predictions with YOLOv7 using torch.hub:
# Download the YOLOv7 code
!git clone https://github.com/WongKinYiu/yolov7
%cd yolov7
from pathlib import Path
import torch
from models.yolo import Model
from utils.general import check_requirements, set_logging
from utils.google_utils import attempt_download
from utils.torch_utils import select_device
dependencies = ['torch', 'yaml']
check_requirements(Path("/content/yolov7/").parent / 'requirements.txt', exclude=('pycocotools', 'thop'))
set_logging()
def custom(path_or_model='path/to/model.pt', autoshape=True):
    """Load a custom model.
    Arguments (3 options):
        path_or_model (str): 'path/to/model.pt'
        path_or_model (dict): torch.load('path/to/model.pt')
        path_or_model (nn.Module): torch.load('path/to/model.pt')['model']
    Returns:
        pytorch model
    """
    model = torch.load(path_or_model, map_location=torch.device('cpu')) if isinstance(path_or_model, str) else path_or_model  # load checkpoint
    if isinstance(model, dict):
        model = model['ema' if model.get('ema') else 'model']  # load model

    hub_model = Model(model.yaml).to(next(model.parameters()).device)  # create
    hub_model.load_state_dict(model.float().state_dict())  # load state_dict
    hub_model.names = model.names  # class names
    if autoshape:
        hub_model = hub_model.autoshape()  # for file/URI/PIL/cv2/np inputs and NMS
    device = select_device('0' if torch.cuda.is_available() else 'cpu')  # default to GPU if available
    return hub_model.to(device)
model = custom(path_or_model='yolov7.pt') # custom example
# model = create(name='yolov7', pretrained=True, channels=3, classes=80, autoshape=True) # pretrained example
# Verify inference
import numpy as np
from PIL import Image
imgs = [np.zeros((640, 480, 3))]
results = model(imgs) # batched inference
results.print()
results.save()
df_prediction = results.pandas().xyxy
df_prediction
Link to the Colab notebook:
https://colab.research.google.com/drive/1nKoC-_areXmc_20Xn7z6kcqHEKU7SJsX#scrollTo=yyB_MQW1OWhZ

You can do that with:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
path = '/path/to/your/file.pt'
model = torch.hub.load("WongKinYiu/yolov7","custom",f"{path}",trust_repo=True)
To get results you can run
results = model("/path/to/your/photo")
To get the bounding boxes you can use:
results.pandas().xyxy
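If you want the same [x1, y1, x2, y2, conf, cls] list as the YOLOv5 snippet in the question, you can read it straight out of that DataFrame. Below is a minimal sketch, assuming the hub-loaded YOLOv7 model exposes the same Detections API as YOLOv5 (columns xmin/ymin/xmax/ymax/confidence/class/name); the weight and image paths are placeholders:
import cv2
import torch

# Sketch only: assumes results.pandas().xyxy[0] has the usual YOLOv5-style columns.
model = torch.hub.load("WongKinYiu/yolov7", "custom", "best.pt", trust_repo=True)

frame = cv2.imread("test.jpg")          # BGR image from OpenCV
results = model(frame[..., ::-1])       # the autoshaped model expects RGB numpy arrays

detection_result = []
for _, row in results.pandas().xyxy[0].iterrows():
    x1, y1, x2, y2 = int(row.xmin), int(row.ymin), int(row.xmax), int(row.ymax)
    conf, cls_id = round(float(row.confidence), 2), int(row["class"])
    detection_result.append([x1, y1, x2, y2, conf, cls_id])
    cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 1)
    cv2.putText(frame, f'{row["name"]} {conf}', (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)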
EDIT
I created a repository with a Python package in order to do this easily:
https://github.com/Tlaloc-Es/aipose

You cannot use attempt_load from the YOLOv5 repo, because that method points to the ultralytics release files. You need to use attempt_load from the YOLOv7 repo, which points to the right files.
# yolov7
def attempt_download(file, repo='WongKinYiu/yolov7'):
    # Attempt file download if does not exist
    file = Path(str(file).strip().replace("'", '').lower())
    ...

# yolov5
def attempt_download(file, repo='ultralytics/yolov5', release='v6.2'):
    # Attempt file download from GitHub release assets if not found locally. release = 'latest', 'v6.2', etc.
    from utils.general import LOGGER

    def github_assets(repository, version='latest'):
        ...
Then you can load it like this:
# load yolov7 method
from models.experimental import attempt_load
model = attempt_load('yolov7.pt', map_location='cuda:0') # load FP32 model
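If you load the raw model this way, you have to do the resizing, normalisation and NMS yourself, much like the YOLOv5 snippet in the question. Below is a minimal sketch of that flow, assuming the YOLOv7 repo is on your path and that its utils.datasets.letterbox, utils.general.non_max_suppression and utils.general.scale_coords behave as they do in detect.py; paths and thresholds are placeholders:
import cv2
import torch
from models.experimental import attempt_load              # YOLOv7 repo
from utils.datasets import letterbox
from utils.general import non_max_suppression, scale_coords

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = attempt_load('yolov7.pt', map_location=device)     # or your custom best.pt
names = model.module.names if hasattr(model, 'module') else model.names

im0 = cv2.imread('test.jpg')                               # original BGR frame
img = letterbox(im0, 640, stride=32)[0]                    # resize + pad to 640
img = img[:, :, ::-1].transpose(2, 0, 1).copy()            # BGR -> RGB, HWC -> CHW
img = torch.from_numpy(img).float().to(device) / 255.0
img = img.unsqueeze(0)

with torch.no_grad():
    pred = model(img)[0]
pred = non_max_suppression(pred, 0.25, 0.45)               # conf and iou thresholds

for det in pred:
    if len(det):
        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
        for *xyxy, conf, cls in det:
            print([int(x) for x in xyxy], round(float(conf), 2), names[int(cls)])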

import torch as th

def loadModel(path: str):
    model = th.hub.load("WongKinYiu/yolov7", "custom", f"{path}", trust_repo=True)
    return model

This will work. trust_repo=True means it will not ask you to answer y or n.
In path you can just pass your custom trained model, e.g. ./best.pt

Related

How to load a TF Lite Model into Python from a file

I've followed the End-to-End image classification tutorial for tensorflow lite and have created and saved my model as '/path/to/model.tflite'.
What I haven't been able to figure out is how to load it.
I'm looking for some kind of syntax that is similar to this:
from tflite_model_maker import image_classifier
from tflite_model_maker.image_classifier import DataLoader
model = image_classifier.Load('/path/to/model.tflite')
I'm sure I'm missing something obvious here. This is definitely not the first place I've looked at. This seems to be the best place for me to find what I need, but the syntax used confuses me.
What do I want to be able to do with the model?
test = DataLoader.from_folder('/path/to/testImages')
loss, accuracy = model.evaluate(test)
# A helper function that returns 'red'/'black' depending on whether its two input
# parameters match or not.
def get_label_color(val1, val2):
    if val1 == val2:
        return 'black'
    else:
        return 'red'

# Then plot 100 test images and their predicted labels.
# If a prediction result is different from the label provided in the "test"
# dataset, we will highlight it in red color.
test_data = data
plt.figure(figsize=(20, 20))
predicts = model.predict_top_k(test_data)
for i, (image, label) in enumerate(test_data.gen_dataset().unbatch().take(100)):
    ax = plt.subplot(10, 10, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(image.numpy(), cmap=plt.cm.gray)
    predict_label = predicts[i][0][0]
    color = get_label_color(predict_label,
                            test_data.index_to_label[label.numpy()])
    ax.xaxis.label.set_color(color)
    plt.xlabel('Predicted: %s' % predict_label)
plt.show()
From the syntax above it seems the model isn't just a file but a type/class/method, whatever name is most suitable for Python.
It feels like this should only take one line of code, but I haven't been able to find it anywhere.
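For plain inference, the usual way to load a .tflite file outside of Model Maker is through tf.lite.Interpreter. Below is a minimal sketch (not from the original answer); the model path is a placeholder and the dummy input simply matches whatever shape the model reports:
import numpy as np
import tensorflow as tf

# Load the TFLite file and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path='/path/to/model.tflite')
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Feed an input with the shape/dtype the model expects, run it, and read the output.
dummy = np.zeros(input_details[0]['shape'], dtype=input_details[0]['dtype'])
interpreter.set_tensor(input_details[0]['index'], dummy)
interpreter.invoke()
scores = interpreter.get_tensor(output_details[0]['index'])
print(scores)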
I managed to put together a simple version of what I was after. Having the images come up as a stream doesn't work for me using cv2 on Windows the way it does on the Pi, so instead I created a webpage in the same directory as this script. The script generates an image with the bounding boxes using a specified TFLite model. This is in no way ideal.
It uses a webcam to get the image and saves the image to the directory the script is run in. It then renames the file so it can be viewed by the webpage I set up to view it.
The majority of this code comes from the TFLite Object Detection Raspberry Pi sample.
import time, os, sys
from PIL import Image
from tflite_support import metadata
import platform
from typing import List, NamedTuple
import json
import cv2 as cv2
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
Interpreter = tf.lite.Interpreter
load_delegate = tf.lite.experimental.load_delegate
class ObjectDetectorOptions(NamedTuple):
"""A config to initialize an object detector."""
enable_edgetpu: bool = False
"""Enable the model to run on EdgeTPU."""
label_allow_list: List[str] = None
"""The optional allow list of labels."""
label_deny_list: List[str] = None
"""The optional deny list of labels."""
max_results: int = -1
"""The maximum number of top-scored detection results to return."""
num_threads: int = 1
"""The number of CPU threads to be used."""
score_threshold: float = 0.0
"""The score threshold of detection results to return."""
class Rect(NamedTuple):
"""A rectangle in 2D space."""
left: float
top: float
right: float
bottom: float
class Category(NamedTuple):
"""A result of a classification task."""
label: str
score: float
index: int
class Detection(NamedTuple):
"""A detected object as the result of an ObjectDetector."""
bounding_box: Rect
categories: List[Category]
def edgetpu_lib_name():
"""Returns the library name of EdgeTPU in the current platform."""
return {
'Darwin': 'libedgetpu.1.dylib',
'Linux': 'libedgetpu.so.1',
'Windows': 'edgetpu.dll',
}.get(platform.system(), None)
class ObjectDetector:
"""A wrapper class for a TFLite object detection model."""
_OUTPUT_LOCATION_NAME = 'location'
_OUTPUT_CATEGORY_NAME = 'category'
_OUTPUT_SCORE_NAME = 'score'
_OUTPUT_NUMBER_NAME = 'number of detections'
def __init__(
self,
model_path: str,
options: ObjectDetectorOptions = ObjectDetectorOptions()
) -> None:
"""Initialize a TFLite object detection model.
Args:
model_path: Path to the TFLite model.
options: The config to initialize an object detector. (Optional)
Raises:
ValueError: If the TFLite model is invalid.
OSError: If the current OS isn't supported by EdgeTPU.
"""
# Load metadata from model.
displayer = metadata.MetadataDisplayer.with_model_file(model_path)
# Save model metadata for preprocessing later.
model_metadata = json.loads(displayer.get_metadata_json())
process_units = model_metadata['subgraph_metadata'][0]['input_tensor_metadata'][0]['process_units']
mean = 0.0
std = 1.0
for option in process_units:
if option['options_type'] == 'NormalizationOptions':
mean = option['options']['mean'][0]
std = option['options']['std'][0]
self._mean = mean
self._std = std
# Load label list from metadata.
file_name = displayer.get_packed_associated_file_list()[0]
label_map_file = displayer.get_associated_file_buffer(file_name).decode()
label_list = list(filter(lambda x: len(x) > 0, label_map_file.splitlines()))
self._label_list = label_list
# Initialize TFLite model.
if options.enable_edgetpu:
if edgetpu_lib_name() is None:
raise OSError("The current OS isn't supported by Coral EdgeTPU.")
interpreter = Interpreter(
model_path=model_path,
experimental_delegates=[load_delegate(edgetpu_lib_name())],
num_threads=options.num_threads)
else:
interpreter = Interpreter(
model_path=model_path, num_threads=options.num_threads)
interpreter.allocate_tensors()
input_detail = interpreter.get_input_details()[0]
# From TensorFlow 2.6, the order of the outputs become undefined.
# Therefore we need to sort the tensor indices of TFLite outputs and to know
# exactly the meaning of each output tensor. For example, if
# output indices are [601, 599, 598, 600], tensor names and indices aligned
# are:
# - location: 598
# - category: 599
# - score: 600
# - detection_count: 601
# because of the op's ports of TFLITE_DETECTION_POST_PROCESS
# (https://github.com/tensorflow/tensorflow/blob/a4fe268ea084e7d323133ed7b986e0ae259a2bc7/tensorflow/lite/kernels/detection_postprocess.cc#L47-L50).
sorted_output_indices = sorted(
[output['index'] for output in interpreter.get_output_details()])
self._output_indices = {
self._OUTPUT_LOCATION_NAME: sorted_output_indices[0],
self._OUTPUT_CATEGORY_NAME: sorted_output_indices[1],
self._OUTPUT_SCORE_NAME: sorted_output_indices[2],
self._OUTPUT_NUMBER_NAME: sorted_output_indices[3],
}
self._input_size = input_detail['shape'][2], input_detail['shape'][1]
self._is_quantized_input = input_detail['dtype'] == np.uint8
self._interpreter = interpreter
self._options = options
def detect(self, input_image: np.ndarray) -> List[Detection]:
"""Run detection on an input image.
Args:
input_image: A [height, width, 3] RGB image. Note that height and width
can be anything since the image will be immediately resized according
to the needs of the model within this function.
Returns:
A Person instance.
"""
image_height, image_width, _ = input_image.shape
input_tensor = self._preprocess(input_image)
self._set_input_tensor(input_tensor)
self._interpreter.invoke()
# Get all output details
boxes = self._get_output_tensor(self._OUTPUT_LOCATION_NAME)
classes = self._get_output_tensor(self._OUTPUT_CATEGORY_NAME)
scores = self._get_output_tensor(self._OUTPUT_SCORE_NAME)
count = int(self._get_output_tensor(self._OUTPUT_NUMBER_NAME))
return self._postprocess(boxes, classes, scores, count, image_width,
image_height)
def _preprocess(self, input_image: np.ndarray) -> np.ndarray:
"""Preprocess the input image as required by the TFLite model."""
# Resize the input
input_tensor = cv2.resize(input_image, self._input_size)
# Normalize the input if it's a float model (aka. not quantized)
if not self._is_quantized_input:
input_tensor = (np.float32(input_tensor) - self._mean) / self._std
# Add batch dimension
input_tensor = np.expand_dims(input_tensor, axis=0)
return input_tensor
def _set_input_tensor(self, image):
"""Sets the input tensor."""
tensor_index = self._interpreter.get_input_details()[0]['index']
input_tensor = self._interpreter.tensor(tensor_index)()[0]
input_tensor[:, :] = image
def _get_output_tensor(self, name):
"""Returns the output tensor at the given index."""
output_index = self._output_indices[name]
tensor = np.squeeze(self._interpreter.get_tensor(output_index))
return tensor
def _postprocess(self, boxes: np.ndarray, classes: np.ndarray,
scores: np.ndarray, count: int, image_width: int,
image_height: int) -> List[Detection]:
"""Post-process the output of TFLite model into a list of Detection objects.
Args:
boxes: Bounding boxes of detected objects from the TFLite model.
classes: Class index of the detected objects from the TFLite model.
scores: Confidence scores of the detected objects from the TFLite model.
count: Number of detected objects from the TFLite model.
image_width: Width of the input image.
image_height: Height of the input image.
Returns:
A list of Detection objects detected by the TFLite model.
"""
results = []
# Parse the model output into a list of Detection entities.
for i in range(count):
if scores[i] >= self._options.score_threshold:
y_min, x_min, y_max, x_max = boxes[i]
bounding_box = Rect(
top=int(y_min * image_height),
left=int(x_min * image_width),
bottom=int(y_max * image_height),
right=int(x_max * image_width))
class_id = int(classes[i])
category = Category(
score=scores[i],
label=self._label_list[class_id], # 0 is reserved for background
index=class_id)
result = Detection(bounding_box=bounding_box, categories=[category])
results.append(result)
# Sort detection results by score ascending
sorted_results = sorted(
results,
key=lambda detection: detection.categories[0].score,
reverse=True)
# Filter out detections in deny list
filtered_results = sorted_results
if self._options.label_deny_list is not None:
filtered_results = list(
filter(
lambda detection: detection.categories[0].label not in self.
_options.label_deny_list, filtered_results))
# Keep only detections in allow list
if self._options.label_allow_list is not None:
filtered_results = list(
filter(
lambda detection: detection.categories[0].label in self._options.
label_allow_list, filtered_results))
# Only return maximum of max_results detection.
if self._options.max_results > 0:
result_count = min(len(filtered_results), self._options.max_results)
filtered_results = filtered_results[:result_count]
return filtered_results
_MARGIN = 10 # pixels
_ROW_SIZE = 10 # pixels
_FONT_SIZE = 1
_FONT_THICKNESS = 1
_TEXT_COLOR = (0, 0, 255) # red
def visualize(
image: np.ndarray,
detections: List[Detection],
) -> np.ndarray:
"""Draws bounding boxes on the input image and return it.
Args:
image: The input RGB image.
detections: The list of all "Detection" entities to be visualize.
Returns:
Image with bounding boxes.
"""
for detection in detections:
# Draw bounding_box
start_point = detection.bounding_box.left, detection.bounding_box.top
end_point = detection.bounding_box.right, detection.bounding_box.bottom
cv2.rectangle(image, start_point, end_point, _TEXT_COLOR, 3)
# Draw label and score
category = detection.categories[0]
class_name = category.label
probability = round(category.score, 2)
result_text = class_name + ' (' + str(probability) + ')'
text_location = (_MARGIN + detection.bounding_box.left,
_MARGIN + _ROW_SIZE + detection.bounding_box.top)
cv2.putText(image, result_text, text_location, cv2.FONT_HERSHEY_PLAIN,
_FONT_SIZE, _TEXT_COLOR, _FONT_THICKNESS)
return image
# ---------------------------------- #
# This is where the custom code starts
# ---------------------------------- #
# Load the TFLite model
TFLITE_MODEL_PATH = 'object.tflite'
DETECTION_THRESHOLD = 0.5  # 50% threshold required before identifying

options = ObjectDetectorOptions(
    num_threads=4,
    score_threshold=DETECTION_THRESHOLD,
)

# Close camera if already open
try:
    cap.release()
except:
    pass  # do nothing

detector = ObjectDetector(model_path=TFLITE_MODEL_PATH, options=options)

cap = cv2.VideoCapture(0)  # webcam
counter = 0  # stores how many times the model has run

while cap.isOpened():
    success, image = cap.read()
    if not success:
        sys.exit(
            'ERROR: Unable to read from webcam. Please verify your webcam settings.'
        )
    image = cv2.flip(image, 1)

    # Convert the image from BGR to RGB as required by the TFLite model.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # image.thumbnail((512, 512), Image.ANTIALIAS)
    image_np = np.asarray(image)

    # Run object detection estimation using the model.
    detections = detector.detect(image_np)
    # Draw keypoints and edges on the input image.
    image_np = visualize(image_np, detections)

    if counter == 10:  # <- Change this to decide how many iterations
        cap.release()
        break

    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
    plt.imsave('tmp.jpg', image_np)     # Saves the image
    os.replace("tmp.jpg", "web.jpg")    # Renames it for the webpage
    counter += 1
    print(counter)

cap.release()
Here's the HTML for the page, placed in the same directory as the Python file. I saved it as index.html and opened it in the browser while running the Python script above.
<!DOCTYPE html>
<html>
<head>
<title>Object Detection</title>
</head>
<body>
<h1>Object Detection</h1>
<p>This displays images saved during detection process</p>
<canvas id="x" width="700px" height="500px"></canvas>
<script>
var newImage = new Image();
newImage.src = "web.jpg";
var canvas = document.getElementById("x");
var context = canvas.getContext("2d");
newImage.onload = function() {
context.drawImage(newImage, 0, 0);
console.log("trigger")
setTimeout(timedRefresh, 1000);
};
function timedRefresh() {
// just change src attribute, will always trigger the onload callback
try {
newImage.src = ("web.jpg#" + new Date().getTime());
}catch(e){
console.log(e);
}
}
setTimeout(timedRefresh, 100);
</script>
</body>
</html>
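If the browser refuses to keep reloading web.jpg from the local filesystem, one simple workaround (not part of the original setup, just a sketch) is to serve the folder over HTTP with Python's built-in server and open http://localhost:8000/index.html instead:
# Equivalent to running `python -m http.server 8000` in the script's directory.
import http.server
import socketserver

with socketserver.TCPServer(("", 8000), http.server.SimpleHTTPRequestHandler) as httpd:
    httpd.serve_forever()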
It's incredibly slow, not ideal in many ways, and probably breaks many good coding conventions. I only used it locally, would definitely not use it in a production environment, and don't recommend its use. I just needed a quick proof of concept, and this worked for that.

Pytorch model weights change when put on GPU

I noticed some very strange behaviour with the 3D ResNet from facebookresearch. Using their sample code from the website, I get different results when putting the model on the GPU. While on CPU the correct class (archery) is predicted, the model fails to predict it on the GPU. Can anyone replicate this and confirm that this is indeed the case? Does anyone know why this is happening and how to prevent it? Below you will find some code to quickly test it out:
import torch
import json
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
CenterCropVideo,
NormalizeVideo,
)
from pytorchvideo.transforms import (
ApplyTransformToKey,
ShortSideScale,
UniformTemporalSubsample
)
def predict_archery(model, device):
    json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
    json_filename = "kinetics_classnames.json"
    try:
        urllib.URLopener().retrieve(json_url, json_filename)
    except:
        urllib.request.urlretrieve(json_url, json_filename)

    with open(json_filename, "r") as f:
        kinetics_classnames = json.load(f)

    # Create an id to label name mapping
    kinetics_id_to_classname = {}
    for k, v in kinetics_classnames.items():
        kinetics_id_to_classname[v] = str(k).replace('"', "")

    side_size = 256
    mean = [0.45, 0.45, 0.45]
    std = [0.225, 0.225, 0.225]
    crop_size = 256
    num_frames = 8
    sampling_rate = 8
    frames_per_second = 30

    # Note that this transform is specific to the slow_R50 model.
    transform = ApplyTransformToKey(
        key="video",
        transform=Compose(
            [
                UniformTemporalSubsample(num_frames),
                Lambda(lambda x: x / 255.0),
                NormalizeVideo(mean, std),
                ShortSideScale(
                    size=side_size
                ),
                CenterCropVideo(crop_size=(crop_size, crop_size))
            ]
        ),
    )

    # The duration of the input clip is also specific to the model.
    clip_duration = (num_frames * sampling_rate) / frames_per_second

    url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
    video_path = 'archery.mp4'
    try:
        urllib.URLopener().retrieve(url_link, video_path)
    except:
        urllib.request.urlretrieve(url_link, video_path)

    # Select the duration of the clip to load by specifying the start and end duration
    # The start_sec should correspond to where the action occurs in the video
    start_sec = 0
    end_sec = start_sec + clip_duration

    # Initialize an EncodedVideo helper class and load the video
    video = EncodedVideo.from_path(video_path)

    # Load the desired clip
    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

    # Apply a transform to normalize the video input
    video_data = transform(video_data)

    # Move the inputs to the desired device
    inputs = video_data["video"]
    inputs = inputs.to(device)

    # Pass the input clip through the model
    preds = model(inputs[None, ...])

    # Get the predicted classes
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)
    pred_classes = preds.topk(k=5).indices[0]

    # Map the predicted classes to the label names
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
    print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))


if __name__ == '__main__':
    # Choose device
    # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    device = torch.device("cpu")

    # Choose the `slow_r50` model
    model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True).to(device)
    model = model.eval()

    predict_archery(model, device)
Results on cpu:
Top 5 predicted labels: archery, throwing axe, playing paintball,
stretching arm, riding or walking with horse
Results on GPU:
Top 5 predicted labels: flying kite, air drumming, beatboxing,
smoking, reading book
Edit:
Apparently, this issue cannot be reproduced on Google Colab. I therefore assume that the issue is related to the specific hardware / CUDA version. I am using an NVIDIA TITAN Xp and CUDA version 11.4.
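One quick sanity check (a sketch from my side, not from the original post) is to confirm that the parameters themselves are unchanged after .to(device); if they match, the weights are fine and the divergence comes from the GPU forward pass (cuDNN kernel selection, reduced-precision math, etc.) rather than from moving the model:
import torch

model_cpu = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True).eval()
model_gpu = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True).eval().to('cuda:0')

# Compare every parameter of the CPU copy against the GPU copy moved back to CPU.
for (name, p_cpu), (_, p_gpu) in zip(model_cpu.named_parameters(), model_gpu.named_parameters()):
    if not torch.allclose(p_cpu, p_gpu.cpu()):
        print('Mismatch in', name)

# On Ampere and newer GPUs, TF32 matmuls/convolutions are a common source of small
# numerical drift; disabling them is one thing worth trying:
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False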

Learning Object Detection: Detected results shown in discolouration

Brief Description
I recently began learning object detection, starting off with PyTorch and YOLOv5, so I thought why not build a small side project to learn? I'm using it to train a model to detect Pikachu.
The Problem
I've successfully trained the model on Pikachu and then used the trained weights with a Python script I wrote myself to detect Pikachu in test images. Here's the problem: the Pikachus are detected successfully, but all the results show up in a blue discolouration; what is supposed to be yellow turns blue, and blue turns yellow.
Fig-1: Result images showing the blue discolouration (a few example outputs)
Additional Information
I've pushed this project to GitHub; feel free to download it or pull it for debugging.
GitHub repository containing all the files
Any solution/suggestion would be helpful. Thanks.
The Code
"""Object detection using YOLOv5
Pokemon Pikachu detecting
"""
# import os, sys to append YOLOv5 folder path
import os, sys
# import object detection needed modules and libraries
# pillow
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import torch # PyTorch
# YOLOv5 folder path and related folder path settings
cwd = os.getcwd()
root_dir = (cwd + "/yolov5_stable")
sys.path.append(root_dir)
# import methods, functions from YOLOv5
from models.experimental import attempt_load
from utils.datasets import LoadImages
from utils.general import non_max_suppression, scale_coords
from utils.plots import colors
# define a function to show detected pikachu
def show_pikachu(img, det):
labels = ["pikachu"]
img = Image.fromarray(img)
draw = ImageDraw.Draw(img)
font_size = max(round(max(img.size)/40), 12)
font = ImageFont.truetype(cwd + "/yolov5_stable/fonts/times.ttf")
for info in det:
color = colors(1)
target, prob = int(info[5].cpu().numpy()), np.round(info[4].cpu().numpy(), 2)
x_min, y_min, x_max, y_max = info[0], info[1], info[2], info[3]
draw.rectangle([x_min, y_min, x_max, y_max], width = 3, outline = color)
draw.text((x_min, y_min), labels[target] + ':' + str(prob), fill = color, font = font)
# Bug unresolved, pikachu shown in blue discolouration
return img
if __name__ == "__main__":
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("GPU State: ", device)
data_path = (cwd + "/test_data/")
weight_path = (cwd + "/yolov5_stable/weights/best_v1.pt")
dataset = LoadImages(data_path)
model = attempt_load(weight_path, map_location = device)
model.to(device)
for path, img, im0s, _ in dataset:
img = torch.from_numpy(img).to(device)
img = img.float() # uint8 to fp16/32
img /= 255.0 # 0-255 to 0.0-1.0
if img.ndimension() == 3:
img = img.unsqueeze(0)
pred = model(img)[0]
pred = non_max_suppression(pred, 0.25, 0.45)
for i, det in enumerate(pred):
im0 = im0s.copy()
det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
result = show_pikachu(im0, det)
result.show()
The problem is that Image.fromarray expects the image in RGB and you're providing it in BGR. You just need to change that. There are multiple places where you could do it, for instance:
Image.fromarray(img[..., ::-1])  # assuming `img` is channel-last
Evidence of this is that the red parts of the mouse (red is RGB(255, 0, 0)) are being shown in blue (which is RGB(0, 0, 255)). FYI, yellow is RGB(255, 255, 0) and cyan is RGB(0, 255, 255), which you can also see in your case.
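Equivalently, just as a sketch of the same fix (assuming im0 comes from LoadImages and is therefore BGR), you can convert explicitly before handing the array to PIL:
import cv2

rgb = cv2.cvtColor(im0, cv2.COLOR_BGR2RGB)   # BGR (OpenCV) -> RGB (PIL)
result = show_pikachu(rgb, det)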

ModuleNotFoundError: No module named 'art.attacks'

I already installed the module "art" yesterday, but when I try to run my code, this happens:
Traceback (most recent call last):
File "D:/Desktop/captcha/src1/adv_ex.py", line 10, in <module>
from art.attacks.evasion import FastGradientMethod
ModuleNotFoundError: No module named 'art.attacks'
I tried searching for it on the Internet, but there is little information about this error.
I am sure that the module "art" (version 5.1) is already installed.
I hope you can help me. Many thanks.
Here is the full code:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torch
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from art.attacks.evasion import FastGradientMethod
from art.attacks.evasion import BasicIterativeMethod
from art.estimators.classification import PyTorchClassifier
from art.utils import load_mnist
from model import Net
from model import CaptchaData
from model import DataLoader
from model import vec2text
# Step1: Load the original dataset
transform = transforms.Compose([transforms.ToTensor()]) # no data augmentation or normalization here
test_data = CaptchaData('./testset/', transform=transform)
test_data_loader = DataLoader(test_data, batch_size=128, num_workers=0, shuffle=True, drop_last=True)
#Step 2: Load the model
net = Net()
# switch device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('current device:' , device)
print("right")
net.to(device)
#Define the loss function and optimizer
criterion = nn.MultiLabelSoftMarginLoss()
criterion.requires_grad = True #loss function
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
#Load the model
model_path = './module_build/model.pth'
checkpoint = torch.load(model_path)
net.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
net.eval()
# Step 3: Create the ART classifier for attacking
classifier = PyTorchClassifier(
model=net,
clip_values=(0.0,255.0),
loss=criterion,
optimizer=optimizer,
input_shape=(3,140,44),
nb_classes=222,
)
# Step 4: Define the eval function for evaulating the result
captcha_list = list('0123456789abcdefghijklmnopqrstuvwxcyz_')
captcha_length =6
def calculat_acc(output, target):
output, target = output.view(-1,len(captcha_list)),target.view(-1,len(captcha_list))
output = nn.functional.softmax(output, dim=1)
output = torch.argmax(output,dim=1)
target = torch.argmax(target,dim=1)
output,target = output.view(-1, captcha_length),target.view(-1,captcha_length)
c=0
for i, j in zip(target,output):
if torch.equal(i,j):
c += 1
acc = c / output.size()[0]* 100
return acc
# Step 6: craft attack with FGSM and show the adversarial picture
acc,i =0,0
# with torch.no_grad( ) :
for inputs, labels in test_data_loader:
# Before
plt.figure()
plt.imshow(inputs[124].permute(1,2,0))
attack = FastGradientMethod(estimator=classifier, eps=0.1)
inputs_adv = attack.generate(x=inputs)
inputs_adv = torch.as_tensor(inputs_adv)
print(vec2text(labels[124].view(6,-1)))
# After
plt.figure()
plt.imshow(inputs_adv[124].permute(1,2,0))
for i in range(128):
result =transforms.ToPILImage()(inputs_adv[i])
result.save("./input_adv/"+vec2text(labels[i].view(6,-1))+ ".jpg")
outputs = net(inputs_adv)
print(vec2text(outputs[124].view(6,-1)))
acc += calculat_acc(outputs, labels)
i += 1
break
print ('Accuracy: %.3f %%' % (acc/i))
'''
# Step 6: craft attack with BIM and show the adversarial picture
acc,i =0,0
# with torch.no_grad( ) :
for inputs, labels in test_data_loader:
# Before
plt.figure()
plt.imshow(inputs[124]. permute(1,2,0))
attack = BasicIterativeMethod(estimator=classifier, eps=0.1, eps_step=0.01)
inputs_adv = attack.generate(x=inputs)
inputs_adv = torch.as_tensor(inputs_adv)
# After
plt.figure()
plt.imshow(inputs_adv[124].permute(1,2,0))
outputs = net(inputs_adv)
acc += calculat_acc(outputs, labels)
i += 1
break
print ('Accuracy: %.3f %%' % (acc/i))
'''
#Step 7: crop the image
for inputs, labels in test_data_loader:
# Before
plt.figure()
plt.imshow(inputs[124].permute(1,2,0))
#Transform the tensor to image for further operation
image = transforms.ToPILImage()(inputs[124])
# crop the left part
image_left = image.crop((28,0,140,44))
plt.figure()
plt.imshow(np.asarray(image_left))
attack = FastGradientMethod(estimator = classifier,eps=0.1)
inputs_adv = attack.generate(x = inputs)
inputs_adv = torch.as_tensor(inputs_adv)
#After
plt.figure()
plt.imshow(inputs_adv[124].permute(1,2,0))
#Crop the one part of image.
image = transforms.ToPILImage()(inputs_adv[124])
(left, upper, right, lower) = (0,0,28,44)
image_crop = image.crop((left, upper, right,lower))
plt.figure()
plt.imshow(np.asarray(image_crop))
#combine the final image.
dst = Image.new('RGB',(image_crop.width + image_left.width, image_crop.height))
dst.paste(image_crop,(0, 0))
dst.paste(image_left, (image_crop.width, 0))
dst.save("./input_adv/"+ str(i) +".jpg")
# Show the final result
plt.figure()
plt.imshow(np.asarray(dst))
# Test it on pre-trained model
outputs = net(inputs_adv)
break
# save cropped image
for inputs, labels in test_data_loader:
for i in range(128):
# Transform the tensor to image for further operation
image = transforms.ToPILImage()(inputs[i])
#crop the left part
image_left =image.crop((28,8,140,44))
attack = FastGradientMethod(estimator=classifier, eps=0.1)
inputs_adv = attack.generate(x=inputs)
inputs_adv = torch.as_tensor(inputs_adv)
#crop the one part of image.
image = transforms.ToPILImage()(inputs_adv[i])
(left, upper, right, lower) = (0,8,28,44)
image_crop = image.crop((left,upper,right,lower))
# combine the final image.
dst = Image.new('RGB',(image_crop.width + image_left.width, image_crop.height))
dst.paste(image_crop, (0,0))
dst.paste(image_left,(image_crop.width,0))
# Save the final results
dst.save("./input_adv_cropped/"+vec2text(labels[i].view(6,-1))+ ".jpg")
break
# Test accuracy on cropped images
test_data_cropped = CaptchaData('./input_adv_cropped/', transform=transform)
test_data_loader_cropped = DataLoader(test_data_cropped , batch_size=8, num_workers=0,shuffle=True,drop_last=True)
acc,i=0, 0
with torch.no_grad():
for inputs, labels in test_data_loader_cropped:
outputs = net(inputs)
for j in range(8):
plt.figure()
print(vec2text(labels[j].view(6,-1)))
print(vec2text(outputs[j].view(6,-1)))
plt.imshow(inputs[j].permute(1,2,0))
print("\n")
acc += calculat_acc(outputs, labels)
i +=1
break
print('Accuracy: %.3f %%' %(acc/i))
I have solved this problem.
It was because I mixed up the art package with adversarial-robustness-toolbox.
After I used
pip install adversarial-robustness-toolbox
to install it, my code runs normally.
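For reference, a quick way to check which "art" is actually on your path (just a sketch; the ASCII-art package on PyPI is also imported as art, so the two can conflict if both are installed):
# If the wrong package is found, uninstall it first:
#   pip uninstall art
#   pip install adversarial-robustness-toolbox
import art
print(art.__file__)   # should point into the adversarial-robustness-toolbox installation

from art.attacks.evasion import FastGradientMethod   # fails with ModuleNotFoundError for the wrong 'art'
print(FastGradientMethod)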

KeyError: 'data' in Caffe

While I was following the deepdream IPython notebook which is here: https://github.com/google/deepdream/blob/master/dream.ipynb, I successfully ran the code and initialized the network, until I got this error:
I0218 20:53:01.108750 12174 net.cpp:283] Network initialization done.
I0218 20:53:06.017426 12174 net.cpp:816] Ignoring source layer data
I0218 20:53:06.139768 12174 net.cpp:816] Ignoring source layer loss
Traceback (most recent call last):
File "/home/andrew/PycharmProjects/deepmeme/deepmeme.py", line 122, in <module>
<IPython.core.display.Image object>
frame = deepdream(net, frame)
File "/home/andrew/PycharmProjects/deepmeme/deepmeme.py", line 78, in deepdream
octaves = [preprocess(net, base_img)]
File "/home/andrew/PycharmProjects/deepmeme/deepmeme.py", line 43, in preprocess
return np.float32(np.rollaxis(img, 2)[::-1]) - net.transformer.mean['data']
KeyError: 'data'
This is my code for the python file:
import sys
sys.path.append("/home/andrew/caffe/python")
from cStringIO import StringIO
import numpy as np
import scipy.ndimage as nd
import PIL.Image
from IPython.display import clear_output, Image, display
from google.protobuf import text_format
import caffe
# If your GPU supports CUDA and Caffe was built with CUDA support,
# uncomment the following to run Caffe operations on the GPU.
# caffe.set_mode_gpu()
# caffe.set_device(0) # select GPU device if multiple devices exist
def showarray(a, fmt='jpeg'):
a = np.uint8(np.clip(a, 0, 255))
f = StringIO()
PIL.Image.fromarray(a).save(f, fmt)
display(Image(data=f.getvalue()))
model_path = '/home/andrew/caffe/models/bvlc_reference_caffenet/' # substitute your path here
net_fn = model_path + 'deploy.prototxt'
param_fn = model_path + 'caffe_train_iter_500.caffemodel'
# Patching model to be able to compute gradients.
# Note that you can also manually add "force_backward: true" line to "deploy.prototxt".
model = caffe.io.caffe_pb2.NetParameter()
text_format.Merge(open(net_fn).read(), model)
model.force_backward = True
open('deploy.prototxt', 'w').write(str(model))
net = caffe.Classifier('/home/andrew/caffe/models/bvlc_reference_caffenet/deploy.prototxt', '/home/andrew/caffe/models/bvlc_reference_caffenet/caffenet_train_iter_500.caffemodel', caffe.TEST)
# a couple of utility functions for converting to and from Caffe's input image layout
def preprocess(net, img):
return np.float32(np.rollaxis(img, 2)[::-1]) - net.transformer.mean['data']
def deprocess(net, img):
return np.dstack((img + net.transformer.mean['data'])[::-1])
def objective_L2(dst):
dst.diff[:] = dst.data
def make_step(net, step_size=1.5, end='inception_4c/output',
jitter=32, clip=True, objective=objective_L2):
'''Basic gradient ascent step.'''
src = net.blobs['data'] # input image is stored in Net's 'data' blob
dst = net.blobs[end]
ox, oy = np.random.randint(-jitter, jitter+1, 2)
src.data[0] = np.roll(np.roll(src.data[0], ox, -1), oy, -2) # apply jitter shift
net.forward(end=end)
objective(dst) # specify the optimization objective
net.backward(start=end)
g = src.diff[0]
# apply normalized ascent step to the input image
src.data[:] += step_size/np.abs(g).mean() * g
src.data[0] = np.roll(np.roll(src.data[0], -ox, -1), -oy, -2) # unshift image
if clip:
bias = net.transformer.mean['data']
src.data[:] = np.clip(src.data, -bias, 255-bias)
def deepdream(net, base_img, iter_n=10, octave_n=4, octave_scale=1.4,
end='inception_4c/output', clip=True, **step_params):
# prepare base images for all octaves
octaves = [preprocess(net, base_img)]
for i in xrange(octave_n-1):
octaves.append(nd.zoom(octaves[-1], (1, 1.0/octave_scale,1.0/octave_scale), order=1))
src = net.blobs['data']
detail = np.zeros_like(octaves[-1]) # allocate image for network-produced details
for octave, octave_base in enumerate(octaves[::-1]):
h, w = octave_base.shape[-2:]
if octave > 0:
# upscale details from the previous octave
h1, w1 = detail.shape[-2:]
detail = nd.zoom(detail, (1, 1.0*h/h1,1.0*w/w1), order=1)
src.reshape(1,3,h,w) # resize the network's input image size
src.data[0] = octave_base+detail
for i in xrange(iter_n):
make_step(net, end=end, clip=clip, **step_params)
# visualization
vis = deprocess(net, src.data[0])
if not clip: # adjust image contrast if clipping is disabled
vis = vis*(255.0/np.percentile(vis, 99.98))
showarray(vis)
print octave, i, end, vis.shape
clear_output(wait=True)
# extract details produced on the current octave
detail = src.data[0]-octave_base
# returning the resulting image
return deprocess(net, src.data[0])
img = np.float32(PIL.Image.open('/home/andrew/caffe/examples/images/cat.jpg'))
showarray(img)
net.blobs.keys()
frame = img
frame_i = 0
h, w = frame.shape[:2]
s = 0.05 # scale coefficient
for i in xrange(100):
frame = deepdream(net, frame)
PIL.Image.fromarray(np.uint8(frame)).save("frames/%04d.jpg"%frame_i)
frame = nd.affine_transform(frame, [1-s,1-s,1], [h*s/2,w*s/2,0], order=1)
frame_i += 1
Image(filename='frames/0029.jpg')
Does anybody know what's happening? I am using my own data that I successfully trained a model with.
From the deepdream iPython notebook:
net = caffe.Classifier('tmp.prototxt', param_fn,
mean = np.float32([104.0, 116.0, 122.0]), # ImageNet mean, training set dependent
channel_swap = (2,1,0)) # the reference model has channels in BGR order instead of RGB
vs your:
net = caffe.Classifier('/home/andrew/caffe/models/bvlc_reference_caffenet/deploy.prototxt', '/home/andrew/caffe/models/bvlc_reference_caffenet/caffenet_train_iter_500.caffemodel', caffe.TEST)
You do not seem to include a mean when you create a caffe.Classifier.
See the definition of caffe.Classifier.
If you don't have a mean, you could probably just remove the mention of mean from preprocess/deprocess:
def preprocess(net, img):
    return np.float32(np.rollaxis(img, 2)[::-1])

def deprocess(net, img):
    return np.dstack((img)[::-1])
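Alternatively, here is a sketch of keeping the notebook's preprocess/deprocess and instead passing a mean (and channel swap) when constructing the classifier, using the ImageNet mean quoted above; if your model was trained with a different mean, substitute your own:
import numpy as np
import caffe

# Constructed the way the deepdream notebook does it, so that
# net.transformer.mean['data'] exists for preprocess()/deprocess().
net = caffe.Classifier(
    '/home/andrew/caffe/models/bvlc_reference_caffenet/deploy.prototxt',
    '/home/andrew/caffe/models/bvlc_reference_caffenet/caffenet_train_iter_500.caffemodel',
    mean=np.float32([104.0, 116.0, 122.0]),  # ImageNet mean; replace with your training mean
    channel_swap=(2, 1, 0))                  # the reference model expects BGR channel order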
