I have a model in Caffe that produces a multi-dimensional array of size [1x10x8x8]. In Python I have no problem with this, because Python manages the array automatically and I know the order of its elements. But when I switch to OpenCV with C++, the whole array is a flat vector and I have no idea how to rebuild something like the Python array. I use cv::NAryMatIterator to access the multi-dimensional array, as shown below:
// Iterates over the planes of prob and keeps a header for the last one visited.
const cv::Mat* arrays[]={&prob,0}; // zero-terminated list of matrices to iterate over; prob is the multi-dimensional array
cv::Mat my_planes[1]; // handed to the iterator together with arrays; one entry because one matrix is iterated
cv::NAryMatIterator it(arrays,my_planes);
cv::Mat Multi_Array ; // temporary Mat; overwritten on every step, so it ends up holding the last plane
for (int p = 0; p < it.nplanes; ++p,++it) {
Multi_Array = it.planes[0];
}
After doing that, I see that the size of Multi_Array is [640x1], which seems to equal the 8x8x10 that Python produced. Is there any way to access the 8x8 planes one by one?
EDIT: my multi-dimensional array size is [1x10x8x8]
To access the flat array of shape [640][1] as if it were a 3D array, you could write three nested loops that address the elements in [x, y, z] form, like this:
// Backing storage: 640 rows of one element each, zero-initialised.
int data[640][1] = { 0 };
// Logical 3D extents; 8 * 8 * 10 == 640 matches the first dimension of data.
int width = 8, height = 8, depth = 10;
for (int x = 0; x < width; x++)
for (int y = 0; y < height; y++)
for (int z = 0; z < depth; z++)
{
// Flatten (x, y, z) into one offset: z varies fastest, x slowest.
int idx = x * height * depth + y * depth + z;
// Store the flat offset itself so the mapping can be inspected afterwards.
data[idx][0] = idx;
}
This fills the array with numbers ranging from 0 to 639.
If you are looking to access a 2D array as a 1D, check this answer.
If your model data is ordered in row-major form, you can have OpenCV interpret the data as a Mat of the required size. Then, planes of the Mat can be accessed using multidim_mat.row( row_number ).
In order to create a Mat from the data:
// Flat buffer of 640 elements (8 * 8 * 10), zero-initialised.
int data[640] = { 0 };
// Extent of each of the three dimensions.
const int size[] = { 8, 8, 10 };
// Build a 3-dimensional Mat of 32-bit signed integers on top of the buffer.
// NOTE(review): this constructor presumably wraps data without copying it, so data must outlive the Mat — confirm against the cv::Mat documentation.
cv::Mat multidim_mat(3, size, CV_32S, data);
// Print the number of dimensions, then the size of each dimension.
std::cout << multidim_mat.dims << std::endl;
for (int i = 0; i < multidim_mat.dims; i++) {
std::cout << "Dimension " << i << " is of size " << multidim_mat.size[i] << std::endl;
}
The CV_32S is to inform OpenCV to interpret the data as signed 32-bit integers.
References: https://docs.opencv.org/3.4.0/d3/d63/classcv_1_1Mat.html#a5fafc033e089143062fd31015b5d0f40, https://docs.opencv.org/3.4.0/d3/d63/classcv_1_1Mat.html#details,
As a first step, we need a pointer to the data of the OpenCV Mat object, which you can get with the command below. (I assume the data is stored as float, and that the probability Mat we get from Caffe is called prob.)
float* p = (float*)(prob.data);
This pointer points to where the data resides in memory. So, for example, if we want to access the element at location (1,3,7,7), we can do it like this:
int S= sizeof(float);
float val = p[(
7*p.step[3]/S + //forth dimension
7*p.step[2]/S + //third dimension
3*p.step[1]/S //second dimension
)]
//first dimension is not needed, because it is decoded in address of p
//and if you have any higher number than 1 in first dimension you need to add it to the above command
So, to traverse the probability matrix, you can do something like this:
auto S=sizeof(float);
for (int d2 = 0; d2 < 129; ++d2) {
for (int d3 = 0; d3 < 129; ++d3) {
for (int d4 = 0; d4 < 10; ++d4) {
float val = p[(d2*prob.step[3]/S + d3*prob.step[2]/S + d4* prob.step[1]/S)];
}
}
}
Related
I'm working on some optimization and want to convert some parts from Python to C++.
Is it possible to convert this code to c++ with opencv?
The python code uses numpy
import numpy as np
from PIL import Image
# Load the image and turn it into a NumPy array.
pil_img = Image.open(input_filename)
img = np.array(pil_img)
# Flatten to one row per pixel with three values each.
pixels = img.reshape((-1, 3))
num_pixels = pixels.shape[0]
# NOTE(review): this asks for 5x the number of pixels, not 5 percent; the slice
# below can return at most num_pixels rows.
num_samples = int(num_pixels*5)
# Build the index list 0..num_pixels-1 and shuffle it in place.
idx = np.arange(num_pixels)
np.random.shuffle(idx)
# Pick the pixels at the first num_samples shuffled indices.
samples = pixels[idx[:num_samples]]
update
/// Draws a random sample of pixels from an image.
///
/// @param m               Source image; must be stored continuously (asserted),
///                        because pixels are addressed through one flat pointer.
/// @param sample_percent  Share of the pixels to draw, in percent. Values
///                        outside 0..100 are clamped to that range.
/// @return The raw bytes of the sampled pixels, m.elemSize() bytes per pixel,
///         in shuffled order.
///
/// The shuffle engine uses a fixed seed, so repeated calls on the same image
/// return the same sample.
std::vector<uchar> sample_pixels(const cv::Mat& m, int sample_percent=5){
    assert(m.isContinuous());
    const auto* input = m.ptr<uchar>();
    const size_t bytes_per_pixel = m.elemSize();

    // sample_percent is a percentage, so it has to be divided by 100; the
    // intermediate product is widened to avoid overflow on large images.
    const int percent = std::min(std::max(sample_percent, 0), 100);
    const int num_pixels = static_cast<int>(m.total());
    const int num_samples =
        static_cast<int>(static_cast<long long>(num_pixels) * percent / 100);

    std::cout
        << "num pixels: " << num_pixels << '\n'
        << "num samples: " << num_samples << '\n';

    std::vector<uchar> samples(static_cast<size_t>(num_samples) * bytes_per_pixel);

    // Fills idx with sequentially increasing values
    std::vector<int> idx(num_pixels);
    std::iota(idx.begin(), idx.end(), 0);

    // Shuffle idx
    std::mt19937 engine(0);
    std::shuffle(idx.begin(), idx.end(), engine);

    // Copy whole pixels (all channels) so that channel values stay together.
    for(int i = 0; i < num_samples; i++){
        const uchar* pixel = input + static_cast<size_t>(idx[i]) * bytes_per_pixel;
        std::copy(pixel, pixel + bytes_per_pixel,
                  samples.begin() + static_cast<size_t>(i) * bytes_per_pixel);
    }
    return samples;
}
This is the equivalent code in C++11. This should be several times faster than your python code.
#include <random>
#include <numeric>
#include <opencv2/opencv.hpp>
void shuffling(const std::string &input_filename, const std::string &output_filename) {
// ========== UPDATE ==========
const cv::Mat plain_input_mat = cv::imread(input_filename, -1);
// Equivalent to img.reshape((-1, 3))
const cv::Mat input_mat = plain_input_mat.reshape(3);
// ============================
// By doing this, you can access the pixels without any extra checks.
assert(input_mat.isContinuous());
const auto *input = input_mat.ptr<cv::Vec3b>();
const auto num_samples = input_mat.total();
std::vector<cv::Vec3b> output(num_samples);
std::vector<int> idx(input_mat.total());
std::iota(idx.begin(), idx.end(), 0); // Equivalent to arange.
// Note: numpy uses PCG64 which does not exist in the std library.
std::mt19937 engine(0);
std::shuffle(idx.begin(), idx.end(), engine);
for (int i = 0; i < num_samples; i++) {
output[i] = input[idx[i]];
}
// Save as an image if necessary.
auto output_mat = cv::Mat(output, false);
cv::imwrite(output_filename, output_mat);
}
There are a couple of additional notes.
Note1: Due to the difference in the shuffle algorithm between python and std, the results are not exactly the same.
Note2: With your code, num_samples cannot be larger than the number of pixels in the input image, which seems to be a bug. Please check the length of the samples.
Note3: In both implementations, the most expensive part is the shuffle: 60% of the time for Python and more than 80% for C++ is spent there. If you want to optimize further, this is definitely where you should focus.
I implemented a simple distance-based LOD for future terrain rendering using tessellation control/evaluation shaders in OpenGL. Surprisingly, I still get cracks (T-junctions) in my output...
The shader code is
// Transform applied to every generated vertex in the evaluation stage
// (presumably projection * view * model — confirm on the application side).
uniform mat4 pvmM;
// Camera position; only its .xyz part is read, as the reference point for the level selection.
uniform vec4 u_camerapos;
#ifdef TESSELATION_CONTROL_SHADER
layout(vertices = 4) out;
// Returns a tessellation level for the edge p1-p2: measures the distance from
// the edge midpoint to p3 and maps it to a level from 64 (nearest) down to 1.
unsigned int dist(vec3 p1,vec3 p2,vec3 p3)
{
float d=distance((p1+p2)/2.0,p3); // precise float d=(...)
if (d<1.5) return 64;
if (d<2.5) return 32;
if (d<3) return 16;
if (d<4) return 8;
if (d<5) return 4;
if (d<6) return 2;
return 1;
}
void main() {
// The per-patch levels are written once, by invocation 0 only.
if (gl_InvocationID == 0){
// NOTE(review): each outer level is computed from a pair of consecutive
// control points (3-0, 0-1, 1-2, 2-3). These pairs are real patch edges only
// if the index buffer lists the four corners in perimeter order — confirm
// against the code that generates the faces.
gl_TessLevelOuter[0] = dist( gl_in[3].gl_Position.xyz,gl_in[0].gl_Position.xyz,u_camerapos.xyz);
gl_TessLevelOuter[1] = dist( gl_in[0].gl_Position.xyz,gl_in[1].gl_Position.xyz,u_camerapos.xyz);
gl_TessLevelOuter[2] = dist( gl_in[1].gl_Position.xyz,gl_in[2].gl_Position.xyz,u_camerapos.xyz);
gl_TessLevelOuter[3] = dist( gl_in[2].gl_Position.xyz,gl_in[3].gl_Position.xyz,u_camerapos.xyz);
// Both inner levels use the floor of the mean of the four outer levels.
float mean=floor((gl_TessLevelOuter[0]+gl_TessLevelOuter[1]+gl_TessLevelOuter[2]+gl_TessLevelOuter[3])/4.0);
gl_TessLevelInner[0] = int(mean);
gl_TessLevelInner[1] = int(mean);
}
// Every invocation forwards its own control point unchanged.
gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID].gl_Position;
}
#endif
#ifdef TESSELATION_EVALUATION_SHADER
void main() {
// Bilinear interpolation: p1 lies between control points 0 and 1, p2 between
// 2 and 3 (both at gl_TessCoord.x); the result blends p1 and p2 by gl_TessCoord.y.
vec4 p1 = mix(gl_in[0].gl_Position, gl_in[1].gl_Position, gl_TessCoord.x);
vec4 p2 = mix(gl_in[2].gl_Position, gl_in[3].gl_Position, gl_TessCoord.x);
gl_Position = pvmM*mix(p1, p2, gl_TessCoord.y);
}
#endif
Using python:
vertices and indices are generated using
# Number of vertices along each side of the square grid.
blocksz=8
# blocksz x blocksz vertices in the y=0 plane, stored row by row (x varies fastest).
vertices=np.array([(x,0,z) for z in range(blocksz) for x in range(blocksz)],dtype='f4')
# Shift the grid by half its size along x and z.
vertices=vertices-np.array([blocksz/2,0,blocksz/2])
# Four vertex indices per cell, in the order (x,z), (x+1,z), (x,z+1), (x+1,z+1).
# NOTE(review): the indices are 32-bit ('i4'); check that this matches the index
# type passed to the draw call.
faces= np.array([(x+z*blocksz,x+1+z*blocksz,x+(z+1)*blocksz,x+1+(z+1)*blocksz ) for z in range(blocksz-1) for x in range(blocksz-1) ],dtype='i4')
and drawn using
glDrawElements(GL_PATCHES,count,GL_UNSIGNED_SHORT,None)
On the image below,
you can see the results. Some T-junctions are present along the x axis (left-right, for example in the first and last columns) but never along the z axis (top-bottom). The virtual camera is represented by the red dot, and its position is passed as a uniform.
All the rest is working as expected.
Any idea of what I'm mis-understanding?
OK, found the bug.
When generating the index array (faces), I used a wrong winding (same winding as for GL_QUAD_STRIP). The correct faces array should be:
faces= np.array([(x+z*blocksz,x+1+z*blocksz,x+1+(z+1)*blocksz,x+(z+1)*blocksz ) for z in range(blocksz-1) for x in range(blocksz-1) ],dtype='i4')
Then, in the tessellation evaluation shader (notice that points 2 and 3 are flipped):
vec4 p1 = mix(gl_in[0].gl_Position, gl_in[1].gl_Position, gl_TessCoord.x);
vec4 p2 = mix(gl_in[3].gl_Position, gl_in[2].gl_Position, gl_TessCoord.x);
I've retrained an InceptionV3 model via the TensorFlow for Poets tutorials and can successfully run label_image.py on my training data and on new data and get correct labels with good accuracy. Awesome!
If I run my model through my Mac Obj-C++ app my resulting labels are wildly different.
For example - my training is to classify which 'shot type' a frame of video is, (extreme close up, close up, medium, long, extreme long) for classifying video editing content.
label_image.py classifies a frame from a video as 85% likely close up.
My C++ / Obj-C App run with the same frame classifies it as Extreme Long with 60%
Both are running the same version of Tensorflow (1.1) on Mac OS X CPU compiled with AVX/SIMD/FMA optimizations.
My Apps pipeline :
I have a BGR ordered OpenCV Mat image which I can use successfully elsewhere and get sane results from. I create this CV Mat from an OS X CVPixelBufferRef mapped to a BGRA CV MAT like so:
cv::cvtColor(BGRAImage, frameMat, cv::COLOR_BGRA2BGR);
I feed that BGR CV Mat (named frameMat) into a Tensor via code borrowed from the iOS contrib example, like so :
// Fills resized_tensor with a nearest-neighbour resized, normalised copy of
// frameMat (8-bit BGR, see the cvtColor call that produces it).

// Mat::data is the first pixel of this Mat, also when the Mat is a sub-region
// of a larger buffer.
void* baseAddress = (void*)frameMat.data;
size_t width = (size_t) frameMat.cols;
size_t height = (size_t) frameMat.rows;
// Use the Mat's own row stride: rows can be padded, so cols * 3 is not
// guaranteed to be the distance between two rows.
size_t bytesPerRow = (size_t) frameMat.step;
const int image_channels = frameMat.channels(); // 3 (BGR)

const int wanted_input_width = 299;
const int wanted_input_height = 299;
const int wanted_input_channels = 3;
const float input_mean = 128.0f;
const float input_std = 128.0f;

resized_tensor = tensorflow::Tensor( tensorflow::DT_FLOAT, tensorflow::TensorShape({1, wanted_input_height, wanted_input_width, wanted_input_channels}));
auto image_tensor_mapped = resized_tensor.tensor<float, 4>();

const tensorflow::uint8 *in = (const tensorflow::uint8 *)baseAddress;
float *out = image_tensor_mapped.data();

for (int y = 0; y < wanted_input_height; ++y)
{
    // The output row y samples the source row in_y, and the output column x
    // samples the source column in_x; mixing the two up transposes the image.
    const int in_y = (y * (int)height) / wanted_input_height;
    const tensorflow::uint8 *in_row = in + (in_y * bytesPerRow);
    float *out_row = out + (y * wanted_input_width * wanted_input_channels);

    for (int x = 0; x < wanted_input_width; ++x)
    {
        const int in_x = (x * (int)width) / wanted_input_width;
        const tensorflow::uint8 *in_pixel = in_row + (in_x * image_channels);
        float *out_pixel = out_row + (x * wanted_input_channels);

        // Interestingly the iOS example uses BGRA and DOES NOT re-order tensor channels to RGB <-> BGR
        // Matching that.
        for (int c = 0; c < wanted_input_channels; ++c)
        {
            out_pixel[c] = ((float)in_pixel[c] - input_mean) / input_std;
        }
    }
}
My session creation code:
// Load the serialized graph from inception2015GraphPath into inceptionGraphDef.
tensorflow::Status load_graph_status = ReadBinaryProto(tensorflow::Env::Default(), [inception2015GraphPath cStringUsingEncoding:NSUTF8StringEncoding], &inceptionGraphDef);
// NOTE(review): a failed load is silently skipped here; nothing reports load_graph_status.
if (load_graph_status.ok())
{
// Create a session with default options and hand it the loaded graph.
tensorflow::SessionOptions options;
inceptionSession = std::unique_ptr<tensorflow::Session>(tensorflow::NewSession(options));
// NOTE(review): session_create_status is never inspected in this snippet.
tensorflow::Status session_create_status = inceptionSession->Create(inceptionGraphDef);
}
Running the graph:
tensorflow::Status run_status = inceptionSession->Run({ {input_layer, resized_tensor} }, {feature_layer, final_layer}, {}, &outputs);
And pulling out the labels / feature vector (penultimate layer)
// Collects the label scores (outputs[1]) and the feature vector (outputs[0])
// of one inference run, updates the running per-label score sums and the
// running element-wise maximum of the feature vector, and returns the
// per-frame results as a dictionary.
NSMutableArray* outputLabels = [NSMutableArray arrayWithCapacity:self.labelsArray.count];
NSMutableArray* outputScores = [NSMutableArray arrayWithCapacity:self.labelsArray.count];

// 1 = labels and scores
auto predictions = outputs[1].flat<float>();
for (int index = 0; index < predictions.size(); index += 1)
{
    const float predictionValue = predictions(index);

    // NOTE(review): index is always smaller than predictions.size(), so this
    // modulo never changes the index; labelsArray must have at least as many
    // entries as there are predictions.
    NSString* labelKey = self.labelsArray[index % predictions.size()];

    // Add this frame's score to the running sum for the label.
    NSNumber* currentLabelScore = self.averageLabelScores[labelKey];
    NSNumber* incrementedScore = @([currentLabelScore floatValue] + predictionValue );
    self.averageLabelScores[labelKey] = incrementedScore;

    [outputLabels addObject:labelKey];
    [outputScores addObject:@(predictionValue)];
}

// 0 is feature vector
tensorflow::Tensor feature = outputs[0];
int64_t numElements = feature.NumElements();
tensorflow::TTypes<float>::Flat featureVec = feature.flat<float>();

NSMutableArray* featureElements = [NSMutableArray arrayWithCapacity:numElements];
for(int64_t i = 0; i < numElements; i++)
{
    [featureElements addObject:@( featureVec(i) ) ];
}

if(self.averageFeatureVec == nil)
{
    // Keep a separate copy: the running vector is modified in place on later
    // frames, and featureElements is also handed back to the caller below.
    self.averageFeatureVec = [featureElements mutableCopy];
}
else
{
    // Keep the element-wise maximum of this vector and the running one.
    for(NSUInteger i = 0; i < featureElements.count; i++)
    {
        float a = [featureElements[i] floatValue];
        float b = [self.averageFeatureVec[i] floatValue];
        self.averageFeatureVec[i] = @( MAX(a,b)) ;
    }
}

return @{ kSynopsisStandardMetadataFeatureVectorDictKey : featureElements ,
          @"Labels" : outputLabels,
          @"Scores" : outputScores,
};
I've attempted to look into the tensor ordering (NHWC), and have checked the tensor creation code but I might be missing something obvious to others. Ive also tried changing channel order, to no avail.
Any insight would be greatly helpful. Thank you!
My usual method for debugging issues like this is:
First save out a raw C array of values from an example input that I know works. For example, make sure that label_image works with your newly-trained model, and then write out the float* array you get from input_layer->flat<float>().data(), using pseudo-code like this:
// Prints the contents of the input tensor as the source text of a C array
// named g_test_input, so the values can be pasted into other code.
float* input_data = input_layer->flat<float>().data();
int input_data_count = input_layer->flat<float>().size();
// Opening line of the array definition.
printf("float g_test_input[]={\n");
// One value per line, each followed by a comma.
for (int i = 0; i < input_data_count; ++i) {
printf(" %f,\n", input_data[i]);
}
// Closing line of the array definition.
printf("};\n");
You should end up with a big array that you can copy into your new code. Overwrite whatever input you have in the code you want to test. Now run it, and you should see the same output that you saw from label_image. If you don't, you know there's something different about the model you're loading. If the output is identical, then you know that the input preprocessing is different.
Assuming that it's the preprocessing that's wrong, my next step is to try loading an image from disk. The iOS example code does that in the simple example. Save out some of your expected input into an image file, and then make sure that both label_image and your code produce the same result.
So this one is tricky.
I failed to mention I was running the graph_transform tool on my retrained graph - and was running quantize weights to lower my graphs size. In the past, I've not had an issue with this messing up classification scores at all, but apparently that caused an issue.
Running the above code with a graph transform call without quantize weights fixed the issue.
I am writing a thin wrapper around ArUco augmented reality library (which is based on OpenCV). An interface I am trying to build is very simple:
Python passes image to C++ code;
C++ code detects markers and returns their locations and other info to Python as tuple of dicts.
However, I couldn't figure out how to represent an image in Python to pass it to C++. For GUI and camera management I am going to use PyQt, so initially it is going to be QImage, but I can't simply pass it to OpenCV (or I can?). At first, I tried to use nested tuples to represent row, column and color of each pixel, so I ended up with this sample code:
using namespace cv;
namespace py = boost::python;
void display(py::tuple pix)
{
    /*
    Takes an image from Python as nested tuples (rows -> pixels -> three
    channel values), copies it into a Mat and shows it in a window until a
    key is pressed.
    */
    Mat img(py::len(pix), py::len(pix[0]), CV_8UC3, Scalar(0, 0, 255));

    for (int row = 0; row < py::len(pix); ++row)
    {
        py::object line = pix[row];
        for (int col = 0; col < py::len(line); ++col)
        {
            py::object pixel = line[col];
            Vec3b& target = img.at<Vec3b>(row, col);
            for (int channel = 0; channel < 3; ++channel)
            {
                const int value = py::extract<int>(pixel[channel]);
                target[channel] = value;
            }
        }
    }

    imshow("Image", img);
    waitKey(0);
}
// Defines the Python extension module "aruco" and exposes the C++ function
// display() in it under the name "display".
BOOST_PYTHON_MODULE(aruco)
{
py::def("display", display);
}
It turned out to be painfully slow (a few seconds for a single frame), so I went googling and found solution that should be much faster: use NumPy arrays, so the code would look something like that:
// Sketch of the NumPy-based variant: takes the array from Python as a generic
// py::object and shows it in a window until a key is pressed.
// The conversion to Mat is still a placeholder, so img stays empty here.
void display(py::object array)
{
Mat img;
// ... some magic here to convert NumPy array to Mat ...
imshow("Image", img);
waitKey(0);
}
However, I have no idea how to convert NumPy Array (which in C++ level is just a Python Object) to OpenCV Mat. I would appreciate any help here.
Alternatively, maybe NumPy is not really needed, so I could just pass QImage Python object directly to C++ layer? Or maybe there is a different approach to this problem? Any advice is appreciated!
The best solution in your situation is to use a custom boost::python converter for cv::Mat objects. OpenCV has a Python wrapper, and when you use this wrapper you operate on NumPy arrays - you don't even need to know that those arrays are converted to cv::Mat objects while "crossing the C++ <-> Python border". Writing such a converter for a simple type is quite easy; however, creating a converter for cv::Mat isn't simple. Fortunately, someone else has already done this - here is a version for OpenCV 2.x and here for 3.x. If you are not familiar with boost::python converters, this article should help you.
Hope it helps. If you have any problems, let us know.
I wrote this example for those who didn't know there is a Boost NumPy module. It shows how to convert a Mat to an ndarray and vice versa, and should give you an idea of how to convert an ndarray.
#define BOOST_PYTHON_STATIC_LIB
#define BOOST_LIB_NAME "boost_numpy35"
//#include <boost/config/auto_link.hpp>
#include <boost/python.hpp>
#include <boost/python/numpy.hpp>
#include <iostream>
#include <opencv2/opencv.hpp>
namespace py = boost::python;
namespace np = boost::python::numpy;
/// Starts the embedded Python interpreter and the Boost.NumPy module.
/// Call this once before using any py:: or np:: functionality.
void Init() {
    // set your python location.
    // The buffer is static on purpose: Py_SetPythonHome expects a string in
    // static storage that stays unchanged for the lifetime of the program, so
    // a function-local array would leave it with a dangling pointer.
    static wchar_t python_home[] = L"D:\\Anaconda3\\envs\\tensorflow_vision";
    Py_SetPythonHome(python_home);
    Py_Initialize();
    np::initialize();
}
/// Wraps the pixel data of an 8-bit cv::Mat in a (rows, cols, channels) uint8
/// ndarray without copying.
///
/// @param mat  Source matrix; must have depth CV_8U.
/// @return An ndarray that refers to the memory of mat. It does not own that
///         memory, so mat has to stay alive (and unchanged in size) for as
///         long as the ndarray is used; clone the Mat first if needed.
/// @throws cv::Exception if mat is not an 8-bit matrix.
np::ndarray ConvertMatToNDArray(const cv::Mat& mat) {
    // The dtype below is fixed to uchar, so other depths would be misread.
    CV_Assert(mat.depth() == CV_8U);

    const py::tuple shape = py::make_tuple(mat.rows, mat.cols, mat.channels());
    // Take the row stride from the Mat itself: rows of a non-continuous Mat
    // (for example a region of interest) are further apart than
    // cols * channels bytes.
    const py::tuple stride = py::make_tuple(mat.step[0], mat.elemSize(), mat.elemSize1());
    const np::dtype dt = np::dtype::get_builtin<uchar>();

    // py::object() as the owner means that nothing keeps the data alive on
    // behalf of the ndarray.
    return np::from_data(mat.data, dt, shape, stride, py::object());
}
/// Copies a three-dimensional uint8 ndarray of shape (rows, cols, channels)
/// into a newly allocated cv::Mat.
///
/// @param ndarr  Source array; any stride layout is accepted.
/// @return The copied matrix, or an empty cv::Mat (after printing a message)
///         if the array is not 3-dimensional, is not uint8, or has an
///         unsupported channel count.
cv::Mat ConvertNDArrayToMat(const np::ndarray& ndarr) {
    // shape[2] is read below, so the array must really have three dimensions.
    if (ndarr.get_nd() != 3) {
        std::cout << "wrong shape error" << std::endl;
        return cv::Mat();
    }

    // Keep the str object alive in a named variable: the extracted pointer
    // refers to its internal buffer and would dangle if it came from a temporary.
    const py::str dtype_name(ndarr.get_dtype());
    const char* dtype_str = py::extract<const char*>(dtype_name);

    // you should find proper type for c++. in this case we use 'CV_8UC3' image, so we need to create 'uchar' type Mat.
    if (strcmp(dtype_str, "uint8") != 0) {
        std::cout << "wrong dtype error" << std::endl;
        return cv::Mat();
    }

    const Py_intptr_t* shape = ndarr.get_shape();     // size of each dimension
    const Py_intptr_t* strides = ndarr.get_strides(); // byte distance per dimension

    // variables for creating Mat object
    const int rows = static_cast<int>(shape[0]);
    const int cols = static_cast<int>(shape[1]);
    const int channel = static_cast<int>(shape[2]);
    if (channel < 1 || channel > CV_CN_MAX) {
        std::cout << "wrong shape error" << std::endl;
        return cv::Mat();
    }

    cv::Mat mat(rows, cols, CV_MAKETYPE(CV_8U, channel)); // e.g. CV_8UC3
    if (mat.empty()) {
        return mat;
    }

    const char* src = ndarr.get_data();
    const bool packed = strides[2] == 1
        && strides[1] == channel
        && strides[0] == static_cast<Py_intptr_t>(cols) * channel;
    if (packed) {
        // Same layout as the freshly created Mat: one block copy is enough.
        memcpy(mat.data, src, sizeof(uchar) * rows * cols * channel);
    }
    else {
        // Sliced or transposed array: follow the strides element by element.
        for (int r = 0; r < rows; ++r) {
            uchar* dst_row = mat.ptr<uchar>(r);
            for (int c = 0; c < cols; ++c) {
                for (int ch = 0; ch < channel; ++ch) {
                    dst_row[c * channel + ch] = static_cast<uchar>(
                        src[r * strides[0] + c * strides[1] + ch * strides[2]]);
                }
            }
        }
    }
    return mat;
}
int main()
{
using namespace std;
try
{
// initialize boost python and numpy
Init();
// import module
py::object main_module = py::import("__main__");
py::object print = main_module.attr("__builtins__").attr("print"); // this is for printing python object
// get image
cv::Mat img;
img = cv::imread("Lenna.jpg", cv::IMREAD_COLOR);
if (img.empty())
{
std::cout << "can't getting image" << std::endl;
return -1;
}
// convert Mat to NDArray
cv::Mat cloneImg = img.clone(); // converting functions will access to same data between Mat and NDArray. so we should clone Mat object. This may important in your case.
np::ndarray ndImg = ConvertMatToNDArray(cloneImg);
// You can check if it's properly converted.
//print(ndImg);
// convert NDArray to Mat
cv::Mat matImg = ConvertNDArrayToMat(ndImg); // also you can convert ndarray to mat.
// add 10 brightness to converted image
for (int i = 0; i < matImg.rows; i++) {
for (int j = 0; j < matImg.cols; j++) {
for (int c = 0; c < matImg.channels(); c++) {
matImg.at<cv::Vec3b>(i, j)[c] += 10;
}
}
}
// show image
cv::imshow("original image", img);
cv::imshow("converted image", matImg);
cv::waitKey(0);
cv::destroyAllWindows();
}
catch (py::error_already_set&)
{
PyErr_Print();
system("pause");
}
system("pause");
return 0;
}
Optionally, if you don't like to use wrappers, and want to use native python extension module, you can do it like this.
python3:
my_image = cv.imread("my_image.jpg", 1) # reads colorfull image in python
dims = my_image.shape # get image shape (h, w, c)
my_image = my_image.ravel() # flattens 3d array into 1d
cppextenionmodule.np_to_mat(dims, my_image)
c++:
// Extension-module entry point: np_to_mat(dims, flat_image).
//
// dims       - tuple (rows, cols, channels)
// flat_image - one-dimensional uint8 NumPy array with rows * cols * channels
//              elements (the raveled image)
//
// Copies the pixels into a cv::Mat of matching size and channel count.
// Returns None, or NULL with a Python exception set if the arguments are invalid.
static PyObject *np_to_mat(PyObject *self, PyObject *args){
    PyObject *size;
    PyArrayObject *image;

    if (!PyArg_ParseTuple(args, "O!O!", &PyTuple_Type, &size, &PyArray_Type, &image)) {
        return NULL;
    }
    if (PyTuple_Size(size) != 3) {
        PyErr_SetString(PyExc_ValueError, "dims must be a (rows, cols, channels) tuple");
        return NULL;
    }

    const long rows = PyLong_AsLong(PyTuple_GetItem(size ,0));
    const long cols = PyLong_AsLong(PyTuple_GetItem(size ,1));
    const long nchannels = PyLong_AsLong(PyTuple_GetItem(size ,2));
    if (PyErr_Occurred()) {
        return NULL; // a tuple entry was not an integer
    }
    if (rows <= 0 || cols <= 0 || nchannels <= 0 || nchannels > CV_CN_MAX) {
        PyErr_SetString(PyExc_ValueError, "rows, cols and channels must be positive");
        return NULL;
    }

    const npy_intp count = (npy_intp)rows * cols * nchannels;
    if (PyArray_NDIM(image) != 1 || PyArray_TYPE(image) != NPY_UINT8 || PyArray_SIZE(image) != count) {
        PyErr_SetString(PyExc_ValueError, "image must be a flat uint8 array with rows * cols * channels elements");
        return NULL;
    }

    // Let the Mat own its pixel memory: a stack buffer would overflow for
    // large images and would not outlive this function.
    cv::Mat my_img((int)rows, (int)cols, CV_MAKETYPE(CV_8U, (int)nchannels));
    uchar *dst = my_img.data; // a newly created Mat is continuous
    for (npy_intp i = 0; i < count; i++) {
        dst[i] = *(const uchar *)PyArray_GETPTR1(image, i);
    }

    // ... whatever with the image

    Py_RETURN_NONE;
}
Here is a pybind11 version of afewthings/DomQ's answer. I found pybind11 was better for my project than boost::python (both libraries are quite nice)
// convert a cv::Mat to an np.array
// convert a cv::Mat to an np.array
//
// Copies an 8-bit, at most two-dimensional cv::Mat into a new NumPy array of
// shape (height, width, channels). The array owns its copy of the pixels, so
// it stays valid after the Mat is gone.
// Throws py::value_error for any other kind of matrix.
py::array to_array(const cv::Mat& im) {
    if (im.depth() != CV_8U || im.dims > 2) {
        throw py::value_error("to_array: only 8-bit matrices with up to two dimensions are supported");
    }

    // The copy below reads the pixels as one block, which needs a continuous
    // Mat; clone() produces one for regions of interest and similar views.
    const cv::Mat src = im.isContinuous() ? im : im.clone();

    const ssize_t channels = src.channels();
    const ssize_t height = src.rows;
    const ssize_t width = src.cols;
    const ssize_t count = height * width * channels;

    auto data = new uchar[count];
    std::copy(src.data, src.data + count, data);

    // The capsule releases the buffer once the array no longer needs it.
    py::capsule owner(data, [](void* f){
        delete[] reinterpret_cast<uchar*>(f);
    });

    return py::array_t<uchar>(
        py::buffer_info(
            data,
            sizeof(uchar), //itemsize
            py::format_descriptor<uchar>::format(),
            3, // ndim: always (height, width, channels), whatever the channel count
            std::vector<ssize_t> { height, width, channels }, // shape
            std::vector<ssize_t> { width * channels, channels, static_cast<ssize_t>(sizeof(uchar)) } // strides in bytes
        ),
        owner
    );
}
// convert an np.array to a cv::Mat
// convert an np.array to a cv::Mat
//
// Copies a three-dimensional uint8 array of shape (rows, cols, channels) into
// a new cv::Mat. Arrays that are not C-contiguous (slices, transposes) are
// brought into contiguous form first.
// Prints a message and returns an empty cv::Mat if the array has another
// dtype, another number of dimensions, or an unsupported channel count.
cv::Mat from_array(const py::array& ar) {
    if (!ar.dtype().is(py::dtype::of<uchar>())) {
        std::cout << "ERROR unsupported dtype!" << std::endl;
        return cv::Mat();
    }
    // shape(2) is read below, so the array must really have three dimensions.
    if (ar.ndim() != 3) {
        std::cout << "ERROR unsupported number of dimensions!" << std::endl;
        return cv::Mat();
    }

    // memcpy needs the elements in C order without gaps; ensure() returns the
    // array itself if it already is, and a contiguous copy otherwise.
    const auto packed = py::array_t<uchar, py::array::c_style>::ensure(ar);
    if (!packed) {
        std::cout << "ERROR unsupported memory layout!" << std::endl;
        return cv::Mat();
    }

    const int rows = static_cast<int>(packed.shape(0));
    const int cols = static_cast<int>(packed.shape(1));
    const int channels = static_cast<int>(packed.shape(2));
    if (channels < 1 || channels > CV_CN_MAX) {
        std::cout << "ERROR unsupported number of channels!" << std::endl;
        return cv::Mat();
    }

    const int type = CV_MAKETYPE(CV_8U, channels); // e.g. CV_8UC3
    cv::Mat mat(rows, cols, type);
    if (!mat.empty()) {
        memcpy(mat.data, packed.data(), sizeof(uchar) * rows * cols * channels);
    }
    return mat;
}
I currently have two matching point sets built into a numpy array of float32:
points1 =
[[ 346.70220947 9076.38476562]
[ 922.99554443 9096.4921875 ]
[ 776.96466064 9108.79101562]
[ 449.0173645 9080.61816406]
[ 2843.19433594 1226.93212891]
[ 779.95275879 9094.76855469]
[ 451.46853638 9092.5078125 ]
[ 3981.4621582 1237.50964355]
[ 132.38700867 9086.7890625 ]
[ 819.10943604 8286.74023438]
[ 1963.64025879 1220.06921387]
[ 1253.79321289 9095.75292969]]
points2 =
[[ 55110.36328125 9405.07519531]
[ 55686.71875 9423.63574219]
[ 55540.8515625 9435.80078125]
[ 55212.58203125 9408.00585938]
[ 57598.76171875 1551.92956543]
[ 55543.78125 9421.88769531]
[ 55214.40625 9420.46972656]
[ 58737.41796875 1561.14831543]
[ 54895.9296875 9414.58203125]
[ 55581.87109375 8613.87011719]
[ 56718.76953125 1546.02197266]
[ 56017.8125 9422.52050781]]
and I'm trying to run:
affine = cv2.estimateRigidTransform(points2,points1,True)
print affine
so that I can generate an affine matrix that can then be translated into a world file (.tfw). The world file is for GIS software that will project these on-the-fly.
At the moment I am getting an error:
Both input images must have either 8uC1 or 8uC3 type in function cvEstimateRigidTransform
I'm not really sure what's going on here. I thought I could use two points sets as parameters as long as I have 6 or more pairs.
Any thoughts or recommendations would be much appreciated!
I had the same weird error, but in Java. In my case, it seemed that estimateRigidTransform couldn't recognize that the two Mat images I was giving it were actually 2D point sets. So I applied a workaround to convert my match points from the MatOfKeyPoint type to the MatOfPoint2f type.
Here is the complete Java code (It's not Python, but maybe it will help you):
UPDATE: Filtering your matches is important, cause if you don't you may get an empty array as a result of the transform.
// ORB detector and descriptor extractor, plus a brute-force Hamming matcher.
FeatureDetector detector = FeatureDetector.create(FeatureDetector.ORB);
DescriptorExtractor descriptor = DescriptorExtractor.create(DescriptorExtractor.ORB);
DescriptorMatcher matcher = DescriptorMatcher.create(DescriptorMatcher.BRUTEFORCE_HAMMING);
// Load the first image as grayscale
Mat img1 = Imgcodecs.imread("img1_path", Imgcodecs.IMREAD_GRAYSCALE);
Mat img1_descriptors = new Mat();
MatOfKeyPoint img1_keypoints_mat = new MatOfKeyPoint();
// Detect keypoints, then compute their descriptors
detector.detect(img1, img1_keypoints_mat);
descriptor.compute(img1, img1_keypoints_mat, img1_descriptors);
// Load the second image as grayscale
Mat img2 = Imgcodecs.imread("img2_path", Imgcodecs.IMREAD_GRAYSCALE);
Mat img2_descriptors = new Mat();
MatOfKeyPoint img2_keypoints_mat = new MatOfKeyPoint();
// Detect keypoints, then compute their descriptors
detector.detect(img2, img2_keypoints_mat);
descriptor.compute(img2, img2_keypoints_mat, img2_descriptors);
// Match the descriptors of image 1 (query) against those of image 2 (train)
MatOfDMatch matOfDMatch = new MatOfDMatch();
matcher.match(img1_descriptors, img2_descriptors, matOfDMatch);
// Filtering the matches: keep only those whose distance is below three times
// the smallest distance found.
List<DMatch> dMatchList = matOfDMatch.toList();
double max_dist = 0.0;
double min_dist = 100.0;
// Iterate over the matches themselves, so the loop can never run past the end
// of the list if the matcher returned fewer matches than there are descriptors.
for (DMatch match : dMatchList) {
    double dist = match.distance;
    if (dist < min_dist) min_dist = dist;
    if (dist > max_dist) max_dist = dist;
}
// NOTE(review): if the best match has distance 0, the threshold below is 0 and
// no match passes the filter.
LinkedList<DMatch> good_matches = new LinkedList<>();
for (DMatch match : dMatchList) {
    if (match.distance < 3 * min_dist) {
        good_matches.addLast(match);
    }
}
// Converting to MatOfPoint2f format: collect, for every good match, the
// matched keypoint position in image 1 (query) and in image 2 (train).
LinkedList<Point> img1_points_list = new LinkedList<>();
LinkedList<Point> img2_points_list = new LinkedList<>();
List<KeyPoint> img1_keyPoints_list = img1_keypoints_mat.toList();
List<KeyPoint> img2_keyPoints_list = img2_keypoints_mat.toList();
// Walk the linked list with its iterator; looking entries up by index would
// traverse the list from its start on every access.
for (DMatch match : good_matches) {
    img1_points_list.addLast(img1_keyPoints_list.get(match.queryIdx).pt);
    img2_points_list.addLast(img2_keyPoints_list.get(match.trainIdx).pt);
}
MatOfPoint2f img1_point2f_mat = new MatOfPoint2f();
img1_point2f_mat.fromList(img1_points_list);
MatOfPoint2f img2_point2f_mat = new MatOfPoint2f();
img2_point2f_mat.fromList(img2_points_list);
// Draw the matches and save the picture as output.png.
// NOTE(review): this draws all matches (matOfDMatch), not only the filtered good_matches.
Mat output = new Mat();
Features2d.drawMatches(img1, img1_keypoints_mat, img2, img2_keypoints_mat, matOfDMatch, output);
Imgcodecs.imwrite("output.png", output);
// Estimate the transform from the matched point sets of image 1 to those of image 2.
Mat result = Video.estimateRigidTransform(img1_point2f_mat, img2_point2f_mat, true);
printMat(result); // Printing the optimal affine transformation 2x3 array
// The following variables correspond to the estimateRigidTransform result as shown here: https://stackoverflow.com/a/29511091/5165833
// a and b are read from row 0 of the result, c and d from row 1.
double a = result.get(0,0)[0];
double b = result.get(0,1)[0];
double d = result.get(1,1)[0];
double c = result.get(1,0)[0];
// Solving for scale as shown in the link above
double scale_x = Math.signum(a) * Math.sqrt( (a*a) + (b*b) );
double scale_y = Math.signum(d) * Math.sqrt( (c*c) + (d*d) );
System.out.println("a = "+a);
System.out.println("b = "+b);
System.out.println("scale_x = "+scale_x);
System.out.println("scale_y = "+scale_y);
}
/**
 * Prints the first channel of every element of a matrix to standard output:
 * one text line per matrix row, each value followed by a single space.
 *
 * @param m matrix to print
 */
public static void printMat(Mat m)
{
    int row = 0;
    while (row < m.height())
    {
        int col = 0;
        while (col < m.width())
        {
            final double value = m.get(row, col)[0];
            System.out.printf("%f", value);
            System.out.printf("%s", " ");
            col++;
        }
        System.out.printf("\n");
        row++;
    }
}