Blame - y2020/vision/sift/sift971.cc - RealtimeRoboticsGroup/test

blob: 6223f77ac5f12d3540cdf62bb62da452ef6650fe [file] [log] [blame]

Brian Silverman	f119612	2020-01-16 00:41:54 -0800	[diff] [blame^]	1	// clang-format off
				2	/*M///////////////////////////////////////////////////////////////////////////////////////
				3	//
				4	// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
				5	//
				6	// By downloading, copying, installing or using the software you agree to this license.
				7	// If you do not agree to this license, do not download, install,
				8	// copy or use the software.
				9	//
				10	//
				11	// License Agreement
				12	// For Open Source Computer Vision Library
				13	//
				14	// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
				15	// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
				16	// Third party copyrights are property of their respective owners.
				17	//
				18	// Redistribution and use in source and binary forms, with or without modification,
				19	// are permitted provided that the following conditions are met:
				20	//
				21	// * Redistribution's of source code must retain the above copyright notice,
				22	// this list of conditions and the following disclaimer.
				23	//
				24	// * Redistribution's in binary form must reproduce the above copyright notice,
				25	// this list of conditions and the following disclaimer in the documentation
				26	// and/or other materials provided with the distribution.
				27	//
				28	// * The name of the copyright holders may not be used to endorse or promote products
				29	// derived from this software without specific prior written permission.
				30	//
				31	// This software is provided by the copyright holders and contributors "as is" and
				32	// any express or implied warranties, including, but not limited to, the implied
				33	// warranties of merchantability and fitness for a particular purpose are disclaimed.
				34	// In no event shall the Intel Corporation or contributors be liable for any direct,
				35	// indirect, incidental, special, exemplary, or consequential damages
				36	// (including, but not limited to, procurement of substitute goods or services;
				37	// loss of use, data, or profits; or business interruption) however caused
				38	// and on any theory of liability, whether in contract, strict liability,
				39	// or tort (including negligence or otherwise) arising in any way out of
				40	// the use of this software, even if advised of the possibility of such damage.
				41	//
				42	//M*/
				43
				44	/**********************************************************************************************\
				45	Implementation of SIFT is based on the code from http://blogs.oregonstate.edu/hess/code/sift/
				46	Below is the original copyright.
				47
				48	// Copyright (c) 2006-2010, Rob Hess <hess@eecs.oregonstate.edu>
				49	// All rights reserved.
				50
				51	// The following patent has been issued for methods embodied in this
				52	// software: "Method and apparatus for identifying scale invariant features
				53	// in an image and use of same for locating an object in an image," David
				54	// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
				55	// filed March 8, 1999. Asignee: The University of British Columbia. For
				56	// further details, contact David Lowe (lowe@cs.ubc.ca) or the
				57	// University-Industry Liaison Office of the University of British
				58	// Columbia.
				59
				60	// Note that restrictions imposed by this patent (and possibly others)
				61	// exist independently of and may be in conflict with the freedoms granted
				62	// in this license, which refers to copyright of the program, not patents
				63	// for any methods that it implements. Both copyright and patent law must
				64	// be obeyed to legally use and redistribute this program and it is not the
				65	// purpose of this license to induce you to infringe any patents or other
				66	// property right claims or to contest validity of any such claims. If you
				67	// redistribute or use the program, then this license merely protects you
				68	// from committing copyright infringement. It does not protect you from
				69	// committing patent infringement. So, before you do anything with this
				70	// program, make sure that you have permission to do so not merely in terms
				71	// of copyright, but also in terms of patent law.
				72
				73	// Please note that this license is not to be understood as a guarantee
				74	// either. If you use the program according to this license, but in
				75	// conflict with patent law, it does not mean that the licensor will refund
				76	// you for any losses that you incur if you are sued for your patent
				77	// infringement.
				78
				79	// Redistribution and use in source and binary forms, with or without
				80	// modification, are permitted provided that the following conditions are
				81	// met:
				82	// * Redistributions of source code must retain the above copyright and
				83	// patent notices, this list of conditions and the following
				84	// disclaimer.
				85	// * Redistributions in binary form must reproduce the above copyright
				86	// notice, this list of conditions and the following disclaimer in
				87	// the documentation and/or other materials provided with the
				88	// distribution.
				89	// * Neither the name of Oregon State University nor the names of its
				90	// contributors may be used to endorse or promote products derived
				91	// from this software without specific prior written permission.
				92
				93	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
				94	// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
				95	// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
				96	// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				97	// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
				98	// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
				99	// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
				100	// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
				101	// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
				102	// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
				103	// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				104	\**********************************************************************************************/
				105	// clang-format on
				106
				107	#include "y2020/vision/sift/sift971.h"
				108
				109	#include <iostream>
				110	#include <mutex>
				111	#include <stdarg.h>
				112	#include <opencv2/core/hal/hal.hpp>
				113	#include <opencv2/imgproc.hpp>
				114
				115	using namespace cv;
				116
				117	namespace frc971 {
				118	namespace vision {
				119	namespace {
				120
				121	#define USE_AVX2 0
				122
				123	/***************************** Defs and macros ***************************/
				124
				125	// default width of descriptor histogram array
				126	static const int SIFT_DESCR_WIDTH = 4;
				127
				128	// default number of bins per histogram in descriptor array
				129	static const int SIFT_DESCR_HIST_BINS = 8;
				130
				131	// assumed gaussian blur for input image
				132	static const float SIFT_INIT_SIGMA = 0.5f;
				133
				134	// width of border in which to ignore keypoints
				135	static const int SIFT_IMG_BORDER = 5;
				136
				137	// maximum steps of keypoint interpolation before failure
				138	static const int SIFT_MAX_INTERP_STEPS = 5;
				139
				140	// default number of bins in histogram for orientation assignment
				141	static const int SIFT_ORI_HIST_BINS = 36;
				142
				143	// determines gaussian sigma for orientation assignment
				144	static const float SIFT_ORI_SIG_FCTR = 1.5f;
				145
				146	// determines the radius of the region used in orientation assignment
				147	static const float SIFT_ORI_RADIUS = 3 * SIFT_ORI_SIG_FCTR;
				148
				149	// orientation magnitude relative to max that results in new feature
				150	static const float SIFT_ORI_PEAK_RATIO = 0.8f;
				151
				152	// determines the size of a single descriptor orientation histogram
				153	static const float SIFT_DESCR_SCL_FCTR = 3.f;
				154
				155	// threshold on magnitude of elements of descriptor vector
				156	static const float SIFT_DESCR_MAG_THR = 0.2f;
				157
				158	// factor used to convert floating-point descriptor to unsigned char
				159	static const float SIFT_INT_DESCR_FCTR = 512.f;
				160
				161	#define DoG_TYPE_SHORT 0
				162	#if DoG_TYPE_SHORT
				163	// intermediate type used for DoG pyramids
				164	typedef short sift_wt;
				165	static const int SIFT_FIXPT_SCALE = 48;
				166	#else
				167	// intermediate type used for DoG pyramids
				168	typedef float sift_wt;
				169	static const int SIFT_FIXPT_SCALE = 1;
				170	#endif
				171
				172	static inline void unpackOctave(const KeyPoint &kpt, int &octave, int &layer,
				173	float &scale) {
				174	octave = kpt.octave & 255;
				175	layer = (kpt.octave >> 8) & 255;
				176	octave = octave < 128 ? octave : (-128 \| octave);
				177	scale = octave >= 0 ? 1.f / (1 << octave) : (float)(1 << -octave);
				178	}
				179
				180	static Mat createInitialImage(const Mat &img, bool doubleImageSize,
				181	float sigma) {
				182	Mat gray, gray_fpt;
				183	if (img.channels() == 3 \|\| img.channels() == 4) {
				184	cvtColor(img, gray, COLOR_BGR2GRAY);
				185	gray.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
				186	} else
				187	img.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
				188
				189	float sig_diff;
				190
				191	if (doubleImageSize) {
				192	sig_diff = sqrtf(
				193	std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA * 4, 0.01f));
				194	Mat dbl;
				195	#if DoG_TYPE_SHORT
				196	resize(gray_fpt, dbl, Size(gray_fpt.cols * 2, gray_fpt.rows * 2), 0, 0,
				197	INTER_LINEAR_EXACT);
				198	#else
				199	resize(gray_fpt, dbl, Size(gray_fpt.cols * 2, gray_fpt.rows * 2), 0, 0,
				200	INTER_LINEAR);
				201	#endif
				202	GaussianBlur(dbl, dbl, Size(), sig_diff, sig_diff);
				203	return dbl;
				204	} else {
				205	sig_diff = sqrtf(
				206	std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA, 0.01f));
				207	GaussianBlur(gray_fpt, gray_fpt, Size(), sig_diff, sig_diff);
				208	return gray_fpt;
				209	}
				210	}
				211
				212	} // namespace
				213
				214	void SIFT971_Impl::buildGaussianPyramid(const Mat &base, std::vector<Mat> &pyr,
				215	int nOctaves) const {
				216	std::vector<double> sig(nOctaveLayers + 3);
				217	pyr.resize(nOctaves * (nOctaveLayers + 3));
				218
				219	// precompute Gaussian sigmas using the following formula:
				220	// \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
				221	sig[0] = sigma;
				222	double k = std::pow(2., 1. / nOctaveLayers);
				223	for (int i = 1; i < nOctaveLayers + 3; i++) {
				224	double sig_prev = std::pow(k, (double)(i - 1)) * sigma;
				225	double sig_total = sig_prev * k;
				226	sig[i] = std::sqrt(sig_total * sig_total - sig_prev * sig_prev);
				227	}
				228
				229	for (int o = 0; o < nOctaves; o++) {
				230	for (int i = 0; i < nOctaveLayers + 3; i++) {
				231	Mat &dst = pyr[o * (nOctaveLayers + 3) + i];
				232	if (o == 0 && i == 0) dst = base;
				233	// base of new octave is halved image from end of previous octave
				234	else if (i == 0) {
				235	const Mat &src = pyr[(o - 1) * (nOctaveLayers + 3) + nOctaveLayers];
				236	resize(src, dst, Size(src.cols / 2, src.rows / 2), 0, 0, INTER_NEAREST);
				237	} else {
				238	const Mat &src = pyr[o * (nOctaveLayers + 3) + i - 1];
				239	GaussianBlur(src, dst, Size(), sig[i], sig[i]);
				240	}
				241	}
				242	}
				243	}
				244
				245	namespace {
				246
				247	class buildDoGPyramidComputer : public ParallelLoopBody {
				248	public:
				249	buildDoGPyramidComputer(int _nOctaveLayers, const std::vector<Mat> &_gpyr,
				250	std::vector<Mat> &_dogpyr)
				251	: nOctaveLayers(_nOctaveLayers), gpyr(_gpyr), dogpyr(_dogpyr) {}
				252
				253	void operator()(const cv::Range &range) const override {
				254	const int begin = range.start;
				255	const int end = range.end;
				256
				257	for (int a = begin; a < end; a++) {
				258	const int o = a / (nOctaveLayers + 2);
				259	const int i = a % (nOctaveLayers + 2);
				260
				261	const Mat &src1 = gpyr[o * (nOctaveLayers + 3) + i];
				262	const Mat &src2 = gpyr[o * (nOctaveLayers + 3) + i + 1];
				263	Mat &dst = dogpyr[o * (nOctaveLayers + 2) + i];
				264	subtract(src2, src1, dst, noArray(), DataType<sift_wt>::type);
				265	}
				266	}
				267
				268	private:
				269	int nOctaveLayers;
				270	const std::vector<Mat> &gpyr;
				271	std::vector<Mat> &dogpyr;
				272	};
				273
				274	} // namespace
				275
				276	void SIFT971_Impl::buildDoGPyramid(const std::vector<Mat> &gpyr,
				277	std::vector<Mat> &dogpyr) const {
				278	int nOctaves = (int)gpyr.size() / (nOctaveLayers + 3);
				279	dogpyr.resize(nOctaves * (nOctaveLayers + 2));
				280
				281	parallel_for_(Range(0, nOctaves * (nOctaveLayers + 2)),
				282	buildDoGPyramidComputer(nOctaveLayers, gpyr, dogpyr));
				283	}
				284
				285	namespace {
				286
				287	// Computes a gradient orientation histogram at a specified pixel
				288	static float calcOrientationHist(const Mat &img, Point pt, int radius,
				289	float sigma, float *hist, int n) {
				290	int i, j, k, len = (radius * 2 + 1) * (radius * 2 + 1);
				291
				292	float expf_scale = -1.f / (2.f * sigma * sigma);
				293	AutoBuffer<float> buf(len * 4 + n + 4);
				294	float X = buf, Y = X + len, Mag = X, Ori = Y + len, *W = Ori + len;
				295	float *temphist = W + len + 2;
				296
				297	for (i = 0; i < n; i++) temphist[i] = 0.f;
				298
				299	for (i = -radius, k = 0; i <= radius; i++) {
				300	int y = pt.y + i;
				301	if (y <= 0 \|\| y >= img.rows - 1) continue;
				302	for (j = -radius; j <= radius; j++) {
				303	int x = pt.x + j;
				304	if (x <= 0 \|\| x >= img.cols - 1) continue;
				305
				306	float dx = (float)(img.at<sift_wt>(y, x + 1) - img.at<sift_wt>(y, x - 1));
				307	float dy = (float)(img.at<sift_wt>(y - 1, x) - img.at<sift_wt>(y + 1, x));
				308
				309	X[k] = dx;
				310	Y[k] = dy;
				311	W[k] = (i * i + j * j) * expf_scale;
				312	k++;
				313	}
				314	}
				315
				316	len = k;
				317
				318	// compute gradient values, orientations and the weights over the pixel
				319	// neighborhood
				320	cv::hal::exp32f(W, W, len);
				321	cv::hal::fastAtan2(Y, X, Ori, len, true);
				322	cv::hal::magnitude32f(X, Y, Mag, len);
				323
				324	k = 0;
				325	#if CV_AVX2
				326	if (USE_AVX2) {
				327	__m256 __nd360 = _mm256_set1_ps(n / 360.f);
				328	__m256i __n = _mm256_set1_epi32(n);
				329	int CV_DECL_ALIGNED(32) bin_buf[8];
				330	float CV_DECL_ALIGNED(32) w_mul_mag_buf[8];
				331	for (; k <= len - 8; k += 8) {
				332	__m256i __bin =
				333	_mm256_cvtps_epi32(_mm256_mul_ps(__nd360, _mm256_loadu_ps(&Ori[k])));
				334
				335	__bin = _mm256_sub_epi32(
				336	__bin, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __bin), __n));
				337	__bin = _mm256_add_epi32(
				338	__bin, _mm256_and_si256(
				339	__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __bin)));
				340
				341	__m256 __w_mul_mag =
				342	_mm256_mul_ps(_mm256_loadu_ps(&W[k]), _mm256_loadu_ps(&Mag[k]));
				343
				344	_mm256_store_si256((__m256i *)bin_buf, __bin);
				345	_mm256_store_ps(w_mul_mag_buf, __w_mul_mag);
				346
				347	temphist[bin_buf[0]] += w_mul_mag_buf[0];
				348	temphist[bin_buf[1]] += w_mul_mag_buf[1];
				349	temphist[bin_buf[2]] += w_mul_mag_buf[2];
				350	temphist[bin_buf[3]] += w_mul_mag_buf[3];
				351	temphist[bin_buf[4]] += w_mul_mag_buf[4];
				352	temphist[bin_buf[5]] += w_mul_mag_buf[5];
				353	temphist[bin_buf[6]] += w_mul_mag_buf[6];
				354	temphist[bin_buf[7]] += w_mul_mag_buf[7];
				355	}
				356	}
				357	#endif
				358	for (; k < len; k++) {
				359	int bin = cvRound((n / 360.f) * Ori[k]);
				360	if (bin >= n) bin -= n;
				361	if (bin < 0) bin += n;
				362	temphist[bin] += W[k] * Mag[k];
				363	}
				364
				365	// smooth the histogram
				366	temphist[-1] = temphist[n - 1];
				367	temphist[-2] = temphist[n - 2];
				368	temphist[n] = temphist[0];
				369	temphist[n + 1] = temphist[1];
				370
				371	i = 0;
				372	#if CV_AVX2
				373	if (USE_AVX2) {
				374	__m256 __d_1_16 = _mm256_set1_ps(1.f / 16.f);
				375	__m256 __d_4_16 = _mm256_set1_ps(4.f / 16.f);
				376	__m256 __d_6_16 = _mm256_set1_ps(6.f / 16.f);
				377	for (; i <= n - 8; i += 8) {
				378	#if CV_FMA3
				379	__m256 __hist = _mm256_fmadd_ps(
				380	_mm256_add_ps(_mm256_loadu_ps(&temphist[i - 2]),
				381	_mm256_loadu_ps(&temphist[i + 2])),
				382	__d_1_16,
				383	_mm256_fmadd_ps(
				384	_mm256_add_ps(_mm256_loadu_ps(&temphist[i - 1]),
				385	_mm256_loadu_ps(&temphist[i + 1])),
				386	__d_4_16,
				387	_mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
				388	#else
				389	__m256 __hist = _mm256_add_ps(
				390	_mm256_mul_ps(_mm256_add_ps(_mm256_loadu_ps(&temphist[i - 2]),
				391	_mm256_loadu_ps(&temphist[i + 2])),
				392	__d_1_16),
				393	_mm256_add_ps(
				394	_mm256_mul_ps(_mm256_add_ps(_mm256_loadu_ps(&temphist[i - 1]),
				395	_mm256_loadu_ps(&temphist[i + 1])),
				396	__d_4_16),
				397	_mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
				398	#endif
				399	_mm256_storeu_ps(&hist[i], __hist);
				400	}
				401	}
				402	#endif
				403	for (; i < n; i++) {
				404	hist[i] = (temphist[i - 2] + temphist[i + 2]) * (1.f / 16.f) +
				405	(temphist[i - 1] + temphist[i + 1]) * (4.f / 16.f) +
				406	temphist[i] * (6.f / 16.f);
				407	}
				408
				409	float maxval = hist[0];
				410	for (i = 1; i < n; i++) maxval = std::max(maxval, hist[i]);
				411
				412	return maxval;
				413	}
				414
				415	//
				416	// Interpolates a scale-space extremum's location and scale to subpixel
				417	// accuracy to form an image feature. Rejects features with low contrast.
				418	// Based on Section 4 of Lowe's paper.
				419	static bool adjustLocalExtrema(const std::vector<Mat> &dog_pyr, KeyPoint &kpt,
				420	int octv, int &layer, int &r, int &c,
				421	int nOctaveLayers, float contrastThreshold,
				422	float edgeThreshold, float sigma) {
				423	const float img_scale = 1.f / (255 * SIFT_FIXPT_SCALE);
				424	const float deriv_scale = img_scale * 0.5f;
				425	const float second_deriv_scale = img_scale;
				426	const float cross_deriv_scale = img_scale * 0.25f;
				427
				428	float xi = 0, xr = 0, xc = 0, contr = 0;
				429	int i = 0;
				430
				431	for (; i < SIFT_MAX_INTERP_STEPS; i++) {
				432	int idx = octv * (nOctaveLayers + 2) + layer;
				433	const Mat &img = dog_pyr[idx];
				434	const Mat &prev = dog_pyr[idx - 1];
				435	const Mat &next = dog_pyr[idx + 1];
				436
				437	Vec3f dD(
				438	(img.at<sift_wt>(r, c + 1) - img.at<sift_wt>(r, c - 1)) * deriv_scale,
				439	(img.at<sift_wt>(r + 1, c) - img.at<sift_wt>(r - 1, c)) * deriv_scale,
				440	(next.at<sift_wt>(r, c) - prev.at<sift_wt>(r, c)) * deriv_scale);
				441
				442	float v2 = (float)img.at<sift_wt>(r, c) * 2;
				443	float dxx = (img.at<sift_wt>(r, c + 1) + img.at<sift_wt>(r, c - 1) - v2) *
				444	second_deriv_scale;
				445	float dyy = (img.at<sift_wt>(r + 1, c) + img.at<sift_wt>(r - 1, c) - v2) *
				446	second_deriv_scale;
				447	float dss = (next.at<sift_wt>(r, c) + prev.at<sift_wt>(r, c) - v2) *
				448	second_deriv_scale;
				449	float dxy =
				450	(img.at<sift_wt>(r + 1, c + 1) - img.at<sift_wt>(r + 1, c - 1) -
				451	img.at<sift_wt>(r - 1, c + 1) + img.at<sift_wt>(r - 1, c - 1)) *
				452	cross_deriv_scale;
				453	float dxs = (next.at<sift_wt>(r, c + 1) - next.at<sift_wt>(r, c - 1) -
				454	prev.at<sift_wt>(r, c + 1) + prev.at<sift_wt>(r, c - 1)) *
				455	cross_deriv_scale;
				456	float dys = (next.at<sift_wt>(r + 1, c) - next.at<sift_wt>(r - 1, c) -
				457	prev.at<sift_wt>(r + 1, c) + prev.at<sift_wt>(r - 1, c)) *
				458	cross_deriv_scale;
				459
				460	Matx33f H(dxx, dxy, dxs, dxy, dyy, dys, dxs, dys, dss);
				461
				462	Vec3f X = H.solve(dD, DECOMP_LU);
				463
				464	xi = -X[2];
				465	xr = -X[1];
				466	xc = -X[0];
				467
				468	if (std::abs(xi) < 0.5f && std::abs(xr) < 0.5f && std::abs(xc) < 0.5f)
				469	break;
				470
				471	if (std::abs(xi) > (float)(INT_MAX / 3) \|\|
				472	std::abs(xr) > (float)(INT_MAX / 3) \|\|
				473	std::abs(xc) > (float)(INT_MAX / 3))
				474	return false;
				475
				476	c += cvRound(xc);
				477	r += cvRound(xr);
				478	layer += cvRound(xi);
				479
				480	if (layer < 1 \|\| layer > nOctaveLayers \|\| c < SIFT_IMG_BORDER \|\|
				481	c >= img.cols - SIFT_IMG_BORDER \|\| r < SIFT_IMG_BORDER \|\|
				482	r >= img.rows - SIFT_IMG_BORDER)
				483	return false;
				484	}
				485
				486	// ensure convergence of interpolation
				487	if (i >= SIFT_MAX_INTERP_STEPS) return false;
				488
				489	{
				490	int idx = octv * (nOctaveLayers + 2) + layer;
				491	const Mat &img = dog_pyr[idx];
				492	const Mat &prev = dog_pyr[idx - 1];
				493	const Mat &next = dog_pyr[idx + 1];
				494	Matx31f dD(
				495	(img.at<sift_wt>(r, c + 1) - img.at<sift_wt>(r, c - 1)) * deriv_scale,
				496	(img.at<sift_wt>(r + 1, c) - img.at<sift_wt>(r - 1, c)) * deriv_scale,
				497	(next.at<sift_wt>(r, c) - prev.at<sift_wt>(r, c)) * deriv_scale);
				498	float t = dD.dot(Matx31f(xc, xr, xi));
				499
				500	contr = img.at<sift_wt>(r, c) * img_scale + t * 0.5f;
				501	if (std::abs(contr) * nOctaveLayers < contrastThreshold) return false;
				502
				503	// principal curvatures are computed using the trace and det of Hessian
				504	float v2 = img.at<sift_wt>(r, c) * 2.f;
				505	float dxx = (img.at<sift_wt>(r, c + 1) + img.at<sift_wt>(r, c - 1) - v2) *
				506	second_deriv_scale;
				507	float dyy = (img.at<sift_wt>(r + 1, c) + img.at<sift_wt>(r - 1, c) - v2) *
				508	second_deriv_scale;
				509	float dxy =
				510	(img.at<sift_wt>(r + 1, c + 1) - img.at<sift_wt>(r + 1, c - 1) -
				511	img.at<sift_wt>(r - 1, c + 1) + img.at<sift_wt>(r - 1, c - 1)) *
				512	cross_deriv_scale;
				513	float tr = dxx + dyy;
				514	float det = dxx * dyy - dxy * dxy;
				515
				516	if (det <= 0 \|\| tr * tr * edgeThreshold >=
				517	(edgeThreshold + 1) * (edgeThreshold + 1) * det)
				518	return false;
				519	}
				520
				521	kpt.pt.x = (c + xc) * (1 << octv);
				522	kpt.pt.y = (r + xr) * (1 << octv);
				523	kpt.octave = octv + (layer << 8) + (cvRound((xi + 0.5) * 255) << 16);
				524	kpt.size = sigma * powf(2.f, (layer + xi) / nOctaveLayers) * (1 << octv) * 2;
				525	kpt.response = std::abs(contr);
				526
				527	return true;
				528	}
				529
				530	template <typename T>
				531	class PerThreadAccumulator {
				532	public:
				533	void Add(std::vector<T> &&data) {
				534	std::unique_lock locker(mutex_);
				535	result_.emplace_back(data);
				536	}
				537
				538	std::vector<std::vector<T>> move_result() { return std::move(result_); }
				539
				540	private:
				541	// Should we do something more intelligent with per-thread std::vector that we
				542	// merge at the end?
				543	std::vector<std::vector<T>> result_;
				544	std::mutex mutex_;
				545	};
				546
				547	class findScaleSpaceExtremaComputer : public ParallelLoopBody {
				548	public:
				549	findScaleSpaceExtremaComputer(
				550	int _o, int _i, int _threshold, int _idx, int _step, int _cols,
				551	int _nOctaveLayers, double _contrastThreshold, double _edgeThreshold,
				552	double _sigma, const std::vector<Mat> &_gauss_pyr,
				553	const std::vector<Mat> &_dog_pyr,
				554	PerThreadAccumulator<KeyPoint> &_tls_kpts_struct)
				555
				556	: o(_o),
				557	i(_i),
				558	threshold(_threshold),
				559	idx(_idx),
				560	step(_step),
				561	cols(_cols),
				562	nOctaveLayers(_nOctaveLayers),
				563	contrastThreshold(_contrastThreshold),
				564	edgeThreshold(_edgeThreshold),
				565	sigma(_sigma),
				566	gauss_pyr(_gauss_pyr),
				567	dog_pyr(_dog_pyr),
				568	tls_kpts_struct(_tls_kpts_struct) {}
				569	void operator()(const cv::Range &range) const override {
				570	const int begin = range.start;
				571	const int end = range.end;
				572
				573	static const int n = SIFT_ORI_HIST_BINS;
				574	float hist[n];
				575
				576	const Mat &img = dog_pyr[idx];
				577	const Mat &prev = dog_pyr[idx - 1];
				578	const Mat &next = dog_pyr[idx + 1];
				579
				580	std::vector<KeyPoint> tls_kpts;
				581
				582	KeyPoint kpt;
				583	for (int r = begin; r < end; r++) {
				584	const sift_wt *currptr = img.ptr<sift_wt>(r);
				585	const sift_wt *prevptr = prev.ptr<sift_wt>(r);
				586	const sift_wt *nextptr = next.ptr<sift_wt>(r);
				587
				588	for (int c = SIFT_IMG_BORDER; c < cols - SIFT_IMG_BORDER; c++) {
				589	sift_wt val = currptr[c];
				590
				591	// find local extrema with pixel accuracy
				592	if (std::abs(val) > threshold &&
				593	((val > 0 && val >= currptr[c - 1] && val >= currptr[c + 1] &&
				594	val >= currptr[c - step - 1] && val >= currptr[c - step] &&
				595	val >= currptr[c - step + 1] && val >= currptr[c + step - 1] &&
				596	val >= currptr[c + step] && val >= currptr[c + step + 1] &&
				597	val >= nextptr[c] && val >= nextptr[c - 1] &&
				598	val >= nextptr[c + 1] && val >= nextptr[c - step - 1] &&
				599	val >= nextptr[c - step] && val >= nextptr[c - step + 1] &&
				600	val >= nextptr[c + step - 1] && val >= nextptr[c + step] &&
				601	val >= nextptr[c + step + 1] && val >= prevptr[c] &&
				602	val >= prevptr[c - 1] && val >= prevptr[c + 1] &&
				603	val >= prevptr[c - step - 1] && val >= prevptr[c - step] &&
				604	val >= prevptr[c - step + 1] && val >= prevptr[c + step - 1] &&
				605	val >= prevptr[c + step] && val >= prevptr[c + step + 1]) \|\|
				606	(val < 0 && val <= currptr[c - 1] && val <= currptr[c + 1] &&
				607	val <= currptr[c - step - 1] && val <= currptr[c - step] &&
				608	val <= currptr[c - step + 1] && val <= currptr[c + step - 1] &&
				609	val <= currptr[c + step] && val <= currptr[c + step + 1] &&
				610	val <= nextptr[c] && val <= nextptr[c - 1] &&
				611	val <= nextptr[c + 1] && val <= nextptr[c - step - 1] &&
				612	val <= nextptr[c - step] && val <= nextptr[c - step + 1] &&
				613	val <= nextptr[c + step - 1] && val <= nextptr[c + step] &&
				614	val <= nextptr[c + step + 1] && val <= prevptr[c] &&
				615	val <= prevptr[c - 1] && val <= prevptr[c + 1] &&
				616	val <= prevptr[c - step - 1] && val <= prevptr[c - step] &&
				617	val <= prevptr[c - step + 1] && val <= prevptr[c + step - 1] &&
				618	val <= prevptr[c + step] && val <= prevptr[c + step + 1]))) {
				619	int r1 = r, c1 = c, layer = i;
				620	if (!adjustLocalExtrema(dog_pyr, kpt, o, layer, r1, c1, nOctaveLayers,
				621	(float)contrastThreshold,
				622	(float)edgeThreshold, (float)sigma))
				623	continue;
				624	float scl_octv = kpt.size * 0.5f / (1 << o);
				625	float omax = calcOrientationHist(
				626	gauss_pyr[o * (nOctaveLayers + 3) + layer], Point(c1, r1),
				627	cvRound(SIFT_ORI_RADIUS * scl_octv), SIFT_ORI_SIG_FCTR * scl_octv,
				628	hist, n);
				629	float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
				630	for (int j = 0; j < n; j++) {
				631	int l = j > 0 ? j - 1 : n - 1;
				632	int r2 = j < n - 1 ? j + 1 : 0;
				633
				634	if (hist[j] > hist[l] && hist[j] > hist[r2] && hist[j] >= mag_thr) {
				635	float bin = j + 0.5f * (hist[l] - hist[r2]) /
				636	(hist[l] - 2 * hist[j] + hist[r2]);
				637	bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
				638	kpt.angle = 360.f - (float)((360.f / n) * bin);
				639	if (std::abs(kpt.angle - 360.f) < FLT_EPSILON) kpt.angle = 0.f;
				640	{ tls_kpts.push_back(kpt); }
				641	}
				642	}
				643	}
				644	}
				645	}
				646
				647	tls_kpts_struct.Add(std::move(tls_kpts));
				648	}
				649
				650	private:
				651	int o, i;
				652	int threshold;
				653	int idx, step, cols;
				654	int nOctaveLayers;
				655	double contrastThreshold;
				656	double edgeThreshold;
				657	double sigma;
				658	const std::vector<Mat> &gauss_pyr;
				659	const std::vector<Mat> &dog_pyr;
				660	PerThreadAccumulator<KeyPoint> &tls_kpts_struct;
				661	};
				662
				663	} // namespace
				664
				665	//
				666	// Detects features at extrema in DoG scale space. Bad features are discarded
				667	// based on contrast and ratio of principal curvatures.
				668	void SIFT971_Impl::findScaleSpaceExtrema(
				669	const std::vector<Mat> &gauss_pyr, const std::vector<Mat> &dog_pyr,
				670	std::vector<KeyPoint> &keypoints) const {
				671	const int nOctaves = (int)gauss_pyr.size() / (nOctaveLayers + 3);
				672	const int threshold =
				673	cvFloor(0.5 * contrastThreshold / nOctaveLayers * 255 * SIFT_FIXPT_SCALE);
				674
				675	keypoints.clear();
				676	PerThreadAccumulator<KeyPoint> tls_kpts_struct;
				677
				678	for (int o = 0; o < nOctaves; o++)
				679	for (int i = 1; i <= nOctaveLayers; i++) {
				680	const int idx = o * (nOctaveLayers + 2) + i;
				681	const Mat &img = dog_pyr[idx];
				682	const int step = (int)img.step1();
				683	const int rows = img.rows, cols = img.cols;
				684
				685	parallel_for_(Range(SIFT_IMG_BORDER, rows - SIFT_IMG_BORDER),
				686	findScaleSpaceExtremaComputer(
				687	o, i, threshold, idx, step, cols, nOctaveLayers,
				688	contrastThreshold, edgeThreshold, sigma, gauss_pyr,
				689	dog_pyr, tls_kpts_struct));
				690	}
				691
				692	const std::vector<std::vector<KeyPoint>> kpt_vecs =
				693	tls_kpts_struct.move_result();
				694	for (size_t i = 0; i < kpt_vecs.size(); ++i) {
				695	keypoints.insert(keypoints.end(), kpt_vecs[i].begin(), kpt_vecs[i].end());
				696	}
				697	}
				698
				699	namespace {
				700
				701	static void calcSIFTDescriptor(const Mat &img, Point2f ptf, float ori,
				702	float scl, int d, int n, float *dst) {
				703	Point pt(cvRound(ptf.x), cvRound(ptf.y));
				704	float cos_t = cosf(ori * (float)(CV_PI / 180));
				705	float sin_t = sinf(ori * (float)(CV_PI / 180));
				706	float bins_per_rad = n / 360.f;
				707	float exp_scale = -1.f / (d * d * 0.5f);
				708	float hist_width = SIFT_DESCR_SCL_FCTR * scl;
				709	int radius = cvRound(hist_width * 1.4142135623730951f * (d + 1) * 0.5f);
				710	// Clip the radius to the diagonal of the image to avoid autobuffer too large
				711	// exception
				712	radius = std::min(radius, (int)sqrt(((double)img.cols) * img.cols +
				713	((double)img.rows) * img.rows));
				714	cos_t /= hist_width;
				715	sin_t /= hist_width;
				716
				717	int i, j, k, len = (radius * 2 + 1) * (radius * 2 + 1),
				718	histlen = (d + 2) * (d + 2) * (n + 2);
				719	int rows = img.rows, cols = img.cols;
				720
				721	AutoBuffer<float> buf(len * 6 + histlen);
				722	float X = buf, Y = X + len, Mag = Y, Ori = Mag + len, *W = Ori + len;
				723	float RBin = W + len, CBin = RBin + len, *hist = CBin + len;
				724
				725	for (i = 0; i < d + 2; i++) {
				726	for (j = 0; j < d + 2; j++)
				727	for (k = 0; k < n + 2; k++) hist[(i * (d + 2) + j) * (n + 2) + k] = 0.;
				728	}
				729
				730	for (i = -radius, k = 0; i <= radius; i++)
				731	for (j = -radius; j <= radius; j++) {
				732	// Calculate sample's histogram array coords rotated relative to ori.
				733	// Subtract 0.5 so samples that fall e.g. in the center of row 1 (i.e.
				734	// r_rot = 1.5) have full weight placed in row 1 after interpolation.
				735	float c_rot = j * cos_t - i * sin_t;
				736	float r_rot = j * sin_t + i * cos_t;
				737	float rbin = r_rot + d / 2 - 0.5f;
				738	float cbin = c_rot + d / 2 - 0.5f;
				739	int r = pt.y + i, c = pt.x + j;
				740
				741	if (rbin > -1 && rbin < d && cbin > -1 && cbin < d && r > 0 &&
				742	r < rows - 1 && c > 0 && c < cols - 1) {
				743	float dx =
				744	(float)(img.at<sift_wt>(r, c + 1) - img.at<sift_wt>(r, c - 1));
				745	float dy =
				746	(float)(img.at<sift_wt>(r - 1, c) - img.at<sift_wt>(r + 1, c));
				747	X[k] = dx;
				748	Y[k] = dy;
				749	RBin[k] = rbin;
				750	CBin[k] = cbin;
				751	W[k] = (c_rot * c_rot + r_rot * r_rot) * exp_scale;
				752	k++;
				753	}
				754	}
				755
				756	len = k;
				757	cv::hal::fastAtan2(Y, X, Ori, len, true);
				758	cv::hal::magnitude32f(X, Y, Mag, len);
				759	cv::hal::exp32f(W, W, len);
				760
				761	k = 0;
				762	#if CV_AVX2
				763	if (USE_AVX2) {
				764	int CV_DECL_ALIGNED(32) idx_buf[8];
				765	float CV_DECL_ALIGNED(32) rco_buf[64];
				766	const __m256 __ori = _mm256_set1_ps(ori);
				767	const __m256 __bins_per_rad = _mm256_set1_ps(bins_per_rad);
				768	const __m256i __n = _mm256_set1_epi32(n);
				769	for (; k <= len - 8; k += 8) {
				770	__m256 __rbin = _mm256_loadu_ps(&RBin[k]);
				771	__m256 __cbin = _mm256_loadu_ps(&CBin[k]);
				772	__m256 __obin = _mm256_mul_ps(
				773	_mm256_sub_ps(_mm256_loadu_ps(&Ori[k]), __ori), __bins_per_rad);
				774	__m256 __mag =
				775	_mm256_mul_ps(_mm256_loadu_ps(&Mag[k]), _mm256_loadu_ps(&W[k]));
				776
				777	__m256 __r0 = _mm256_floor_ps(__rbin);
				778	__rbin = _mm256_sub_ps(__rbin, __r0);
				779	__m256 __c0 = _mm256_floor_ps(__cbin);
				780	__cbin = _mm256_sub_ps(__cbin, __c0);
				781	__m256 __o0 = _mm256_floor_ps(__obin);
				782	__obin = _mm256_sub_ps(__obin, __o0);
				783
				784	__m256i __o0i = _mm256_cvtps_epi32(__o0);
				785	__o0i = _mm256_add_epi32(
				786	__o0i, _mm256_and_si256(
				787	__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __o0i)));
				788	__o0i = _mm256_sub_epi32(
				789	__o0i, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __o0i), __n));
				790
				791	__m256 __v_r1 = _mm256_mul_ps(__mag, __rbin);
				792	__m256 __v_r0 = _mm256_sub_ps(__mag, __v_r1);
				793
				794	__m256 __v_rc11 = _mm256_mul_ps(__v_r1, __cbin);
				795	__m256 __v_rc10 = _mm256_sub_ps(__v_r1, __v_rc11);
				796
				797	__m256 __v_rc01 = _mm256_mul_ps(__v_r0, __cbin);
				798	__m256 __v_rc00 = _mm256_sub_ps(__v_r0, __v_rc01);
				799
				800	__m256 __v_rco111 = _mm256_mul_ps(__v_rc11, __obin);
				801	__m256 __v_rco110 = _mm256_sub_ps(__v_rc11, __v_rco111);
				802
				803	__m256 __v_rco101 = _mm256_mul_ps(__v_rc10, __obin);
				804	__m256 __v_rco100 = _mm256_sub_ps(__v_rc10, __v_rco101);
				805
				806	__m256 __v_rco011 = _mm256_mul_ps(__v_rc01, __obin);
				807	__m256 __v_rco010 = _mm256_sub_ps(__v_rc01, __v_rco011);
				808
				809	__m256 __v_rco001 = _mm256_mul_ps(__v_rc00, __obin);
				810	__m256 __v_rco000 = _mm256_sub_ps(__v_rc00, __v_rco001);
				811
				812	__m256i __one = _mm256_set1_epi32(1);
				813	__m256i __idx = _mm256_add_epi32(
				814	_mm256_mullo_epi32(
				815	_mm256_add_epi32(
				816	_mm256_mullo_epi32(
				817	_mm256_add_epi32(_mm256_cvtps_epi32(__r0), __one),
				818	_mm256_set1_epi32(d + 2)),
				819	_mm256_add_epi32(_mm256_cvtps_epi32(__c0), __one)),
				820	_mm256_set1_epi32(n + 2)),
				821	__o0i);
				822
				823	_mm256_store_si256((__m256i *)idx_buf, __idx);
				824
				825	_mm256_store_ps(&(rco_buf[0]), __v_rco000);
				826	_mm256_store_ps(&(rco_buf[8]), __v_rco001);
				827	_mm256_store_ps(&(rco_buf[16]), __v_rco010);
				828	_mm256_store_ps(&(rco_buf[24]), __v_rco011);
				829	_mm256_store_ps(&(rco_buf[32]), __v_rco100);
				830	_mm256_store_ps(&(rco_buf[40]), __v_rco101);
				831	_mm256_store_ps(&(rco_buf[48]), __v_rco110);
				832	_mm256_store_ps(&(rco_buf[56]), __v_rco111);
				833	#define HIST_SUM_HELPER(id) \
				834	hist[idx_buf[(id)]] += rco_buf[(id)]; \
				835	hist[idx_buf[(id)] + 1] += rco_buf[8 + (id)]; \
				836	hist[idx_buf[(id)] + (n + 2)] += rco_buf[16 + (id)]; \
				837	hist[idx_buf[(id)] + (n + 3)] += rco_buf[24 + (id)]; \
				838	hist[idx_buf[(id)] + (d + 2) * (n + 2)] += rco_buf[32 + (id)]; \
				839	hist[idx_buf[(id)] + (d + 2) * (n + 2) + 1] += rco_buf[40 + (id)]; \
				840	hist[idx_buf[(id)] + (d + 3) * (n + 2)] += rco_buf[48 + (id)]; \
				841	hist[idx_buf[(id)] + (d + 3) * (n + 2) + 1] += rco_buf[56 + (id)];
				842
				843	HIST_SUM_HELPER(0);
				844	HIST_SUM_HELPER(1);
				845	HIST_SUM_HELPER(2);
				846	HIST_SUM_HELPER(3);
				847	HIST_SUM_HELPER(4);
				848	HIST_SUM_HELPER(5);
				849	HIST_SUM_HELPER(6);
				850	HIST_SUM_HELPER(7);
				851
				852	#undef HIST_SUM_HELPER
				853	}
				854	}
				855	#endif
				856	for (; k < len; k++) {
				857	float rbin = RBin[k], cbin = CBin[k];
				858	float obin = (Ori[k] - ori) * bins_per_rad;
				859	float mag = Mag[k] * W[k];
				860
				861	int r0 = cvFloor(rbin);
				862	int c0 = cvFloor(cbin);
				863	int o0 = cvFloor(obin);
				864	rbin -= r0;
				865	cbin -= c0;
				866	obin -= o0;
				867
				868	if (o0 < 0) o0 += n;
				869	if (o0 >= n) o0 -= n;
				870
				871	// histogram update using tri-linear interpolation
				872	float v_r1 = mag * rbin, v_r0 = mag - v_r1;
				873	float v_rc11 = v_r1 * cbin, v_rc10 = v_r1 - v_rc11;
				874	float v_rc01 = v_r0 * cbin, v_rc00 = v_r0 - v_rc01;
				875	float v_rco111 = v_rc11 * obin, v_rco110 = v_rc11 - v_rco111;
				876	float v_rco101 = v_rc10 * obin, v_rco100 = v_rc10 - v_rco101;
				877	float v_rco011 = v_rc01 * obin, v_rco010 = v_rc01 - v_rco011;
				878	float v_rco001 = v_rc00 * obin, v_rco000 = v_rc00 - v_rco001;
				879
				880	int idx = ((r0 + 1) * (d + 2) + c0 + 1) * (n + 2) + o0;
				881	hist[idx] += v_rco000;
				882	hist[idx + 1] += v_rco001;
				883	hist[idx + (n + 2)] += v_rco010;
				884	hist[idx + (n + 3)] += v_rco011;
				885	hist[idx + (d + 2) * (n + 2)] += v_rco100;
				886	hist[idx + (d + 2) * (n + 2) + 1] += v_rco101;
				887	hist[idx + (d + 3) * (n + 2)] += v_rco110;
				888	hist[idx + (d + 3) * (n + 2) + 1] += v_rco111;
				889	}
				890
				891	// finalize histogram, since the orientation histograms are circular
				892	for (i = 0; i < d; i++)
				893	for (j = 0; j < d; j++) {
				894	int idx = ((i + 1) * (d + 2) + (j + 1)) * (n + 2);
				895	hist[idx] += hist[idx + n];
				896	hist[idx + 1] += hist[idx + n + 1];
				897	for (k = 0; k < n; k++) dst[(i * d + j) * n + k] = hist[idx + k];
				898	}
				899	// copy histogram to the descriptor,
				900	// apply hysteresis thresholding
				901	// and scale the result, so that it can be easily converted
				902	// to byte array
				903	float nrm2 = 0;
				904	len = d * d * n;
				905	k = 0;
				906	#if CV_AVX2
				907	if (USE_AVX2) {
				908	float CV_DECL_ALIGNED(32) nrm2_buf[8];
				909	__m256 __nrm2 = _mm256_setzero_ps();
				910	__m256 __dst;
				911	for (; k <= len - 8; k += 8) {
				912	__dst = _mm256_loadu_ps(&dst[k]);
				913	#if CV_FMA3
				914	__nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
				915	#else
				916	__nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
				917	#endif
				918	}
				919	_mm256_store_ps(nrm2_buf, __nrm2);
				920	nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] + nrm2_buf[4] +
				921	nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
				922	}
				923	#endif
				924	for (; k < len; k++) nrm2 += dst[k] * dst[k];
				925
				926	float thr = std::sqrt(nrm2) * SIFT_DESCR_MAG_THR;
				927
				928	i = 0, nrm2 = 0;
				929	#if 0 // CV_AVX2
				930	// This code cannot be enabled because it sums nrm2 in a different order,
				931	// thus producing slightly different results
				932	if( USE_AVX2 )
				933	{
				934	float CV_DECL_ALIGNED(32) nrm2_buf[8];
				935	__m256 __dst;
				936	__m256 __nrm2 = _mm256_setzero_ps();
				937	__m256 __thr = _mm256_set1_ps(thr);
				938	for( ; i <= len - 8; i += 8 )
				939	{
				940	__dst = _mm256_loadu_ps(&dst[i]);
				941	__dst = _mm256_min_ps(__dst, __thr);
				942	_mm256_storeu_ps(&dst[i], __dst);
				943	#if CV_FMA3
				944	__nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
				945	#else
				946	__nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
				947	#endif
				948	}
				949	_mm256_store_ps(nrm2_buf, __nrm2);
				950	nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] +
				951	nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
				952	}
				953	#endif
				954	for (; i < len; i++) {
				955	float val = std::min(dst[i], thr);
				956	dst[i] = val;
				957	nrm2 += val * val;
				958	}
				959	nrm2 = SIFT_INT_DESCR_FCTR / std::max(std::sqrt(nrm2), FLT_EPSILON);
				960
				961	#if 1
				962	k = 0;
				963	#if CV_AVX2
				964	if (USE_AVX2) {
				965	__m256 __dst;
				966	__m256 __min = _mm256_setzero_ps();
				967	__m256 __max = _mm256_set1_ps(255.0f); // max of uchar
				968	__m256 __nrm2 = _mm256_set1_ps(nrm2);
				969	for (k = 0; k <= len - 8; k += 8) {
				970	__dst = _mm256_loadu_ps(&dst[k]);
				971	__dst = _mm256_min_ps(
				972	_mm256_max_ps(
				973	_mm256_round_ps(_mm256_mul_ps(__dst, __nrm2),
				974	_MM_FROUND_TO_NEAREST_INT \| _MM_FROUND_NO_EXC),
				975	__min),
				976	__max);
				977	_mm256_storeu_ps(&dst[k], __dst);
				978	}
				979	}
				980	#endif
				981	for (; k < len; k++) {
				982	dst[k] = saturate_cast<uchar>(dst[k] * nrm2);
				983	}
				984	#else
				985	float nrm1 = 0;
				986	for (k = 0; k < len; k++) {
				987	dst[k] *= nrm2;
				988	nrm1 += dst[k];
				989	}
				990	nrm1 = 1.f / std::max(nrm1, FLT_EPSILON);
				991	for (k = 0; k < len; k++) {
				992	dst[k] = std::sqrt(dst[k] * nrm1); // saturate_cast<uchar>(std::sqrt(dst[k]
				993	// * nrm1)*SIFT_INT_DESCR_FCTR);
				994	}
				995	#endif
				996	}
				997
				998	class calcDescriptorsComputer : public ParallelLoopBody {
				999	public:
				1000	calcDescriptorsComputer(const std::vector<Mat> &_gpyr,
				1001	const std::vector<KeyPoint> &_keypoints,
				1002	Mat &_descriptors, int _nOctaveLayers,
				1003	int _firstOctave)
				1004	: gpyr(_gpyr),
				1005	keypoints(_keypoints),
				1006	descriptors(_descriptors),
				1007	nOctaveLayers(_nOctaveLayers),
				1008	firstOctave(_firstOctave) {}
				1009
				1010	void operator()(const cv::Range &range) const override {
				1011	const int begin = range.start;
				1012	const int end = range.end;
				1013
				1014	static const int d = SIFT_DESCR_WIDTH, n = SIFT_DESCR_HIST_BINS;
				1015
				1016	for (int i = begin; i < end; i++) {
				1017	KeyPoint kpt = keypoints[i];
				1018	int octave, layer;
				1019	float scale;
				1020	unpackOctave(kpt, octave, layer, scale);
				1021	CV_Assert(octave >= firstOctave && layer <= nOctaveLayers + 2);
				1022	float size = kpt.size * scale;
				1023	Point2f ptf(kpt.pt.x * scale, kpt.pt.y * scale);
				1024	const Mat &img =
				1025	gpyr[(octave - firstOctave) * (nOctaveLayers + 3) + layer];
				1026
				1027	float angle = 360.f - kpt.angle;
				1028	if (std::abs(angle - 360.f) < FLT_EPSILON) angle = 0.f;
				1029	calcSIFTDescriptor(img, ptf, angle, size * 0.5f, d, n,
				1030	descriptors.ptr<float>((int)i));
				1031	}
				1032	}
				1033
				1034	private:
				1035	const std::vector<Mat> &gpyr;
				1036	const std::vector<KeyPoint> &keypoints;
				1037	Mat &descriptors;
				1038	int nOctaveLayers;
				1039	int firstOctave;
				1040	};
				1041
				1042	static void calcDescriptors(const std::vector<Mat> &gpyr,
				1043	const std::vector<KeyPoint> &keypoints,
				1044	Mat &descriptors, int nOctaveLayers,
				1045	int firstOctave) {
				1046	parallel_for_(Range(0, static_cast<int>(keypoints.size())),
				1047	calcDescriptorsComputer(gpyr, keypoints, descriptors,
				1048	nOctaveLayers, firstOctave));
				1049	}
				1050
				1051	} // namespace
				1052
				1053	//////////////////////////////////////////////////////////////////////////////////////////
				1054
				1055	SIFT971_Impl::SIFT971_Impl(int _nfeatures, int _nOctaveLayers,
				1056	double _contrastThreshold, double _edgeThreshold,
				1057	double _sigma)
				1058	: nfeatures(_nfeatures),
				1059	nOctaveLayers(_nOctaveLayers),
				1060	contrastThreshold(_contrastThreshold),
				1061	edgeThreshold(_edgeThreshold),
				1062	sigma(_sigma) {}
				1063
				1064	int SIFT971_Impl::descriptorSize() const {
				1065	return SIFT_DESCR_WIDTH * SIFT_DESCR_WIDTH * SIFT_DESCR_HIST_BINS;
				1066	}
				1067
				1068	int SIFT971_Impl::descriptorType() const { return CV_32F; }
				1069
				1070	int SIFT971_Impl::defaultNorm() const { return NORM_L2; }
				1071
				1072	void SIFT971_Impl::detectAndCompute(InputArray _image, InputArray _mask,
				1073	std::vector<KeyPoint> &keypoints,
				1074	OutputArray _descriptors,
				1075	bool useProvidedKeypoints) {
				1076	int firstOctave = -1, actualNOctaves = 0, actualNLayers = 0;
				1077	Mat image = _image.getMat(), mask = _mask.getMat();
				1078
				1079	if (image.empty() \|\| image.depth() != CV_8U)
				1080	CV_Error(Error::StsBadArg,
				1081	"image is empty or has incorrect depth (!=CV_8U)");
				1082
				1083	if (!mask.empty() && mask.type() != CV_8UC1)
				1084	CV_Error(Error::StsBadArg, "mask has incorrect type (!=CV_8UC1)");
				1085
				1086	if (useProvidedKeypoints) {
				1087	firstOctave = 0;
				1088	int maxOctave = INT_MIN;
				1089	for (size_t i = 0; i < keypoints.size(); i++) {
				1090	int octave, layer;
				1091	float scale;
				1092	unpackOctave(keypoints[i], octave, layer, scale);
				1093	firstOctave = std::min(firstOctave, octave);
				1094	maxOctave = std::max(maxOctave, octave);
				1095	actualNLayers = std::max(actualNLayers, layer - 2);
				1096	}
				1097
				1098	firstOctave = std::min(firstOctave, 0);
				1099	CV_Assert(firstOctave >= -1 && actualNLayers <= nOctaveLayers);
				1100	actualNOctaves = maxOctave - firstOctave + 1;
				1101	}
				1102
				1103	Mat base = createInitialImage(image, firstOctave < 0, (float)sigma);
				1104	std::vector<Mat> gpyr;
				1105	int nOctaves =
				1106	actualNOctaves > 0
				1107	? actualNOctaves
				1108	: cvRound(std::log((double)std::min(base.cols, base.rows)) /
				1109	std::log(2.) -
				1110	2) -
				1111	firstOctave;
				1112
				1113	// double t, tf = getTickFrequency();
				1114	// t = (double)getTickCount();
				1115	buildGaussianPyramid(base, gpyr, nOctaves);
				1116
				1117	// t = (double)getTickCount() - t;
				1118	// printf("pyramid construction time: %g\n", t*1000./tf);
				1119
				1120	if (!useProvidedKeypoints) {
				1121	std::vector<Mat> dogpyr;
				1122	buildDoGPyramid(gpyr, dogpyr);
				1123	// t = (double)getTickCount();
				1124	findScaleSpaceExtrema(gpyr, dogpyr, keypoints);
				1125	// TODO(Brian): I think it can go faster by knowing they're sorted?
				1126	// KeyPointsFilter::removeDuplicatedSorted( keypoints );
				1127	KeyPointsFilter::removeDuplicated(keypoints);
				1128
				1129	if (nfeatures > 0) KeyPointsFilter::retainBest(keypoints, nfeatures);
				1130	// t = (double)getTickCount() - t;
				1131	// printf("keypoint detection time: %g\n", t*1000./tf);
				1132
				1133	if (firstOctave < 0)
				1134	for (size_t i = 0; i < keypoints.size(); i++) {
				1135	KeyPoint &kpt = keypoints[i];
				1136	float scale = 1.f / (float)(1 << -firstOctave);
				1137	kpt.octave = (kpt.octave & ~255) \| ((kpt.octave + firstOctave) & 255);
				1138	kpt.pt *= scale;
				1139	kpt.size *= scale;
				1140	}
				1141
				1142	if (!mask.empty()) KeyPointsFilter::runByPixelsMask(keypoints, mask);
				1143	} else {
				1144	// filter keypoints by mask
				1145	// KeyPointsFilter::runByPixelsMask( keypoints, mask );
				1146	}
				1147
				1148	if (_descriptors.needed()) {
				1149	// t = (double)getTickCount();
				1150	int dsize = descriptorSize();
				1151	_descriptors.create((int)keypoints.size(), dsize, CV_32F);
				1152	Mat descriptors = _descriptors.getMat();
				1153
				1154	calcDescriptors(gpyr, keypoints, descriptors, nOctaveLayers, firstOctave);
				1155	// t = (double)getTickCount() - t;
				1156	// printf("descriptor extraction time: %g\n", t*1000./tf);
				1157	}
				1158	}
				1159
				1160	} // namespace vision
				1161	} // namespace frc971