diff --git a/upstream_utils/update_eigen.py b/upstream_utils/update_eigen.py index e9be219de5..0df72a8d77 100755 --- a/upstream_utils/update_eigen.py +++ b/upstream_utils/update_eigen.py @@ -20,10 +20,6 @@ def eigen_inclusions(dp, f): abspath = os.path.join(dp, f) - # Exclude build system - if f == "CMakeLists.txt": - return False - # Exclude NonMPL2.h since all non-MPL2 code will be excluded anyway if f == "NonMPL2.h": return False @@ -92,7 +88,7 @@ def unsupported_inclusions(dp, f): def main(): root, repo = setup_upstream_repo("https://gitlab.com/libeigen/eigen.git", - "3.3.9") + "3.4.0") wpimath = os.path.join(root, "wpimath") # Delete old install diff --git a/wpimath/src/main/native/eigeninclude/Eigen/Cholesky b/wpimath/src/main/native/eigeninclude/Eigen/Cholesky index 2460ce049b..ef249de7bc 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/Cholesky +++ b/wpimath/src/main/native/eigeninclude/Eigen/Cholesky @@ -43,4 +43,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CHOLESKY_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/Core b/wpimath/src/main/native/eigeninclude/Eigen/Core index 1122fc2ea8..fd5e098cb7 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/Core +++ b/wpimath/src/main/native/eigeninclude/Eigen/Core @@ -11,251 +11,55 @@ #ifndef EIGEN_CORE_H #define EIGEN_CORE_H -// first thing Eigen does: stop the compiler from committing suicide +// first thing Eigen does: stop the compiler from reporting useless warnings. #include "src/Core/util/DisableStupidWarnings.h" -#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) - #define EIGEN_CUDACC __CUDACC__ +// then include this file where all our macros are defined. It's really important to do it first because +// it's where we do all the compiler/OS/arch detections and define most defaults. +#include "src/Core/util/Macros.h" + +// This detects SSE/AVX/NEON/etc. and configure alignment settings +#include "src/Core/util/ConfigureVectorization.h" + +// We need cuda_runtime.h/hip_runtime.h to ensure that +// the EIGEN_USING_STD macro works properly on the device side +#if defined(EIGEN_CUDACC) + #include +#elif defined(EIGEN_HIPCC) + #include #endif -#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) - #define EIGEN_CUDA_ARCH __CUDA_ARCH__ -#endif - -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) -#elif defined(__CUDACC_VER__) -#define EIGEN_CUDACC_VER __CUDACC_VER__ -#else -#define EIGEN_CUDACC_VER 0 -#endif - -// Handle NVCC/CUDA/SYCL -#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) - // Do not try asserts on CUDA and SYCL! - #ifndef EIGEN_NO_DEBUG - #define EIGEN_NO_DEBUG - #endif - - #ifdef EIGEN_INTERNAL_DEBUGGING - #undef EIGEN_INTERNAL_DEBUGGING - #endif - - #ifdef EIGEN_EXCEPTIONS - #undef EIGEN_EXCEPTIONS - #endif - - // All functions callable from CUDA code must be qualified with __device__ - #ifdef __CUDACC__ - // Do not try to vectorize on CUDA and SYCL! - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif - - #define EIGEN_DEVICE_FUNC __host__ __device__ - // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro - // works properly on the device side - #include - #else - #define EIGEN_DEVICE_FUNC - #endif - -#else - #define EIGEN_DEVICE_FUNC - -#endif - -// When compiling CUDA device code with NVCC, pull in math functions from the -// global namespace. In host mode, and when device doee with clang, use the -// std versions. -#if defined(__CUDA_ARCH__) && defined(__NVCC__) - #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; -#else - #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; -#endif - -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) - #define EIGEN_EXCEPTIONS -#endif #ifdef EIGEN_EXCEPTIONS #include #endif -// then include this file where all our macros are defined. It's really important to do it first because -// it's where we do all the alignment settings (platform detection and honoring the user's will if he -// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization. -#include "src/Core/util/Macros.h" - // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3) // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details. -#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) +#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) && EIGEN_GNUC_AT_MOST(5,5) #pragma GCC optimize ("-fno-ipa-cp-clone") #endif +// Prevent ICC from specializing std::complex operators that silently fail +// on device. This allows us to use our own device-compatible specializations +// instead. +#if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \ + && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_) +#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1 +#endif #include // this include file manages BLAS and MKL related macros // and inclusion of their respective header files // #include "src/Core/util/MKL_support.h" -// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into -// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks -#if EIGEN_MAX_ALIGN_BYTES==0 - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif + +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + #define EIGEN_HAS_GPU_FP16 #endif -#if EIGEN_COMP_MSVC - #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled - #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later - // Remember that usage of defined() in a #define is undefined by the standard. - // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. - #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 - #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER - #endif - #endif -#else - // Remember that usage of defined() in a #define is undefined by the standard - #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) - #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC - #endif -#endif - -#ifndef EIGEN_DONT_VECTORIZE - - #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) - - // Defines symbols for compile-time detection of which instructions are - // used. - // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_SSE - #define EIGEN_VECTORIZE_SSE2 - - // Detect sse3/ssse3/sse4: - // gcc and icc defines __SSE3__, ... - // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you - // want to force the use of those instructions with msvc. - #ifdef __SSE3__ - #define EIGEN_VECTORIZE_SSE3 - #endif - #ifdef __SSSE3__ - #define EIGEN_VECTORIZE_SSSE3 - #endif - #ifdef __SSE4_1__ - #define EIGEN_VECTORIZE_SSE4_1 - #endif - #ifdef __SSE4_2__ - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX__ - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX2__ - #define EIGEN_VECTORIZE_AVX2 - #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif - #if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512) - #define EIGEN_VECTORIZE_AVX512 - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_FMA - #ifdef __AVX512DQ__ - #define EIGEN_VECTORIZE_AVX512DQ - #endif - #ifdef __AVX512ER__ - #define EIGEN_VECTORIZE_AVX512ER - #endif - #endif - - // include files - - // This extern "C" works around a MINGW-w64 compilation issue - // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 - // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). - // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations - // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; - // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. - // notice that since these are C headers, the extern "C" is theoretically needed anyways. - extern "C" { - // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. - // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 - #include - #else - #include - #include - #include - #ifdef EIGEN_VECTORIZE_SSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_1 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_2 - #include - #endif - #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) - #include - #endif - #endif - } // end extern "C" - #elif defined __VSX__ - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_VSX - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - #elif defined __ALTIVEC__ - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ALTIVEC - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - #elif (defined __ARM_NEON) || (defined __ARM_NEON__) - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_NEON - #include - #elif (defined __s390x__ && defined __VEC__) - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ZVECTOR - #include - #endif -#endif - -#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) - // We can use the optimized fp16 to float and float to fp16 conversion routines - #define EIGEN_HAS_FP16_C -#endif - -#if defined __CUDACC__ - #define EIGEN_VECTORIZE_CUDA - #include - #if EIGEN_CUDACC_VER >= 70500 - #define EIGEN_HAS_CUDA_FP16 - #endif -#endif - -#if defined EIGEN_HAS_CUDA_FP16 - #include - #include +#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16) + #define EIGEN_HAS_GPU_BF16 #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) @@ -290,6 +94,10 @@ // for min/max: #include +#if EIGEN_HAS_CXX11 +#include +#endif + // for std::is_nothrow_move_assignable #ifdef EIGEN_INCLUDE_TYPE_TRAITS #include @@ -305,38 +113,25 @@ #include #endif -/** \brief Namespace containing all symbols from the %Eigen library. */ -namespace Eigen { - -inline static const char *SimdInstructionSetsInUse(void) { -#if defined(EIGEN_VECTORIZE_AVX512) - return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_AVX) - return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_2) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_1) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; -#elif defined(EIGEN_VECTORIZE_SSSE3) - return "SSE, SSE2, SSE3, SSSE3"; -#elif defined(EIGEN_VECTORIZE_SSE3) - return "SSE, SSE2, SSE3"; -#elif defined(EIGEN_VECTORIZE_SSE2) - return "SSE, SSE2"; -#elif defined(EIGEN_VECTORIZE_ALTIVEC) - return "AltiVec"; -#elif defined(EIGEN_VECTORIZE_VSX) - return "VSX"; -#elif defined(EIGEN_VECTORIZE_NEON) - return "ARM NEON"; -#elif defined(EIGEN_VECTORIZE_ZVECTOR) - return "S390X ZVECTOR"; -#else - return "None"; +#if defined(EIGEN_USE_SYCL) + #undef min + #undef max + #undef isnan + #undef isinf + #undef isfinite + #include + #include + #include + #include + #include + #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0 + #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16 + #endif + #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1 + #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16 + #endif #endif -} -} // end namespace Eigen #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT // This will generate an error message: @@ -345,7 +140,7 @@ inline static const char *SimdInstructionSetsInUse(void) { namespace Eigen { -// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to +// we use size_t frequently and we'll never remember to prepend it with std:: every time just to // ensure QNX/QCC support using std::size_t; // gcc 4.6.0 wants std:: for ptrdiff_t @@ -369,60 +164,90 @@ using std::ptrdiff_t; #include "src/Core/util/StaticAssert.h" #include "src/Core/util/XprHelper.h" #include "src/Core/util/Memory.h" +#include "src/Core/util/IntegralConstant.h" +#include "src/Core/util/SymbolicIndex.h" #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" #include "src/Core/arch/Default/ConjHelper.h" +// Generic half float support +#include "src/Core/arch/Default/Half.h" +#include "src/Core/arch/Default/BFloat16.h" +#include "src/Core/arch/Default/TypeCasting.h" +#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" - #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/SSE/TypeCasting.h" + #include "src/Core/arch/SSE/Complex.h" #include "src/Core/arch/AVX/PacketMath.h" - #include "src/Core/arch/AVX/MathFunctions.h" + #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/AVX/Complex.h" // #include "src/Core/arch/AVX512/PacketMath.h" + // #include "src/Core/arch/AVX512/TypeCasting.h" + // #include "src/Core/arch/AVX512/Complex.h" + #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/AVX/MathFunctions.h" // #include "src/Core/arch/AVX512/MathFunctions.h" #elif defined EIGEN_VECTORIZE_AVX // Use AVX for floats and doubles, SSE for integers #include "src/Core/arch/SSE/PacketMath.h" - #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/SSE/MathFunctions.h" - #include "src/Core/arch/AVX/PacketMath.h" - #include "src/Core/arch/AVX/MathFunctions.h" - #include "src/Core/arch/AVX/Complex.h" - #include "src/Core/arch/AVX/TypeCasting.h" #include "src/Core/arch/SSE/TypeCasting.h" + #include "src/Core/arch/SSE/Complex.h" + #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/AVX/MathFunctions.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/SSE/TypeCasting.h" #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) // #include "src/Core/arch/AltiVec/PacketMath.h" // #include "src/Core/arch/AltiVec/MathFunctions.h" // #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/TypeCasting.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" +#elif defined EIGEN_VECTORIZE_SVE + // #include "src/Core/arch/SVE/PacketMath.h" + // #include "src/Core/arch/SVE/TypeCasting.h" + // #include "src/Core/arch/SVE/MathFunctions.h" #elif defined EIGEN_VECTORIZE_ZVECTOR // #include "src/Core/arch/ZVector/PacketMath.h" // #include "src/Core/arch/ZVector/MathFunctions.h" // #include "src/Core/arch/ZVector/Complex.h" +#elif defined EIGEN_VECTORIZE_MSA + // #include "src/Core/arch/MSA/PacketMath.h" + // #include "src/Core/arch/MSA/MathFunctions.h" + // #include "src/Core/arch/MSA/Complex.h" #endif -// Half float support -// #include "src/Core/arch/CUDA/Half.h" -// #include "src/Core/arch/CUDA/PacketMathHalf.h" -// #include "src/Core/arch/CUDA/TypeCasting.h" +#if defined EIGEN_VECTORIZE_GPU + // #include "src/Core/arch/GPU/PacketMath.h" + // #include "src/Core/arch/GPU/MathFunctions.h" + // #include "src/Core/arch/GPU/TypeCasting.h" +#endif -#if defined EIGEN_VECTORIZE_CUDA - // #include "src/Core/arch/CUDA/PacketMath.h" - // #include "src/Core/arch/CUDA/MathFunctions.h" +#if defined(EIGEN_USE_SYCL) + // #include "src/Core/arch/SYCL/SyclMemoryModel.h" + // #include "src/Core/arch/SYCL/InteropHeaders.h" +#if !defined(EIGEN_DONT_VECTORIZE_SYCL) + // #include "src/Core/arch/SYCL/PacketMath.h" + // #include "src/Core/arch/SYCL/MathFunctions.h" + // #include "src/Core/arch/SYCL/TypeCasting.h" +#endif #endif #include "src/Core/arch/Default/Settings.h" +// This file provides generic implementations valid for scalar as well +#include "src/Core/arch/Default/GenericPacketMathFunctions.h" #include "src/Core/functors/TernaryFunctors.h" #include "src/Core/functors/BinaryFunctors.h" @@ -433,9 +258,16 @@ using std::ptrdiff_t; // Specialized functors to enable the processing of complex numbers // on CUDA devices +#ifdef EIGEN_CUDACC // #include "src/Core/arch/CUDA/Complex.h" +#endif -#include "src/Core/IO.h" +#include "src/Core/util/IndexedViewHelper.h" +#include "src/Core/util/ReshapedHelper.h" +#include "src/Core/ArithmeticSequence.h" +#ifndef EIGEN_NO_IO + #include "src/Core/IO.h" +#endif #include "src/Core/DenseCoeffsBase.h" #include "src/Core/DenseBase.h" #include "src/Core/MatrixBase.h" @@ -476,6 +308,8 @@ using std::ptrdiff_t; #include "src/Core/Ref.h" #include "src/Core/Block.h" #include "src/Core/VectorBlock.h" +#include "src/Core/IndexedView.h" +#include "src/Core/Reshaped.h" #include "src/Core/Transpose.h" #include "src/Core/DiagonalMatrix.h" #include "src/Core/Diagonal.h" @@ -512,13 +346,21 @@ using std::ptrdiff_t; #include "src/Core/CoreIterators.h" #include "src/Core/ConditionEstimator.h" +#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) + // #include "src/Core/arch/AltiVec/MatrixProduct.h" +#elif defined EIGEN_VECTORIZE_NEON + #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" +#endif + #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" +#include "src/Core/PartialReduxEvaluator.h" #include "src/Core/Random.h" #include "src/Core/Replicate.h" #include "src/Core/Reverse.h" #include "src/Core/ArrayWrapper.h" +#include "src/Core/StlIterators.h" #ifdef EIGEN_USE_BLAS // #include "src/Core/products/GeneralMatrixMatrix_BLAS.h" diff --git a/wpimath/src/main/native/eigeninclude/Eigen/Eigenvalues b/wpimath/src/main/native/eigeninclude/Eigen/Eigenvalues index 83cfdca999..c6defe34a4 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/Eigenvalues +++ b/wpimath/src/main/native/eigeninclude/Eigen/Eigenvalues @@ -58,4 +58,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_EIGENVALUES_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/Householder b/wpimath/src/main/native/eigeninclude/Eigen/Householder index 89cd81b1af..f2fa79969c 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/Householder +++ b/wpimath/src/main/native/eigeninclude/Eigen/Householder @@ -27,4 +27,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_HOUSEHOLDER_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/Jacobi b/wpimath/src/main/native/eigeninclude/Eigen/Jacobi index 17c1d785a1..43edc7a194 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/Jacobi +++ b/wpimath/src/main/native/eigeninclude/Eigen/Jacobi @@ -29,5 +29,4 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_JACOBI_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/LU b/wpimath/src/main/native/eigeninclude/Eigen/LU index 88f75931c3..a1b5d462f7 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/LU +++ b/wpimath/src/main/native/eigeninclude/Eigen/LU @@ -38,13 +38,10 @@ #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX - #include "src/LU/arch/Inverse_SSE.h" +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON + #include "src/LU/arch/InverseSize4.h" #endif #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_LU_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/QR b/wpimath/src/main/native/eigeninclude/Eigen/QR index 01c37817d0..42a3fa819b 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/QR +++ b/wpimath/src/main/native/eigeninclude/Eigen/QR @@ -48,4 +48,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_QR_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/SVD b/wpimath/src/main/native/eigeninclude/Eigen/SVD index 167a2b0715..4441a38ef8 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/SVD +++ b/wpimath/src/main/native/eigeninclude/Eigen/SVD @@ -48,4 +48,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_SVD_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Cholesky/LDLT.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Cholesky/LDLT.h index 15ccf24f14..1013ca045d 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Cholesky/LDLT.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Cholesky/LDLT.h @@ -16,6 +16,15 @@ namespace Eigen { namespace internal { + template struct traits > + : traits<_MatrixType> + { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; + }; + template struct LDLT_Traits; // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef @@ -36,7 +45,7 @@ namespace internal { * matrix \f$ A \f$ such that \f$ A = P^TLDL^*P \f$, where P is a permutation matrix, L * is lower triangular with a unit diagonal and D is a diagonal matrix. * - * The decomposition uses pivoting to ensure stability, so that L will have + * The decomposition uses pivoting to ensure stability, so that D will have * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root * on D also stabilizes the computation. * @@ -44,24 +53,23 @@ namespace internal { * decomposition to determine whether a system of equations has a solution. * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template class LDLT + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, UpLo = _UpLo }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix TmpMatrixType; typedef Transpositions TranspositionType; @@ -180,6 +188,7 @@ template class LDLT return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A. * * This function also supports in-place solves using the syntax x = decompositionObject.solve(x) . @@ -191,19 +200,14 @@ template class LDLT * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function - * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular. + * computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular. * * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt() */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LDLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LDLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif template bool solveInPlace(MatrixBase &bAndX) const; @@ -242,12 +246,12 @@ template class LDLT */ const LDLT& adjoint() const { return *this; }; - inline Index rows() const { return m_matrix.rows(); } - inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const @@ -258,8 +262,10 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -560,14 +566,22 @@ template template void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); + _solve_impl_transposed(rhs, dst); +} + +template +template +void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ // dst = P b dst = m_transpositions * rhs; // dst = L^-1 (P b) - matrixL().solveInPlace(dst); + // dst = L^-*T (P b) + matrixL().template conjugateIf().solveInPlace(dst); - // dst = D^-1 (L^-1 P b) + // dst = D^-* (L^-1 P b) + // dst = D^-1 (L^-*T P b) // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal::RealReturnType vecD(vectorD()); @@ -579,7 +593,6 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. // Using numeric_limits::min() gives us more robustness to denormals. RealScalar tolerance = (std::numeric_limits::min)(); - for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) @@ -588,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons dst.row(i).setZero(); } - // dst = L^-T (D^-1 L^-1 P b) - matrixU().solveInPlace(dst); + // dst = L^-* (D^-* L^-1 P b) + // dst = L^-T (D^-1 L^-*T P b) + matrixL().transpose().template conjugateIf().solveInPlace(dst); - // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b + // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b + // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b dst = m_transpositions.transpose() * dst; } #endif diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Cholesky/LLT.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Cholesky/LLT.h index e1624d21b6..8c9b2b3987 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Cholesky/LLT.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Cholesky/LLT.h @@ -13,6 +13,16 @@ namespace Eigen { namespace internal{ + +template struct traits > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; + template struct LLT_Traits; } @@ -54,18 +64,17 @@ template struct LLT_Traits; * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ template class LLT + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; enum { PacketSize = internal::packet_traits::size, @@ -100,7 +109,7 @@ template class LLT compute(matrix.derived()); } - /** \brief Constructs a LDLT factorization from a given matrix + /** \brief Constructs a LLT factorization from a given matrix * * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when * \c MatrixType is a Eigen::Ref. @@ -129,6 +138,7 @@ template class LLT return Traits::getL(m_matrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A. * * Since this LLT class assumes anyway that the matrix A is invertible, the solution @@ -141,13 +151,8 @@ template class LLT */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif template void solveInPlace(const MatrixBase &bAndX) const; @@ -180,7 +185,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears not to be positive definite. */ ComputationInfo info() const @@ -194,18 +199,20 @@ template class LLT * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: * \code x = decomposition.adjoint().solve(b) \endcode */ - const LLT& adjoint() const { return *this; }; + const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; }; - inline Index rows() const { return m_matrix.rows(); } - inline Index cols() const { return m_matrix.cols(); } + inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } template - LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); + LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -459,7 +466,7 @@ LLT& LLT::compute(const EigenBase */ template template -LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) +LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType); eigen_assert(v.size()==m_matrix.cols()); @@ -477,8 +484,17 @@ template template void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - dst = rhs; - solveInPlace(dst); + _solve_impl_transposed(rhs, dst); +} + +template +template +void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + dst = rhs; + + matrixL().template conjugateIf().solveInPlace(dst); + matrixU().template conjugateIf().solveInPlace(dst); } #endif diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArithmeticSequence.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArithmeticSequence.h new file mode 100644 index 0000000000..b6200fac1b --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArithmeticSequence.h @@ -0,0 +1,413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARITHMETIC_SEQUENCE_H +#define EIGEN_ARITHMETIC_SEQUENCE_H + +namespace Eigen { + +namespace internal { + +#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) +template struct aseq_negate {}; + +template<> struct aseq_negate { + typedef Index type; +}; + +template struct aseq_negate > { + typedef FixedInt<-N> type; +}; + +// Compilation error in the following case: +template<> struct aseq_negate > {}; + +template::value, + bool SizeIsSymbolic =symbolic::is_symbolic::value> +struct aseq_reverse_first_type { + typedef Index type; +}; + +template +struct aseq_reverse_first_type { + typedef symbolic::AddExpr > >, + symbolic::ValueExpr > + > type; +}; + +template +struct aseq_reverse_first_type_aux { + typedef Index type; +}; + +template +struct aseq_reverse_first_type_aux::type> { + typedef FixedInt<(SizeType::value-1)*IncrType::value> type; +}; + +template +struct aseq_reverse_first_type { + typedef typename aseq_reverse_first_type_aux::type Aux; + typedef symbolic::AddExpr > type; +}; + +template +struct aseq_reverse_first_type { + typedef symbolic::AddExpr > >, + symbolic::ValueExpr >, + symbolic::ValueExpr<> > type; +}; +#endif + +// Helper to cleanup the type of the increment: +template struct cleanup_seq_incr { + typedef typename cleanup_index_type::type type; +}; + +} + +//-------------------------------------------------------------------------------- +// seq(first,last,incr) and seqN(first,size,incr) +//-------------------------------------------------------------------------------- + +template > +class ArithmeticSequence; + +template +ArithmeticSequence::type, + typename internal::cleanup_index_type::type, + typename internal::cleanup_seq_incr::type > +seqN(FirstType first, SizeType size, IncrType incr); + +/** \class ArithmeticSequence + * \ingroup Core_Module + * + * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by + * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride) + * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i. + * + * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments + * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the + * only way it is used. + * + * \tparam FirstType type of the first element, usually an Index, + * but internally it can be a symbolic expression + * \tparam SizeType type representing the size of the sequence, usually an Index + * or a compile time integral constant. Internally, it can also be a symbolic expression + * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1) + * + * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView + */ +template +class ArithmeticSequence +{ +public: + ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {} + ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {} + + enum { + SizeAtCompileTime = internal::get_fixed_value::value, + IncrAtCompileTime = internal::get_fixed_value::value + }; + + /** \returns the size, i.e., number of elements, of the sequence */ + Index size() const { return m_size; } + + /** \returns the first element \f$ a_0 \f$ in the sequence */ + Index first() const { return m_first; } + + /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */ + Index operator[](Index i) const { return m_first + i * m_incr; } + + const FirstType& firstObject() const { return m_first; } + const SizeType& sizeObject() const { return m_size; } + const IncrType& incrObject() const { return m_incr; } + +protected: + FirstType m_first; + SizeType m_size; + IncrType m_incr; + +public: + +#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) + auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) { + return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); + } +#else +protected: + typedef typename internal::aseq_negate::type ReverseIncrType; + typedef typename internal::aseq_reverse_first_type::type ReverseFirstType; +public: + ArithmeticSequence + reverse() const { + return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); + } +#endif +}; + +/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr + * + * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ +template +ArithmeticSequence::type,typename internal::cleanup_index_type::type,typename internal::cleanup_seq_incr::type > +seqN(FirstType first, SizeType size, IncrType incr) { + return ArithmeticSequence::type,typename internal::cleanup_index_type::type,typename internal::cleanup_seq_incr::type>(first,size,incr); +} + +/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment + * + * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */ +template +ArithmeticSequence::type,typename internal::cleanup_index_type::type > +seqN(FirstType first, SizeType size) { + return ArithmeticSequence::type,typename internal::cleanup_index_type::type>(first,size); +} + +#ifdef EIGEN_PARSED_BY_DOXYGEN + +/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr + * + * It is essentially an alias to: + * \code + * seqN(f, (l-f+incr)/incr, incr); + * \endcode + * + * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) + */ +template +auto seq(FirstType f, LastType l, IncrType incr); + +/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment + * + * It is essentially an alias to: + * \code + * seqN(f,l-f+1); + * \endcode + * + * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) + */ +template +auto seq(FirstType f, LastType l); + +#else // EIGEN_PARSED_BY_DOXYGEN + +#if EIGEN_HAS_CXX11 +template +auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + - typename internal::cleanup_index_type::type(f)+fix<1>()))) +{ + return seqN(typename internal::cleanup_index_type::type(f), + (typename internal::cleanup_index_type::type(l) + -typename internal::cleanup_index_type::type(f)+fix<1>())); +} + +template +auto seq(FirstType f, LastType l, IncrType incr) + -> decltype(seqN(typename internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + - typename internal::cleanup_index_type::type(f)+typename internal::cleanup_seq_incr::type(incr) + ) / typename internal::cleanup_seq_incr::type(incr), + typename internal::cleanup_seq_incr::type(incr))) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + -typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr), + CleanedIncrType(incr)); +} + +#else // EIGEN_HAS_CXX11 + +template +typename internal::enable_if::value || symbolic::is_symbolic::value), + ArithmeticSequence::type,Index> >::type +seq(FirstType f, LastType l) +{ + return seqN(typename internal::cleanup_index_type::type(f), + Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+fix<1>()))); +} + +template +typename internal::enable_if::value, + ArithmeticSequence,symbolic::ValueExpr<> >, + symbolic::ValueExpr > > > >::type +seq(const symbolic::BaseExpr &f, LastType l) +{ + return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+fix<1>())); +} + +template +typename internal::enable_if::value, + ArithmeticSequence::type, + symbolic::AddExpr >, + symbolic::ValueExpr > > > >::type +seq(FirstType f, const symbolic::BaseExpr &l) +{ + return seqN(typename internal::cleanup_index_type::type(f),(l.derived()-typename internal::cleanup_index_type::type(f)+fix<1>())); +} + +template +ArithmeticSequence >,symbolic::ValueExpr > > > +seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l) +{ + return seqN(f.derived(),(l.derived()-f.derived()+fix<1>())); +} + + +template +typename internal::enable_if::value || symbolic::is_symbolic::value), + ArithmeticSequence::type,Index,typename internal::cleanup_seq_incr::type> >::type +seq(FirstType f, LastType l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr); +} + +template +typename internal::enable_if::value, + ArithmeticSequence, + symbolic::ValueExpr<> >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> >::type +seq(const symbolic::BaseExpr &f, LastType l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} + +template +typename internal::enable_if::value, + ArithmeticSequence::type, + symbolic::QuotientExpr >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> >::type +seq(FirstType f, const symbolic::BaseExpr &l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + (l.derived()-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} + +template +ArithmeticSequence >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> +seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} +#endif // EIGEN_HAS_CXX11 + +#endif // EIGEN_PARSED_BY_DOXYGEN + + +#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN) +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr. + * + * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode + * + * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ +template +auto lastN(SizeType size, IncrType incr) +-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr)) +{ + return seqN(Eigen::last-(size-fix<1>())*incr, size, incr); +} + +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment. + * + * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode + * + * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */ +template +auto lastN(SizeType size) +-> decltype(seqN(Eigen::last+fix<1>()-size, size)) +{ + return seqN(Eigen::last+fix<1>()-size, size); +} +#endif + +namespace internal { + +// Convert a symbolic span into a usable one (i.e., remove last/end "keywords") +template +struct make_size_type { + typedef typename internal::conditional::value, Index, T>::type type; +}; + +template +struct IndexedViewCompatibleType, XprSize> { + typedef ArithmeticSequence::type,IncrType> type; +}; + +template +ArithmeticSequence::type,IncrType> +makeIndexedViewCompatible(const ArithmeticSequence& ids, Index size,SpecializedType) { + return ArithmeticSequence::type,IncrType>( + eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject()); +} + +template +struct get_compile_time_incr > { + enum { value = get_fixed_value::value }; +}; + +} // end namespace internal + +/** \namespace Eigen::indexing + * \ingroup Core_Module + * + * The sole purpose of this namespace is to be able to import all functions + * and symbols that are expected to be used within operator() for indexing + * and slicing. If you already imported the whole Eigen namespace: + * \code using namespace Eigen; \endcode + * then you are already all set. Otherwise, if you don't want/cannot import + * the whole Eigen namespace, the following line: + * \code using namespace Eigen::indexing; \endcode + * is equivalent to: + * \code + using Eigen::all; + using Eigen::seq; + using Eigen::seqN; + using Eigen::lastN; // c++11 only + using Eigen::last; + using Eigen::lastp1; + using Eigen::fix; + \endcode + */ +namespace indexing { + using Eigen::all; + using Eigen::seq; + using Eigen::seqN; + #if EIGEN_HAS_CXX11 + using Eigen::lastN; + #endif + using Eigen::last; + using Eigen::lastp1; + using Eigen::fix; +} + +} // end namespace Eigen + +#endif // EIGEN_ARITHMETIC_SEQUENCE_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Array.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Array.h index 16770fc7b3..20c789b10c 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Array.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Array.h @@ -117,7 +117,7 @@ class Array { return Base::_set(other); } - + /** Default constructor. * * For fixed-size matrices, does nothing. @@ -157,11 +157,50 @@ class Array EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { - other.swap(*this); + Base::operator=(std::move(other)); return *this; } #endif + #if EIGEN_HAS_CXX11 + /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + * + * Example: \include Array_variadic_ctor_cxx11.cpp + * Output: \verbinclude Array_variadic_ctor_cxx11.out + * + * \sa Array(const std::initializer_list>&) + * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) {} + + /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Array_initializer_list_23_cxx11.cpp + * Output: \verbinclude Array_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed. + * Therefore Array{{1,2,3,4,5}} is legal and the more verbose syntax + * Array{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Array_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Array_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes, + * and implicit transposition is allowed for compile-time 1D arrays only. + * + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const std::initializer_list>& list) : Base(list) {} + #endif // end EIGEN_HAS_CXX11 + #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -178,6 +217,7 @@ class Array Base::_check_template_params(); this->template _init2(val0, val1); } + #else /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); @@ -189,7 +229,8 @@ class Array */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Array(Index dim); - /** constructs an initialized 1x1 Array with the given coefficient */ + /** constructs an initialized 1x1 Array with the given coefficient + * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */ Array(const Scalar& value); /** constructs an uninitialized array with \a rows rows and \a cols columns. * @@ -197,11 +238,14 @@ class Array * it is redundant to pass these parameters, so one should use the default constructor * Array() instead. */ Array(Index rows, Index cols); - /** constructs an initialized 2D vector with given coefficients */ + /** constructs an initialized 2D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ Array(const Scalar& val0, const Scalar& val1); - #endif + #endif // end EIGEN_PARSED_BY_DOXYGEN - /** constructs an initialized 3D vector with given coefficients */ + /** constructs an initialized 3D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) { @@ -211,7 +255,9 @@ class Array m_storage.data()[1] = val1; m_storage.data()[2] = val2; } - /** constructs an initialized 4D vector with given coefficients */ + /** constructs an initialized 4D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) { @@ -242,8 +288,10 @@ class Array : Base(other.derived()) { } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT{ return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); } #ifdef EIGEN_ARRAY_PLUGIN #include EIGEN_ARRAY_PLUGIN @@ -258,7 +306,7 @@ class Array /** \defgroup arraytypedefs Global array typedefs * \ingroup Core_Module * - * Eigen defines several typedef shortcuts for most common 1D and 2D array types. + * %Eigen defines several typedef shortcuts for most common 1D and 2D array types. * * The general patterns are the following: * @@ -271,6 +319,12 @@ class Array * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is * a fixed-size 1D array of 4 complex floats. * + * With \cpp11, template alias are also defined for common sizes. + * They follow the same pattern as above except that the scalar type suffix is replaced by a + * template parameter, i.e.: + * - `ArrayRowsCols` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size. + * - `ArraySize` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays. + * * \sa class Array */ @@ -303,8 +357,42 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES #undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS -#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE +#if EIGEN_HAS_CXX11 + +#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##SizeSuffix##SizeSuffix = Array; \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##SizeSuffix = Array; + +#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size) \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##Size##X = Array; \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##X##Size = Array; + +EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2) +EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3) +EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4) +EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4) + +#undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS + +#endif // EIGEN_HAS_CXX11 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \ using Eigen::Matrix##SizeSuffix##TypeSuffix; \ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArrayBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArrayBase.h index 33f644e213..ea3dd1c3b3 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArrayBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArrayBase.h @@ -69,6 +69,7 @@ template class ArrayBase using Base::coeff; using Base::coeffRef; using Base::lazyAssign; + using Base::operator-; using Base::operator=; using Base::operator+=; using Base::operator-=; @@ -88,7 +89,6 @@ template class ArrayBase #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase #define EIGEN_DOC_UNARY_ADDONS(X,Y) -# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/MatrixCwiseUnaryOps.h" # include "../plugins/ArrayCwiseUnaryOps.h" # include "../plugins/CommonCwiseBinaryOps.h" diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArrayWrapper.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArrayWrapper.h index 688aadd626..2e9555b537 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArrayWrapper.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ArrayWrapper.h @@ -10,7 +10,7 @@ #ifndef EIGEN_ARRAYWRAPPER_H #define EIGEN_ARRAYWRAPPER_H -namespace Eigen { +namespace Eigen { /** \class ArrayWrapper * \ingroup Core_Module @@ -60,14 +60,14 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } @@ -90,9 +90,9 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { dst = m_expression; } - const typename internal::remove_all::type& EIGEN_DEVICE_FUNC - nestedExpression() const + const typename internal::remove_all::type& + nestedExpression() const { return m_expression; } @@ -158,14 +158,14 @@ class MatrixWrapper : public MatrixBase > EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } @@ -185,8 +185,8 @@ class MatrixWrapper : public MatrixBase > } EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - nestedExpression() const + const typename internal::remove_all::type& + nestedExpression() const { return m_expression; } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Assign.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Assign.h index 53806ba33c..655412efd7 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Assign.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Assign.h @@ -16,7 +16,7 @@ namespace Eigen { template template -EIGEN_STRONG_INLINE Derived& DenseBase +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase ::lazyAssign(const DenseBase& other) { enum{ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/AssignEvaluator.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/AssignEvaluator.h index dbe435d86b..7d76f0c256 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/AssignEvaluator.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/AssignEvaluator.h @@ -17,24 +17,24 @@ namespace Eigen { // This implementation is based on Assign.h namespace internal { - + /*************************************************************************** * Part 1 : the logic deciding a strategy for traversal and unrolling * ***************************************************************************/ // copy_using_evaluator_traits is based on assign_traits -template +template struct copy_using_evaluator_traits { typedef typename DstEvaluator::XprType Dst; typedef typename Dst::Scalar DstScalar; - + enum { DstFlags = DstEvaluator::Flags, SrcFlags = SrcEvaluator::Flags }; - + public: enum { DstAlignment = DstEvaluator::Alignment, @@ -51,13 +51,15 @@ private: InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime) : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime) : int(Dst::MaxRowsAtCompileTime), + RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize), + RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize), OuterStride = int(outer_stride_at_compile_time::ret), MaxSizeAtCompileTime = Dst::SizeAtCompileTime }; // TODO distinguish between linear traversal and inner-traversals - typedef typename find_best_packet::type LinearPacketType; - typedef typename find_best_packet::type InnerPacketType; + typedef typename find_best_packet::type LinearPacketType; + typedef typename find_best_packet::type InnerPacketType; enum { LinearPacketSize = unpacket_traits::size, @@ -97,7 +99,8 @@ private: public: enum { - Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + Traversal = int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time. + : (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal) : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) @@ -135,7 +138,7 @@ public: ? int(CompleteUnrolling) : int(NoUnrolling) ) : int(Traversal) == int(LinearTraversal) - ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) + ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) ) #if EIGEN_UNALIGNED_VECTORIZE : int(Traversal) == int(SliceVectorizedTraversal) @@ -172,6 +175,8 @@ public: EIGEN_DEBUG_VAR(MaySliceVectorize) std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime) EIGEN_DEBUG_VAR(UnrollingLimit) EIGEN_DEBUG_VAR(MayUnrollCompletely) EIGEN_DEBUG_VAR(MayUnrollInner) @@ -195,7 +200,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling // FIXME: this is not very clean, perhaps this information should be provided by the kernel? typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; - + enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime @@ -261,7 +266,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; - + enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime, @@ -312,6 +317,22 @@ template struct dense_assignment_loop; +/************************ +***** Special Cases ***** +************************/ + +// Zero-sized assignment is a no-op. +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT) + } +}; + /************************ *** Default traversal *** ************************/ @@ -426,10 +447,10 @@ struct dense_assignment_loop::size, - alignedSize = (size/packetSize)*packetSize }; + alignedSize = (int(size)/packetSize)*packetSize }; copy_using_evaluator_innervec_CompleteUnrolling::run(kernel); copy_using_evaluator_DefaultTraversal_CompleteUnrolling::run(kernel); @@ -530,7 +551,7 @@ struct dense_assignment_loop const Scalar *dst_ptr = kernel.dstDataPtr(); if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0) { - // the pointer is not aligend-on scalar, so alignment is not possible + // the pointer is not aligned-on scalar, so alignment is not possible return dense_assignment_loop::run(kernel); } const Index packetAlignedMask = packetSize - 1; @@ -568,14 +589,15 @@ struct dense_assignment_loop typedef typename Kernel::DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; - enum { size = DstXprType::InnerSizeAtCompileTime, + enum { innerSize = DstXprType::InnerSizeAtCompileTime, packetSize =unpacket_traits::size, - vectorizableSize = (size/packetSize)*packetSize }; + vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize), + size = DstXprType::SizeAtCompileTime }; for(Index outer = 0; outer < kernel.outerSize(); ++outer) { copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); - copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); + copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); } } }; @@ -599,73 +621,74 @@ protected: typedef typename DstEvaluatorTypeT::XprType DstXprType; typedef typename SrcEvaluatorTypeT::XprType SrcXprType; public: - + typedef DstEvaluatorTypeT DstEvaluatorType; typedef SrcEvaluatorTypeT SrcEvaluatorType; typedef typename DstEvaluatorType::Scalar Scalar; typedef copy_using_evaluator_traits AssignmentTraits; typedef typename AssignmentTraits::PacketType PacketType; - - - EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) { #ifdef EIGEN_DEBUG_ASSIGN AssignmentTraits::debug(); #endif } - - EIGEN_DEVICE_FUNC Index size() const { return m_dstExpr.size(); } - EIGEN_DEVICE_FUNC Index innerSize() const { return m_dstExpr.innerSize(); } - EIGEN_DEVICE_FUNC Index outerSize() const { return m_dstExpr.outerSize(); } - EIGEN_DEVICE_FUNC Index rows() const { return m_dstExpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_dstExpr.cols(); } - EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); } - - EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; } - EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; } - + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); } + + EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; } + EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; } + /// Assign src(row,col) to dst(row,col) through the assignment functor. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col) { m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col)); } - + /// \sa assignCoeff(Index,Index) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index) { m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index)); } - + /// \sa assignCoeff(Index,Index) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner) { - Index row = rowIndexByOuterInner(outer, inner); - Index col = colIndexByOuterInner(outer, inner); + Index row = rowIndexByOuterInner(outer, inner); + Index col = colIndexByOuterInner(outer, inner); assignCoeff(row, col); } - - + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) { m_functor.template assignPacket(&m_dst.coeffRef(row,col), m_src.template packet(row,col)); } - + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index) { m_functor.template assignPacket(&m_dst.coeffRef(index), m_src.template packet(index)); } - + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { - Index row = rowIndexByOuterInner(outer, inner); + Index row = rowIndexByOuterInner(outer, inner); Index col = colIndexByOuterInner(outer, inner); assignPacket(row, col); } - + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; @@ -688,7 +711,7 @@ public: { return m_dstExpr.data(); } - + protected: DstEvaluatorType& m_dst; const SrcEvaluatorType& m_src; @@ -697,6 +720,27 @@ protected: DstXprType& m_dstExpr; }; +// Special kernel used when computing small products whose operands have dynamic dimensions. It ensures that the +// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used +// when computing the product. + +template +class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel +{ +protected: + typedef generic_dense_assignment_kernel Base; + public: + typedef typename Base::Scalar Scalar; + typedef typename Base::DstXprType DstXprType; + typedef copy_using_evaluator_traits AssignmentTraits; + typedef typename AssignmentTraits::PacketType PacketType; + + EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + : Base(dst, src, func, dstExpr) + { + } + }; + /*************************************************************************** * Part 5 : Entry point for dense rectangular assignment ***************************************************************************/ @@ -734,13 +778,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType resize_if_allowed(dst, src, func); DstEvaluatorType dstEvaluator(dst); - + typedef generic_dense_assignment_kernel Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); dense_assignment_loop::run(kernel); } +// Specialization for filling the destination with a constant value. +#ifndef EIGEN_GPU_COMPILE_PHASE +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp, DstXprType>& src, const internal::assign_op& func) +{ + resize_if_allowed(dst, src, func); + std::fill_n(dst.data(), dst.size(), src.functor()()); +} +#endif + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) { @@ -756,13 +810,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType // AssignmentKind must define a Kind typedef. template struct AssignmentKind; -// Assignement kind defined in this file: +// Assignment kind defined in this file: struct Dense2Dense {}; struct EigenBase2EigenBase {}; template struct AssignmentKind { typedef EigenBase2EigenBase Kind; }; template<> struct AssignmentKind { typedef Dense2Dense Kind; }; - + // This is the main assignment class template< typename DstXprType, typename SrcXprType, typename Functor, typename Kind = typename AssignmentKind< typename evaluator_traits::Shape , typename evaluator_traits::Shape >::Kind, @@ -787,7 +841,7 @@ void call_assignment(const Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op()); } - + // Deal with "assume-aliasing" template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -827,14 +881,35 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) typedef typename internal::conditional, Dst>::type ActualDstTypeCleaned; typedef typename internal::conditional, Dst&>::type ActualDstType; ActualDstType actualDst(dst); - + // TODO check whether this is the right place to perform these checks: EIGEN_STATIC_ASSERT_LVALUE(Dst) EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src) EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar); - + Assignment::run(actualDst, src, func); } + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func) +{ + typedef evaluator DstEvaluatorType; + typedef evaluator SrcEvaluatorType; + typedef restricted_packet_dense_assignment_kernel Kernel; + + EIGEN_STATIC_ASSERT_LVALUE(Dst) + EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar); + + SrcEvaluatorType srcEvaluator(src); + resize_if_allowed(dst, src, func); + + DstEvaluatorType dstEvaluator(dst); + Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); + + dense_assignment_loop::run(kernel); +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment_no_alias(Dst& dst, const Src& src) @@ -875,7 +950,7 @@ struct Assignment #ifndef EIGEN_NO_DEBUG internal::check_for_aliasing(dst, src); #endif - + call_dense_assignment_loop(dst, src, func); } }; @@ -899,7 +974,7 @@ struct Assignment src.evalTo(dst); } - // NOTE The following two functions are templated to avoid their instanciation if not needed + // NOTE The following two functions are templated to avoid their instantiation if not needed // This is needed because some expressions supports evalTo only and/or have 'void' as scalar type. template EIGEN_DEVICE_FUNC diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/BandMatrix.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/BandMatrix.h index 4978c91405..878c0240ac 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/BandMatrix.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/BandMatrix.h @@ -10,7 +10,7 @@ #ifndef EIGEN_BANDMATRIX_H #define EIGEN_BANDMATRIX_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -45,7 +45,7 @@ class BandMatrixBase : public EigenBase }; public: - + using Base::derived; using Base::rows; using Base::cols; @@ -55,10 +55,10 @@ class BandMatrixBase : public EigenBase /** \returns the number of sub diagonals */ inline Index subs() const { return derived().subs(); } - + /** \returns an expression of the underlying coefficient matrix */ inline const CoefficientsType& coeffs() const { return derived().coeffs(); } - + /** \returns an expression of the underlying coefficient matrix */ inline CoefficientsType& coeffs() { return derived().coeffs(); } @@ -67,7 +67,7 @@ class BandMatrixBase : public EigenBase * \warning the internal storage must be column major. */ inline Block col(Index i) { - EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); Index start = 0; Index len = coeffs().rows(); if (i<=supers()) @@ -90,7 +90,7 @@ class BandMatrixBase : public EigenBase template struct DiagonalIntReturnType { enum { - ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)), + ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)), Conjugate = ReturnOpposite && NumTraits::IsComplex, ActualIndex = ReturnOpposite ? -Index : Index, DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic) @@ -130,7 +130,7 @@ class BandMatrixBase : public EigenBase eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers())); return Block(coeffs(), supers()-i, std::max(0,i), 1, diagonalLength(i)); } - + template inline void evalTo(Dest& dst) const { dst.resize(rows(),cols()); @@ -192,7 +192,7 @@ struct traits > Options = _Options, DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic }; - typedef Matrix CoefficientsType; + typedef Matrix CoefficientsType; }; template @@ -211,16 +211,16 @@ class BandMatrix : public BandMatrixBase @@ -52,7 +52,7 @@ struct traits > : traits::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit, // FIXME DirectAccessBit should not be handled by expressions - // + // // Alignment is needed by MapBase's assertions // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator Alignment = 0 @@ -61,7 +61,7 @@ struct traits > : traits::ret> class BlockImpl_dense; - + } // end namespace internal template class BlockImpl; @@ -109,13 +109,13 @@ template class typedef Impl Base; EIGEN_GENERIC_PUBLIC_INTERFACE(Block) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) - + typedef typename internal::remove_all::type NestedExpression; - + /** Column or Row constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, Index i) : Impl(xpr,i) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index i) : Impl(xpr,i) { eigen_assert( (i>=0) && ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i class /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) { EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) @@ -135,8 +135,8 @@ template class /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) @@ -147,7 +147,7 @@ template class && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols); } }; - + // The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense // that must be specialized for direct and non-direct access... template @@ -159,10 +159,10 @@ class BlockImpl public: typedef Impl Base; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) - EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} - EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} EIGEN_DEVICE_FUNC - inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) {} }; @@ -294,25 +294,25 @@ template::type& nestedExpression() const - { - return m_xpr; + { + return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } - - EIGEN_DEVICE_FUNC - StorageIndex startRow() const - { - return m_startRow.value(); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startRow() const EIGEN_NOEXCEPT + { + return m_startRow.value(); } - - EIGEN_DEVICE_FUNC - StorageIndex startCol() const - { - return m_startCol.value(); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startCol() const EIGEN_NOEXCEPT + { + return m_startCol.value(); } protected: @@ -342,9 +342,9 @@ class BlockImpl_dense /** Column or Row constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, Index i) - : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index i) + : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), BlockRows==1 ? 1 : xpr.rows(), BlockCols==1 ? 1 : xpr.cols()), @@ -357,8 +357,8 @@ class BlockImpl_dense /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { @@ -367,8 +367,8 @@ class BlockImpl_dense /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), @@ -377,18 +377,18 @@ class BlockImpl_dense init(); } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& nestedExpression() const - { - return m_xpr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& nestedExpression() const EIGEN_NOEXCEPT + { + return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } - + /** \sa MapBase::innerStride() */ - EIGEN_DEVICE_FUNC - inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index innerStride() const EIGEN_NOEXCEPT { return internal::traits::HasSameStorageOrderAsXprType ? m_xpr.innerStride() @@ -396,23 +396,19 @@ class BlockImpl_dense } /** \sa MapBase::outerStride() */ - EIGEN_DEVICE_FUNC - inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index outerStride() const EIGEN_NOEXCEPT { - return m_outerStride; + return internal::traits::HasSameStorageOrderAsXprType + ? m_xpr.outerStride() + : m_xpr.innerStride(); } - EIGEN_DEVICE_FUNC - StorageIndex startRow() const - { - return m_startRow.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startRow() const EIGEN_NOEXCEPT { return m_startRow.value(); } - EIGEN_DEVICE_FUNC - StorageIndex startCol() const - { - return m_startCol.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startCol() const EIGEN_NOEXCEPT { return m_startCol.value(); } #ifndef __SUNPRO_CC // FIXME sunstudio is not friendly with the above friend... @@ -422,8 +418,8 @@ class BlockImpl_dense #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal used by allowAligned() */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) : Base(data, blockRows, blockCols), m_xpr(xpr) { init(); @@ -431,7 +427,7 @@ class BlockImpl_dense #endif protected: - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void init() { m_outerStride = internal::traits::HasSameStorageOrderAsXprType diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/BooleanRedux.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/BooleanRedux.h index 8409d8749a..852de8b90a 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/BooleanRedux.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/BooleanRedux.h @@ -14,58 +14,56 @@ namespace Eigen { namespace internal { -template +template struct all_unroller { - typedef typename Derived::ExpressionTraits Traits; enum { - col = (UnrollCount-1) / Traits::RowsAtCompileTime, - row = (UnrollCount-1) % Traits::RowsAtCompileTime + col = (UnrollCount-1) / Rows, + row = (UnrollCount-1) % Rows }; - static inline bool run(const Derived &mat) + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(row, col); } }; -template -struct all_unroller +template +struct all_unroller { - static inline bool run(const Derived &/*mat*/) { return true; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { - static inline bool run(const Derived &) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { - typedef typename Derived::ExpressionTraits Traits; enum { - col = (UnrollCount-1) / Traits::RowsAtCompileTime, - row = (UnrollCount-1) % Traits::RowsAtCompileTime + col = (UnrollCount-1) / Rows, + row = (UnrollCount-1) % Rows }; - static inline bool run(const Derived &mat) + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(row, col); } }; -template -struct any_unroller +template +struct any_unroller { - static inline bool run(const Derived & /*mat*/) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { - static inline bool run(const Derived &) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; } // end namespace internal @@ -78,16 +76,16 @@ struct any_unroller * \sa any(), Cwise::operator<() */ template -inline bool DenseBase::all() const +EIGEN_DEVICE_FUNC inline bool DenseBase::all() const { typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::run(evaluator); + return internal::all_unroller::RowsAtCompileTime>::run(evaluator); else { for(Index j = 0; j < cols(); ++j) @@ -102,16 +100,16 @@ inline bool DenseBase::all() const * \sa all() */ template -inline bool DenseBase::any() const +EIGEN_DEVICE_FUNC inline bool DenseBase::any() const { typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::run(evaluator); + return internal::any_unroller::RowsAtCompileTime>::run(evaluator); else { for(Index j = 0; j < cols(); ++j) @@ -126,7 +124,7 @@ inline bool DenseBase::any() const * \sa all(), any() */ template -inline Eigen::Index DenseBase::count() const +EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const { return derived().template cast().template cast().sum(); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CommaInitializer.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CommaInitializer.h index d218e98143..c0e29c75c2 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CommaInitializer.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CommaInitializer.h @@ -33,6 +33,8 @@ struct CommaInitializer inline CommaInitializer(XprType& xpr, const Scalar& s) : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1) { + eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0 + && "Cannot comma-initialize a 0x0 matrix (operator<<)"); m_xpr.coeffRef(0,0) = s; } @@ -41,6 +43,8 @@ struct CommaInitializer inline CommaInitializer(XprType& xpr, const DenseBase& other) : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows()) { + eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols() + && "Cannot comma-initialize a 0x0 matrix (operator<<)"); m_xpr.block(0, 0, other.rows(), other.cols()) = other; } @@ -103,7 +107,7 @@ struct CommaInitializer EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception) #endif { - finished(); + finished(); } /** \returns the built matrix once all its coefficients have been set. @@ -141,7 +145,7 @@ struct CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer */ template -inline CommaInitializer DenseBase::operator<< (const Scalar& s) +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<< (const Scalar& s) { return CommaInitializer(*static_cast(this), s); } @@ -149,7 +153,7 @@ inline CommaInitializer DenseBase::operator<< (const Scalar& s /** \sa operator<<(const Scalar&) */ template template -inline CommaInitializer +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<<(const DenseBase& other) { return CommaInitializer(*static_cast(this), other); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CoreEvaluators.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CoreEvaluators.h index 910889efa7..0ff8c8deb8 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CoreEvaluators.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CoreEvaluators.h @@ -14,7 +14,7 @@ #define EIGEN_COREEVALUATORS_H namespace Eigen { - + namespace internal { // This class returns the evaluator kind from the expression storage kind. @@ -63,8 +63,8 @@ template< typename T, template< typename T, typename Kind = typename evaluator_traits::Kind, typename Scalar = typename T::Scalar> struct unary_evaluator; - -// evaluator_traits contains traits for evaluator + +// evaluator_traits contains traits for evaluator template struct evaluator_traits_base @@ -90,7 +90,8 @@ template struct evaluator : public unary_evaluator { typedef unary_evaluator Base; - EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const T& xpr) : Base(xpr) {} }; @@ -99,21 +100,29 @@ template struct evaluator : evaluator { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : evaluator(xpr) {} }; // ---------- base class for all evaluators ---------- template -struct evaluator_base : public noncopyable +struct evaluator_base { // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices. typedef traits ExpressionTraits; - + enum { Alignment = 0 }; + // noncopyable: + // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization) + // and make complex evaluator much larger than then should do. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {} +private: + EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&); + EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&); }; // -------------------- Matrix and Array -------------------- @@ -123,6 +132,33 @@ struct evaluator_base : public noncopyable // Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense, // so no need for more sophisticated dispatching. +// this helper permits to completely eliminate m_outerStride if it is known at compiletime. +template class plainobjectbase_evaluator_data { +public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) + { +#ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(outerStride); +#endif + eigen_internal_assert(outerStride==OuterStride); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; } + const Scalar *data; +}; + +template class plainobjectbase_evaluator_data { +public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return m_outerStride; } + const Scalar *data; +protected: + Index m_outerStride; +}; + template struct evaluator > : evaluator_base @@ -136,23 +172,28 @@ struct evaluator > IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime, RowsAtCompileTime = PlainObjectType::RowsAtCompileTime, ColsAtCompileTime = PlainObjectType::ColsAtCompileTime, - + CoeffReadCost = NumTraits::ReadCost, Flags = traits::EvaluatorFlags, Alignment = traits::Alignment }; - - EIGEN_DEVICE_FUNC evaluator() - : m_data(0), - m_outerStride(IsVectorAtCompileTime ? 0 - : int(IsRowMajor) ? ColsAtCompileTime - : RowsAtCompileTime) + enum { + // We do not need to know the outer stride for vectors + OuterStrideAtCompileTime = IsVectorAtCompileTime ? 0 + : int(IsRowMajor) ? ColsAtCompileTime + : RowsAtCompileTime + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() + : m_d(0,OuterStrideAtCompileTime) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - - EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m) - : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const PlainObjectType& m) + : m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride()) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -161,30 +202,30 @@ struct evaluator > CoeffReturnType coeff(Index row, Index col) const { if (IsRowMajor) - return m_data[row * m_outerStride.value() + col]; + return m_d.data[row * m_d.outerStride() + col]; else - return m_data[row + col * m_outerStride.value()]; + return m_d.data[row + col * m_d.outerStride()]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_data[index]; + return m_d.data[index]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { if (IsRowMajor) - return const_cast(m_data)[row * m_outerStride.value() + col]; + return const_cast(m_d.data)[row * m_d.outerStride() + col]; else - return const_cast(m_data)[row + col * m_outerStride.value()]; + return const_cast(m_d.data)[row + col * m_d.outerStride()]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return const_cast(m_data)[index]; + return const_cast(m_d.data)[index]; } template @@ -192,16 +233,16 @@ struct evaluator > PacketType packet(Index row, Index col) const { if (IsRowMajor) - return ploadt(m_data + row * m_outerStride.value() + col); + return ploadt(m_d.data + row * m_d.outerStride() + col); else - return ploadt(m_data + row + col * m_outerStride.value()); + return ploadt(m_d.data + row + col * m_d.outerStride()); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return ploadt(m_data + index); + return ploadt(m_d.data + index); } template @@ -210,26 +251,22 @@ struct evaluator > { if (IsRowMajor) return pstoret - (const_cast(m_data) + row * m_outerStride.value() + col, x); + (const_cast(m_d.data) + row * m_d.outerStride() + col, x); else return pstoret - (const_cast(m_data) + row + col * m_outerStride.value(), x); + (const_cast(m_d.data) + row + col * m_d.outerStride(), x); } template EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return pstoret(const_cast(m_data) + index, x); + return pstoret(const_cast(m_d.data) + index, x); } protected: - const Scalar *m_data; - // We do not need to know the outer stride for vectors - variable_if_dynamic m_outerStride; + plainobjectbase_evaluator_data m_d; }; template @@ -237,11 +274,13 @@ struct evaluator > : evaluator > > { typedef Matrix XprType; - - EIGEN_DEVICE_FUNC evaluator() {} - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) - : evaluator >(m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& m) + : evaluator >(m) { } }; @@ -251,10 +290,12 @@ struct evaluator > { typedef Array XprType; - EIGEN_DEVICE_FUNC evaluator() {} - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) - : evaluator >(m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& m) + : evaluator >(m) { } }; @@ -265,14 +306,15 @@ struct unary_evaluator, IndexBased> : evaluator_base > { typedef Transpose XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost, + CoeffReadCost = evaluator::CoeffReadCost, Flags = evaluator::Flags ^ RowMajorBit, Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -457,10 +499,10 @@ struct evaluator > { typedef CwiseNullaryOp XprType; typedef typename internal::remove_all::type PlainObjectTypeCleaned; - + enum { CoeffReadCost = internal::functor_traits::Cost, - + Flags = (evaluator::Flags & ( HereditaryBits | (functor_has_linear_access::ret ? LinearAccessBit : 0) @@ -517,19 +559,17 @@ struct unary_evaluator, IndexBased > : evaluator_base > { typedef CwiseUnaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = evaluator::Flags & (HereditaryBits | LinearAccessBit | (functor_traits::PacketAccess ? PacketAccessBit : 0)), Alignment = evaluator::Alignment }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& op) - : m_functor(op.functor()), - m_argImpl(op.nestedExpression()) + explicit unary_evaluator(const XprType& op) : m_d(op) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -540,32 +580,43 @@ struct unary_evaluator, IndexBased > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_argImpl.coeff(row, col)); + return m_d.func()(m_d.argImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_argImpl.coeff(index)); + return m_d.func()(m_d.argImpl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_argImpl.template packet(row, col)); + return m_d.func().packetOp(m_d.argImpl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_argImpl.template packet(index)); + return m_d.func().packetOp(m_d.argImpl.template packet(index)); } protected: - const UnaryOp m_functor; - evaluator m_argImpl; + + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const UnaryOp& func() const { return op; } + UnaryOp op; + evaluator argImpl; + }; + + Data m_d; }; // -------------------- CwiseTernaryOp -------------------- @@ -577,7 +628,7 @@ struct evaluator > { typedef CwiseTernaryOp XprType; typedef ternary_evaluator > Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} }; @@ -586,10 +637,10 @@ struct ternary_evaluator, IndexBased : evaluator_base > { typedef CwiseTernaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Arg1Flags = evaluator::Flags, Arg2Flags = evaluator::Flags, Arg3Flags = evaluator::Flags, @@ -609,11 +660,7 @@ struct ternary_evaluator, IndexBased evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) - : m_functor(xpr.functor()), - m_arg1Impl(xpr.arg1()), - m_arg2Impl(xpr.arg2()), - m_arg3Impl(xpr.arg3()) + EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -624,38 +671,48 @@ struct ternary_evaluator, IndexBased EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col)); + return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_arg1Impl.template packet(row, col), - m_arg2Impl.template packet(row, col), - m_arg3Impl.template packet(row, col)); + return m_d.func().packetOp(m_d.arg1Impl.template packet(row, col), + m_d.arg2Impl.template packet(row, col), + m_d.arg3Impl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_arg1Impl.template packet(index), - m_arg2Impl.template packet(index), - m_arg3Impl.template packet(index)); + return m_d.func().packetOp(m_d.arg1Impl.template packet(index), + m_d.arg2Impl.template packet(index), + m_d.arg3Impl.template packet(index)); } protected: - const TernaryOp m_functor; - evaluator m_arg1Impl; - evaluator m_arg2Impl; - evaluator m_arg3Impl; + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TernaryOp& func() const { return op; } + TernaryOp op; + evaluator arg1Impl; + evaluator arg2Impl; + evaluator arg3Impl; + }; + + Data m_d; }; // -------------------- CwiseBinaryOp -------------------- @@ -667,8 +724,9 @@ struct evaluator > { typedef CwiseBinaryOp XprType; typedef binary_evaluator > Base; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& xpr) : Base(xpr) {} }; template @@ -676,10 +734,10 @@ struct binary_evaluator, IndexBased, IndexBase : evaluator_base > { typedef CwiseBinaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + LhsFlags = evaluator::Flags, RhsFlags = evaluator::Flags, SameType = is_same::value, @@ -696,10 +754,8 @@ struct binary_evaluator, IndexBased, IndexBase Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment,evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr) - : m_functor(xpr.functor()), - m_lhsImpl(xpr.lhs()), - m_rhsImpl(xpr.rhs()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit binary_evaluator(const XprType& xpr) : m_d(xpr) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -710,35 +766,46 @@ struct binary_evaluator, IndexBased, IndexBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col)); + return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index)); + return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_lhsImpl.template packet(row, col), - m_rhsImpl.template packet(row, col)); + return m_d.func().packetOp(m_d.lhsImpl.template packet(row, col), + m_d.rhsImpl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_lhsImpl.template packet(index), - m_rhsImpl.template packet(index)); + return m_d.func().packetOp(m_d.lhsImpl.template packet(index), + m_d.rhsImpl.template packet(index)); } protected: - const BinaryOp m_functor; - evaluator m_lhsImpl; - evaluator m_rhsImpl; + + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const BinaryOp& func() const { return op; } + BinaryOp op; + evaluator lhsImpl; + evaluator rhsImpl; + }; + + Data m_d; }; // -------------------- CwiseUnaryView -------------------- @@ -748,18 +815,16 @@ struct unary_evaluator, IndexBased> : evaluator_base > { typedef CwiseUnaryView XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = (evaluator::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)), - + Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost... }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) - : m_unaryOp(op.functor()), - m_argImpl(op.nestedExpression()) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -771,30 +836,41 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_unaryOp(m_argImpl.coeff(row, col)); + return m_d.func()(m_d.argImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_unaryOp(m_argImpl.coeff(index)); + return m_d.func()(m_d.argImpl.coeff(index)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { - return m_unaryOp(m_argImpl.coeffRef(row, col)); + return m_d.func()(m_d.argImpl.coeffRef(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return m_unaryOp(m_argImpl.coeffRef(index)); + return m_d.func()(m_d.argImpl.coeffRef(index)); } protected: - const UnaryOp m_unaryOp; - evaluator m_argImpl; + + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const UnaryOp& func() const { return op; } + UnaryOp op; + evaluator argImpl; + }; + + Data m_d; }; // -------------------- Map -------------------- @@ -811,14 +887,15 @@ struct mapbase_evaluator : evaluator_base typedef typename XprType::PointerType PointerType; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - + enum { IsRowMajor = XprType::RowsAtCompileTime, ColsAtCompileTime = XprType::ColsAtCompileTime, CoeffReadCost = NumTraits::ReadCost }; - EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit mapbase_evaluator(const XprType& map) : m_data(const_cast(map.data())), m_innerStride(map.innerStride()), m_outerStride(map.outerStride()) @@ -882,17 +959,21 @@ struct mapbase_evaluator : evaluator_base internal::pstoret(m_data + index * m_innerStride.value(), x); } protected: - EIGEN_DEVICE_FUNC - inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } - EIGEN_DEVICE_FUNC - inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowStride() const EIGEN_NOEXCEPT { + return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colStride() const EIGEN_NOEXCEPT { + return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); + } PointerType m_data; const internal::variable_if_dynamic m_innerStride; const internal::variable_if_dynamic m_outerStride; }; -template +template struct evaluator > : public mapbase_evaluator, PlainObjectType> { @@ -900,7 +981,7 @@ struct evaluator > typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types typedef typename packet_traits::type PacketScalar; - + enum { InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? int(PlainObjectType::InnerStrideAtCompileTime) @@ -912,34 +993,35 @@ struct evaluator > HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0, HasNoStride = HasNoInnerStride && HasNoOuterStride, IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic, - + PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit), LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit), Flags = int( evaluator::Flags) & (LinearAccessMask&PacketAccessMask), - + Alignment = int(MapOptions)&int(AlignedMask) }; EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map) - : mapbase_evaluator(map) + : mapbase_evaluator(map) { } }; // -------------------- Ref -------------------- -template +template struct evaluator > : public mapbase_evaluator, PlainObjectType> { typedef Ref XprType; - + enum { Flags = evaluator >::Flags, Alignment = evaluator >::Alignment }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref) - : mapbase_evaluator(ref) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& ref) + : mapbase_evaluator(ref) { } }; @@ -947,8 +1029,8 @@ struct evaluator > template::ret> struct block_evaluator; - -template + +template struct evaluator > : block_evaluator { @@ -956,15 +1038,15 @@ struct evaluator > typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types typedef typename packet_traits::type PacketScalar; - + enum { CoeffReadCost = evaluator::CoeffReadCost, - + RowsAtCompileTime = traits::RowsAtCompileTime, ColsAtCompileTime = traits::ColsAtCompileTime, MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, MaxColsAtCompileTime = traits::MaxColsAtCompileTime, - + ArgTypeIsRowMajor = (int(evaluator::Flags)&RowMajorBit) != 0, IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1 : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0 @@ -978,14 +1060,14 @@ struct evaluator > ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0, - - FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, + + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, Flags0 = evaluator::Flags & ( (HereditaryBits & ~RowMajorBit) | DirectAccessBit | MaskPacketAccessBit), Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, - + PacketAlignment = unpacket_traits::alignment, Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (OuterStrideAtCompileTime!=0) @@ -993,7 +1075,8 @@ struct evaluator > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& block) : block_evaluator_type(block) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -1006,8 +1089,9 @@ struct block_evaluator XprType; - EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) - : unary_evaluator(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit block_evaluator(const XprType& block) + : unary_evaluator(block) {} }; @@ -1017,79 +1101,74 @@ struct unary_evaluator, IndexBa { typedef Block XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) - : m_argImpl(block.nestedExpression()), - m_startRow(block.startRow()), + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& block) + : m_argImpl(block.nestedExpression()), + m_startRow(block.startRow()), m_startCol(block.startCol()), - m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0) + m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0) { } - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { RowsAtCompileTime = XprType::RowsAtCompileTime, - ForwardLinearAccess = InnerPanel && bool(evaluator::Flags&LinearAccessBit) + ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator::Flags&LinearAccessBit) }; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const - { - return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); + { + return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - if (ForwardLinearAccess) - return m_argImpl.coeff(m_linear_offset.value() + index); - else - return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + { + return linear_coeff_impl(index, bool_constant()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) - { - return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); + { + return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - if (ForwardLinearAccess) - return m_argImpl.coeffRef(m_linear_offset.value() + index); - else - return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); - } - - template - EIGEN_STRONG_INLINE - PacketType packet(Index row, Index col) const - { - return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); + { + return linear_coeffRef_impl(index, bool_constant()); } template EIGEN_STRONG_INLINE - PacketType packet(Index index) const - { + PacketType packet(Index row, Index col) const + { + return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); + } + + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { if (ForwardLinearAccess) return m_argImpl.template packet(m_linear_offset.value() + index); else return packet(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - + template EIGEN_STRONG_INLINE - void writePacket(Index row, Index col, const PacketType& x) + void writePacket(Index row, Index col, const PacketType& x) { - return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); + return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); } - + template EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketType& x) + void writePacket(Index index, const PacketType& x) { if (ForwardLinearAccess) return m_argImpl.template writePacket(m_linear_offset.value() + index, x); @@ -1098,18 +1177,40 @@ struct unary_evaluator, IndexBa RowsAtCompileTime == 1 ? index : 0, x); } - + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const + { + return m_argImpl.coeff(m_linear_offset.value() + index); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const + { + return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */) + { + return m_argImpl.coeffRef(m_linear_offset.value() + index); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */) + { + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + } + evaluator m_argImpl; const variable_if_dynamic m_startRow; const variable_if_dynamic m_startCol; - const variable_if_dynamic m_linear_offset; + const variable_if_dynamic m_linear_offset; }; -// TODO: This evaluator does not actually use the child evaluator; +// TODO: This evaluator does not actually use the child evaluator; // all action is via the data() as returned by the Block expression. -template +template struct block_evaluator : mapbase_evaluator, typename Block::PlainObject> @@ -1117,8 +1218,9 @@ struct block_evaluator XprType; typedef typename XprType::Scalar Scalar; - EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) - : mapbase_evaluator(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit block_evaluator(const XprType& block) + : mapbase_evaluator(block) { // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); @@ -1141,18 +1243,19 @@ struct evaluator > evaluator::CoeffReadCost), Flags = (unsigned int)evaluator::Flags & evaluator::Flags & HereditaryBits, - + Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& select) : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - + typedef typename XprType::CoeffReturnType CoeffReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1172,7 +1275,7 @@ struct evaluator > else return m_elseImpl.coeff(index); } - + protected: evaluator m_conditionImpl; evaluator m_thenImpl; @@ -1182,7 +1285,7 @@ protected: // -------------------- Replicate -------------------- -template +template struct unary_evaluator > : evaluator_base > { @@ -1193,22 +1296,23 @@ struct unary_evaluator > }; typedef typename internal::nested_eval::type ArgTypeNested; typedef typename internal::remove_all::type ArgTypeNestedCleaned; - + enum { CoeffReadCost = evaluator::CoeffReadCost, LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0, Flags = (evaluator::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits::Flags & RowMajorBit), - + Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& replicate) : m_arg(replicate.nestedExpression()), m_argImpl(m_arg), m_rows(replicate.nestedExpression().rows()), m_cols(replicate.nestedExpression().cols()) {} - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { @@ -1219,10 +1323,10 @@ struct unary_evaluator > const Index actual_col = internal::traits::ColsAtCompileTime==1 ? 0 : ColFactor==1 ? col : col % m_cols.value(); - + return m_argImpl.coeff(actual_row, actual_col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -1230,7 +1334,7 @@ struct unary_evaluator > const Index actual_index = internal::traits::RowsAtCompileTime==1 ? (ColFactor==1 ? index : index%m_cols.value()) : (RowFactor==1 ? index : index%m_rows.value()); - + return m_argImpl.coeff(actual_index); } @@ -1247,7 +1351,7 @@ struct unary_evaluator > return m_argImpl.template packet(actual_row, actual_col); } - + template EIGEN_STRONG_INLINE PacketType packet(Index index) const @@ -1258,7 +1362,7 @@ struct unary_evaluator > return m_argImpl.template packet(actual_index); } - + protected: const ArgTypeNested m_arg; evaluator m_argImpl; @@ -1266,64 +1370,6 @@ protected: const variable_if_dynamic m_cols; }; - -// -------------------- PartialReduxExpr -------------------- - -template< typename ArgType, typename MemberOp, int Direction> -struct evaluator > - : evaluator_base > -{ - typedef PartialReduxExpr XprType; - typedef typename internal::nested_eval::type ArgTypeNested; - typedef typename internal::remove_all::type ArgTypeNestedCleaned; - typedef typename ArgType::Scalar InputScalar; - typedef typename XprType::Scalar Scalar; - enum { - TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) - }; - typedef typename MemberOp::template Cost CostOpType; - enum { - CoeffReadCost = TraversalSize==Dynamic ? HugeCost - : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), - - Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, - - Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized - }; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) - : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) - { - EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value)); - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index i, Index j) const - { - if (Direction==Vertical) - return m_functor(m_arg.col(j)); - else - return m_functor(m_arg.row(i)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index index) const - { - if (Direction==Vertical) - return m_functor(m_arg.col(index)); - else - return m_functor(m_arg.row(index)); - } - -protected: - typename internal::add_const_on_value_type::type m_arg; - const MemberOp m_functor; -}; - - // -------------------- MatrixWrapper and ArrayWrapper -------------------- // // evaluator_wrapper_base is a common base class for the @@ -1340,7 +1386,8 @@ struct evaluator_wrapper_base Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; @@ -1407,7 +1454,8 @@ struct unary_evaluator > { typedef MatrixWrapper XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1418,7 +1466,8 @@ struct unary_evaluator > { typedef ArrayWrapper XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1445,9 +1494,9 @@ struct unary_evaluator > ReversePacket = (Direction == BothDirections) || ((Direction == Vertical) && IsColMajor) || ((Direction == Horizontal) && IsRowMajor), - + CoeffReadCost = evaluator::CoeffReadCost, - + // let's enable LinearAccess only with vectorization because of the product overhead // FIXME enable DirectAccess with negative strides? Flags0 = evaluator::Flags, @@ -1456,16 +1505,17 @@ struct unary_evaluator > ? LinearAccessBit : 0, Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess), - + Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f. }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& reverse) : m_argImpl(reverse.nestedExpression()), m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1), m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { @@ -1540,7 +1590,7 @@ struct unary_evaluator > m_argImpl.template writePacket (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x)); } - + protected: evaluator m_argImpl; @@ -1558,20 +1608,21 @@ struct evaluator > : evaluator_base > { typedef Diagonal XprType; - + enum { CoeffReadCost = evaluator::CoeffReadCost, - + Flags = (unsigned int)(evaluator::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit, - + Alignment = 0 }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& diagonal) : m_argImpl(diagonal.nestedExpression()), m_index(diagonal.index()) { } - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -1604,8 +1655,10 @@ protected: const internal::variable_if_dynamicindex m_index; private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; } }; @@ -1629,25 +1682,25 @@ class EvalToTemp : public dense_xpr_base >::type { public: - + typedef typename dense_xpr_base::type Base; EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp) - + explicit EvalToTemp(const ArgType& arg) : m_arg(arg) { } - + const ArgType& arg() const { return m_arg; } - Index rows() const + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_arg.rows(); } - Index cols() const + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_arg.cols(); } @@ -1655,7 +1708,7 @@ class EvalToTemp private: const ArgType& m_arg; }; - + template struct evaluator > : public evaluator @@ -1663,7 +1716,7 @@ struct evaluator > typedef EvalToTemp XprType; typedef typename ArgType::PlainObject PlainObject; typedef evaluator Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.arg()) { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CoreIterators.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CoreIterators.h index 4eb42b93af..b967196813 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CoreIterators.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CoreIterators.h @@ -48,6 +48,11 @@ public: * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView */ EIGEN_STRONG_INLINE InnerIterator& operator++() { m_iter.operator++(); return *this; } + EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; } + EIGEN_STRONG_INLINE InnerIterator operator+(Index i) + { InnerIterator result(*this); result+=i; return result; } + + /// \returns the column or row index of the current coefficient. EIGEN_STRONG_INLINE Index index() const { return m_iter.index(); } /// \returns the row index of the current coefficient. diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CwiseBinaryOp.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CwiseBinaryOp.h index a36765e396..2202b1cc6b 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CwiseBinaryOp.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CwiseBinaryOp.h @@ -74,7 +74,7 @@ class CwiseBinaryOpImpl; * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp */ template -class CwiseBinaryOp : +class CwiseBinaryOp : public CwiseBinaryOpImpl< BinaryOp, LhsType, RhsType, typename internal::cwise_promote_storage_type::StorageKind, @@ -83,7 +83,7 @@ class CwiseBinaryOp : internal::no_assignment_operator { public: - + typedef typename internal::remove_all::type Functor; typedef typename internal::remove_all::type Lhs; typedef typename internal::remove_all::type Rhs; @@ -100,8 +100,14 @@ class CwiseBinaryOp : typedef typename internal::remove_reference::type _LhsNested; typedef typename internal::remove_reference::type _RhsNested; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) +#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 + //Required for Visual Studio or the Copy constructor will probably not get inlined! + EIGEN_STRONG_INLINE + CwiseBinaryOp(const CwiseBinaryOp&) = default; +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) { EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar); @@ -110,31 +116,25 @@ class CwiseBinaryOp : eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { // return the fixed size type if available to enable compile time optimizations - if (internal::traits::type>::RowsAtCompileTime==Dynamic) - return m_rhs.rows(); - else - return m_lhs.rows(); + return internal::traits::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { // return the fixed size type if available to enable compile time optimizations - if (internal::traits::type>::ColsAtCompileTime==Dynamic) - return m_rhs.cols(); - else - return m_lhs.cols(); + return internal::traits::type>::ColsAtCompileTime==Dynamic ? m_rhs.cols() : m_lhs.cols(); } /** \returns the left hand side nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; } /** \returns the right hand side nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; } /** \returns the functor representing the binary operation */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& functor() const { return m_functor; } protected: @@ -158,7 +158,7 @@ public: */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator-=(const MatrixBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -171,7 +171,7 @@ MatrixBase::operator-=(const MatrixBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator+=(const MatrixBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -181,4 +181,3 @@ MatrixBase::operator+=(const MatrixBase& other) } // end namespace Eigen #endif // EIGEN_CWISE_BINARY_OP_H - diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CwiseNullaryOp.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CwiseNullaryOp.h index ddd607e383..289ec510a8 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CwiseNullaryOp.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/CwiseNullaryOp.h @@ -74,10 +74,10 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -126,12 +131,17 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f * * Here is an example with C++11 random generators: \include random_cpp11.cpp * Output: \verbinclude random_cpp11.out - * + * * \sa class CwiseNullaryOp */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -150,7 +160,12 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(const CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -170,7 +185,7 @@ DenseBase::NullaryExpr(const CustomNullaryOp& func) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index rows, Index cols, const Scalar& value) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); @@ -217,27 +232,32 @@ DenseBase::Constant(const Scalar& value) /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&) * - * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) + * \only_for_vectors + * + * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp + * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out + * + * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&) * - * \sa LinSpaced(Scalar,Scalar) + * \sa LinSpaced(const Scalar&, const Scalar&) */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** @@ -268,7 +288,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomA DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** @@ -281,7 +301,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ @@ -363,6 +383,33 @@ PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) return setConstant(val); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to the given value \a val. For the parameter + * of type NoChange_t, just pass the special value \c NoChange. + * + * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setConstant(NoChange_t, Index cols, const Scalar& val) +{ + return setConstant(rows(), cols, val); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to the given value \a val. For the parameter + * of type NoChange_t, just pass the special value \c NoChange. + * + * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setConstant(Index rows, NoChange_t, const Scalar& val) +{ + return setConstant(rows, cols(), val); +} + + /** * \brief Sets a linearly spaced vector. * @@ -383,7 +430,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); + return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); } /** @@ -536,6 +583,32 @@ PlainObjectBase::setZero(Index rows, Index cols) return setConstant(Scalar(0)); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to zero. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Zero() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setZero(NoChange_t, Index cols) +{ + return setZero(rows(), cols); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to zero. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Zero() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setZero(Index rows, NoChange_t) +{ + return setZero(rows, cols()); +} + // ones: /** \returns an expression of a matrix where all coefficients equal one. @@ -662,6 +735,32 @@ PlainObjectBase::setOnes(Index rows, Index cols) return setConstant(Scalar(1)); } +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to one. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp, MatrixBase::Ones() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setOnes(Index rows, NoChange_t) +{ + return setOnes(rows, cols()); +} + +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to one. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t) class CwiseNullaryOp, MatrixBase::Ones() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setOnes(NoChange_t, Index cols) +{ + return setOnes(rows(), cols); +} + // Identity: /** \returns an expression of the identity matrix (not necessarily square). @@ -861,6 +960,42 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } +/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector + * + * \param i index of the unique coefficient to be set to 1 + * + * \only_for_vectors + * + * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index newSize, Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i @@ -24,7 +24,7 @@ struct traits > typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; enum { - Flags = _XprTypeNested::Flags & RowMajorBit + Flags = _XprTypeNested::Flags & RowMajorBit }; }; } @@ -65,10 +65,10 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::non_const_type MatrixTypeNested; typedef typename internal::remove_all::type NestedExpression; - explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) + explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) : m_matrix(mat), m_functor(func) {} EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView) - EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); } - EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \returns the functor representing unary operation */ - const ViewOp& functor() const { return m_functor; } + EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; } /** \returns the nested expression */ - const typename internal::remove_all::type& + EIGEN_DEVICE_FUNC const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - typename internal::remove_reference::type& - nestedExpression() { return m_matrix.const_cast_derived(); } + EIGEN_DEVICE_FUNC typename internal::remove_reference::type& + nestedExpression() { return m_matrix; } protected: MatrixTypeNested m_matrix; @@ -108,16 +110,16 @@ class CwiseUnaryViewImpl EIGEN_DENSE_PUBLIC_INTERFACE(Derived) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl) - + EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); } EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); } - EIGEN_DEVICE_FUNC inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return derived().nestedExpression().innerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } - EIGEN_DEVICE_FUNC inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return derived().nestedExpression().outerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseBase.h index c55a68230c..9b16db68d4 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseBase.h @@ -14,15 +14,15 @@ namespace Eigen { namespace internal { - + // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type. // This dummy function simply aims at checking that at compile time. static inline void check_DenseIndex_is_signed() { - EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); + EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE) } } // end namespace internal - + /** \class DenseBase * \ingroup Core_Module * @@ -64,12 +64,12 @@ template class DenseBase /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex, etc. */ typedef typename internal::traits::Scalar Scalar; - + /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex, etc. * * It is an alias for the Scalar type */ typedef Scalar value_type; - + typedef typename NumTraits::Real RealScalar; typedef DenseCoeffsBase::value> Base; @@ -150,13 +150,18 @@ template class DenseBase * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime */ - IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1, + IsVectorAtCompileTime = internal::traits::RowsAtCompileTime == 1 + || internal::traits::ColsAtCompileTime == 1, /**< This is set to true if either the number of rows or the number of * columns is known at compile-time to be equal to 1. Indeed, in that case, * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2, + /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, + * and 2 for matrices. + */ + Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". @@ -170,11 +175,11 @@ template class DenseBase InnerStrideAtCompileTime = internal::inner_stride_at_compile_time::ret, OuterStrideAtCompileTime = internal::outer_stride_at_compile_time::ret }; - + typedef typename internal::find_best_packet::type PacketScalar; enum { IsPlainObjectBase = 0 }; - + /** The plain matrix type corresponding to this expression. * \sa PlainObject */ typedef Matrix::Scalar, @@ -184,7 +189,7 @@ template class DenseBase internal::traits::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime > PlainMatrix; - + /** The plain array type corresponding to this expression. * \sa PlainObject */ typedef Array::Scalar, @@ -206,7 +211,7 @@ template class DenseBase /** \returns the number of nonzero coefficients which is in practice the number * of stored coefficients. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index nonZeros() const { return size(); } /** \returns the outer size. @@ -214,7 +219,7 @@ template class DenseBase * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a * column-major matrix, and the number of rows for a row-major matrix. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const { return IsVectorAtCompileTime ? 1 @@ -224,9 +229,9 @@ template class DenseBase /** \returns the inner size. * * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension - * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a + * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a * column-major matrix, and the number of columns for a row-major matrix. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const { return IsVectorAtCompileTime ? this->size() @@ -261,9 +266,9 @@ template class DenseBase /** \internal Represents a matrix with all coefficients equal to one another*/ typedef CwiseNullaryOp,PlainObject> ConstantReturnType; /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */ - typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; + EIGEN_DEPRECATED typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; /** \internal Represents a vector with linearly spaced coefficients that allows random access. */ - typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; /** \internal the return type of MatrixBase::eigenvalues() */ typedef Matrix::Scalar>::Real, internal::traits::ColsAtCompileTime, 1> EigenvaluesReturnType; @@ -297,17 +302,17 @@ template class DenseBase Derived& operator=(const ReturnByValue& func); /** \internal - * Copies \a other into *this without evaluating other. \returns a reference to *this. - * \deprecated */ + * Copies \a other into *this without evaluating other. \returns a reference to *this. */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC Derived& lazyAssign(const DenseBase& other); EIGEN_DEVICE_FUNC CommaInitializer operator<< (const Scalar& s); - /** \deprecated it now returns \c *this */ template + /** \deprecated it now returns \c *this */ EIGEN_DEPRECATED const Derived& flagged() const { return derived(); } @@ -332,12 +337,13 @@ template class DenseBase EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(const Scalar& value); - EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high); + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType + LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Index size, const Scalar& low, const Scalar& high); - EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType - LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(const Scalar& low, const Scalar& high); @@ -369,7 +375,7 @@ template class DenseBase template EIGEN_DEVICE_FUNC bool isApprox(const DenseBase& other, const RealScalar& prec = NumTraits::dummy_precision()) const; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC bool isMuchSmallerThan(const RealScalar& other, const RealScalar& prec = NumTraits::dummy_precision()) const; template EIGEN_DEVICE_FUNC @@ -380,7 +386,7 @@ template class DenseBase EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits::dummy_precision()) const; - + inline bool hasNaN() const; inline bool allFinite() const; @@ -394,8 +400,8 @@ template class DenseBase * * Notice that in the case of a plain matrix or vector (not an expression) this function just returns * a const reference, in order to avoid a useless copy. - * - * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. + * + * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvalReturnType eval() const @@ -405,12 +411,12 @@ template class DenseBase // size types on MSVC. return typename internal::eval::type(derived()); } - + /** swaps *this with the expression \a other. * */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(const DenseBase& other) { EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); @@ -422,7 +428,7 @@ template class DenseBase * */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(PlainObjectBase& other) { eigen_assert(rows()==other.rows() && cols()==other.cols()); @@ -443,18 +449,58 @@ template class DenseBase EIGEN_DEVICE_FUNC Scalar prod() const; + template EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff() const; + template EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff() const; - template EIGEN_DEVICE_FUNC + + // By default, the fastest version with undefined NaN propagation semantics is + // used. + // TODO(rmlarsen): Replace with default template argument when we move to + // c++11 or beyond. + EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar minCoeff() const { + return minCoeff(); + } + EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar maxCoeff() const { + return maxCoeff(); + } + + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff(IndexType* row, IndexType* col) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff(IndexType* row, IndexType* col) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff(IndexType* index) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff(IndexType* index) const; + // TODO(rmlarsen): Replace these methods with a default template argument. + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar minCoeff(IndexType* row, IndexType* col) const { + return minCoeff(row, col); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar maxCoeff(IndexType* row, IndexType* col) const { + return maxCoeff(row, col); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar minCoeff(IndexType* index) const { + return minCoeff(index); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar maxCoeff(IndexType* index) const { + return maxCoeff(index); + } + template EIGEN_DEVICE_FUNC Scalar redux(const BinaryOp& func) const; @@ -493,7 +539,7 @@ template class DenseBase typedef VectorwiseOp ColwiseReturnType; typedef const VectorwiseOp ConstColwiseReturnType; - /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations + /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions * * Example: \include MatrixBase_rowwise.cpp * Output: \verbinclude MatrixBase_rowwise.out @@ -506,7 +552,7 @@ template class DenseBase } EIGEN_DEVICE_FUNC RowwiseReturnType rowwise(); - /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations + /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions * * Example: \include MatrixBase_colwise.cpp * Output: \verbinclude MatrixBase_colwise.out @@ -524,16 +570,16 @@ template class DenseBase static const RandomReturnType Random(); template - const Select + inline EIGEN_DEVICE_FUNC const Select select(const DenseBase& thenMatrix, const DenseBase& elseMatrix) const; template - inline const Select + inline EIGEN_DEVICE_FUNC const Select select(const DenseBase& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const; template - inline const Select + inline EIGEN_DEVICE_FUNC const Select select(const typename ElseDerived::Scalar& thenScalar, const DenseBase& elseMatrix) const; template RealScalar lpNorm() const; @@ -567,16 +613,59 @@ template class DenseBase } EIGEN_DEVICE_FUNC void reverseInPlace(); + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** STL-like RandomAccessIterator + * iterator type as returned by the begin() and end() methods. + */ + typedef random_access_iterator_type iterator; + /** This is the const version of iterator (aka read-only) */ + typedef random_access_iterator_type const_iterator; + #else + typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, + internal::pointer_based_stl_iterator, + internal::generic_randaccess_stl_iterator + >::type iterator_type; + + typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, + internal::pointer_based_stl_iterator, + internal::generic_randaccess_stl_iterator + >::type const_iterator_type; + + // Stl-style iterators are supported only for vectors. + + typedef typename internal::conditional< IsVectorAtCompileTime, + iterator_type, + void + >::type iterator; + + typedef typename internal::conditional< IsVectorAtCompileTime, + const_iterator_type, + void + >::type const_iterator; + #endif + + inline iterator begin(); + inline const_iterator begin() const; + inline const_iterator cbegin() const; + inline iterator end(); + inline const_iterator end() const; + inline const_iterator cend() const; + #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) +#define EIGEN_DOC_UNARY_ADDONS(X,Y) +# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/BlockMethods.h" +# include "../plugins/IndexedViewMethods.h" +# include "../plugins/ReshapedMethods.h" # ifdef EIGEN_DENSEBASE_PLUGIN # include EIGEN_DENSEBASE_PLUGIN # endif #undef EIGEN_CURRENT_STORAGE_BASE_CLASS #undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL #undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF +#undef EIGEN_DOC_UNARY_ADDONS // disable the use of evalTo for dense objects with a nice compilation error template diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseCoeffsBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseCoeffsBase.h index c4af48ab69..37fcdb5911 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseCoeffsBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseCoeffsBase.h @@ -22,11 +22,12 @@ template struct add_const_on_value_type_if_arithmetic /** \brief Base class providing read-only coefficient access to matrices and arrays. * \ingroup Core_Module * \tparam Derived Type of the derived class - * \tparam #ReadOnlyAccessors Constant indicating read-only access + * + * \note #ReadOnlyAccessors Constant indicating read-only access * * This class defines the \c operator() \c const function and friends, which can be used to read specific * entries of a matrix or array. - * + * * \sa DenseCoeffsBase, DenseCoeffsBase, * \ref TopicClassHierarchy */ @@ -288,12 +289,13 @@ class DenseCoeffsBase : public EigenBase /** \brief Base class providing read/write coefficient access to matrices and arrays. * \ingroup Core_Module * \tparam Derived Type of the derived class - * \tparam #WriteAccessors Constant indicating read/write access + * + * \note #WriteAccessors Constant indicating read/write access * * This class defines the non-const \c operator() function and friends, which can be used to write specific * entries of a matrix or array. This class inherits DenseCoeffsBase which * defines the const variant for reading specific entries. - * + * * \sa DenseCoeffsBase, \ref TopicClassHierarchy */ template @@ -466,7 +468,8 @@ class DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read-only using @@ -492,7 +495,7 @@ class DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read/write using @@ -566,8 +570,8 @@ class DenseCoeffsBase * * \sa outerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); } @@ -577,14 +581,14 @@ class DenseCoeffsBase * * \sa innerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); } // FIXME shall we remove it ? - inline Index stride() const + EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); } @@ -593,8 +597,8 @@ class DenseCoeffsBase * * \sa innerStride(), outerStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index rowStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rowStride() const EIGEN_NOEXCEPT { return Derived::IsRowMajor ? outerStride() : innerStride(); } @@ -603,8 +607,8 @@ class DenseCoeffsBase * * \sa innerStride(), outerStride(), rowStride() */ - EIGEN_DEVICE_FUNC - inline Index colStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index colStride() const EIGEN_NOEXCEPT { return Derived::IsRowMajor ? innerStride() : outerStride(); } @@ -615,7 +619,7 @@ namespace internal { template struct first_aligned_impl { - static inline Index run(const Derived&) + static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT { return 0; } }; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseStorage.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseStorage.h index 7d6d4e66d4..08ef6c5306 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseStorage.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DenseStorage.h @@ -47,21 +47,21 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array() - { + { check_static_allocation_size(); } EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) - { + { check_static_allocation_size(); } }; #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) -#elif EIGEN_GNUC_AT_LEAST(4,7) - // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned. +#elif EIGEN_GNUC_AT_LEAST(4,7) + // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned. // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: template @@ -85,15 +85,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() + plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -104,15 +104,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() - { + plain_array() + { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -123,15 +123,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() + plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -142,15 +142,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() - { + plain_array() + { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -163,6 +163,30 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; +struct plain_array_helper { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void copy(const plain_array& src, const Eigen::Index size, + plain_array& dst) { + smart_copy(src.array, src.array + size, dst.array); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void swap(plain_array& a, const Eigen::Index a_size, + plain_array& b, const Eigen::Index b_size) { + if (a_size < b_size) { + std::swap_ranges(b.array, b.array + a_size, a.array); + smart_move(b.array + a_size, b.array + b_size, a.array + a_size); + } else if (a_size > b_size) { + std::swap_ranges(a.array, a.array + b_size, b.array); + smart_move(a.array + b_size, a.array + a_size, b.array + b_size); + } else { + std::swap_ranges(a.array, a.array + a_size, b.array); + } + } +}; + } // end namespace internal /** \internal @@ -190,16 +214,41 @@ template class DenseSt EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} - EIGEN_DEVICE_FUNC +#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } - EIGEN_DEVICE_FUNC +#else + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default; +#endif +#if !EIGEN_HAS_CXX11 + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) - { + { if (this != &other) m_data = other.m_data; - return *this; + return *this; } +#else + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default; +#endif +#if EIGEN_HAS_RVALUE_REFERENCES +#if !EIGEN_HAS_CXX11 + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT + : m_data(std::move(other.m_data)) + { + } + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT + { + if (this != &other) + m_data = std::move(other.m_data); + return *this; + } +#else + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default; + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default; +#endif +#endif EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); @@ -207,9 +256,11 @@ template class DenseSt EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); } - EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} - EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data, other.m_data); + } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {} EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } @@ -226,8 +277,8 @@ template class DenseStorage class DenseStorage class DenseStorage class DenseStorage class DenseStorage(m_data, m_rows*m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) - { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } - EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} - EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + numext::swap(m_cols,other.m_cols); + } + EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} + EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} void conservativeResize(Index size, Index rows, Index cols) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*m_cols); @@ -446,7 +524,7 @@ template class DenseStorageswap(tmp); } return *this; - } + } #if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT @@ -459,16 +537,18 @@ template class DenseStorage(m_data, _Rows*m_cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } - EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} - EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_cols,other.m_cols); + } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} + EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, _Rows*m_cols); @@ -520,7 +600,7 @@ template class DenseStorageswap(tmp); } return *this; - } + } #if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT @@ -533,16 +613,18 @@ template class DenseStorage(m_data, _Cols*m_rows); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } - EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} - EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + } + EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return _Cols;} void conservativeResize(Index size, Index rows, Index) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*_Cols); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Diagonal.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Diagonal.h index afcaf35756..3112d2c16a 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Diagonal.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Diagonal.h @@ -11,7 +11,7 @@ #ifndef EIGEN_DIAGONAL_H #define EIGEN_DIAGONAL_H -namespace Eigen { +namespace Eigen { /** \class Diagonal * \ingroup Core_Module @@ -84,20 +84,16 @@ template class Diagonal : numext::mini(m_matrix.rows(),m_matrix.cols()-m_index.value()); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return 1; } - EIGEN_DEVICE_FUNC - inline Index innerStride() const - { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride() + 1; } - EIGEN_DEVICE_FUNC - inline Index outerStride() const - { - return 0; - } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return 0; } typedef typename internal::conditional< internal::is_lvalue::value, @@ -149,8 +145,8 @@ template class Diagonal } EIGEN_DEVICE_FUNC - inline const typename internal::remove_all::type& - nestedExpression() const + inline const typename internal::remove_all::type& + nestedExpression() const { return m_matrix; } @@ -167,12 +163,12 @@ template class Diagonal private: // some compilers may fail to optimize std::max etc in case of compile-time constants... - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index absDiagIndex() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 0 : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : 0; } // trigger a compile-time error if someone try to call packet template typename MatrixType::PacketReturnType packet(Index) const; template typename MatrixType::PacketReturnType packet(Index,Index) const; @@ -187,7 +183,7 @@ template class Diagonal * * \sa class Diagonal */ template -inline typename MatrixBase::DiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalReturnType MatrixBase::diagonal() { return DiagonalReturnType(derived()); @@ -195,7 +191,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template -inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -213,7 +209,7 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) { return DiagonalDynamicIndexReturnType(derived(), index); @@ -221,7 +217,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). */ template -inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); @@ -240,6 +236,7 @@ MatrixBase::diagonal(Index index) const * \sa MatrixBase::diagonal(), class Diagonal */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template DiagonalIndexReturnType::Type MatrixBase::diagonal() { @@ -249,6 +246,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type MatrixBase::diagonal() const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DiagonalMatrix.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DiagonalMatrix.h index ecfdce8efa..542685c659 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DiagonalMatrix.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DiagonalMatrix.h @@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } - + EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } EIGEN_DEVICE_FUNC @@ -83,6 +83,30 @@ class DiagonalBase : public EigenBase { return DiagonalWrapper(scalar * other.diagonal()); } + + template + EIGEN_DEVICE_FUNC + #ifdef EIGEN_PARSED_BY_DOXYGEN + inline unspecified_expression_type + #else + inline const DiagonalWrapper + #endif + operator+(const DiagonalBase& other) const + { + return (diagonal() + other.diagonal()).asDiagonal(); + } + + template + EIGEN_DEVICE_FUNC + #ifdef EIGEN_PARSED_BY_DOXYGEN + inline unspecified_expression_type + #else + inline const DiagonalWrapper + #endif + operator-(const DiagonalBase& other) const + { + return (diagonal() - other.diagonal()).asDiagonal(); + } }; #endif @@ -154,6 +178,30 @@ class DiagonalMatrix EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {} + #if EIGEN_HAS_CXX11 + /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11 + * + * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients. + * + * \warning To construct a diagonal matrix of fixed size, the number of values passed to this + * constructor must match the fixed dimension of \c *this. + * + * \sa DiagonalMatrix(const Scalar&, const Scalar&) + * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args) + : m_diagonal(a0, a1, a2, args...) {} + + /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list>& list) + : m_diagonal(list) {} + #endif // EIGEN_HAS_CXX11 + /** Copy constructor. */ template EIGEN_DEVICE_FUNC @@ -273,7 +321,7 @@ class DiagonalWrapper * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() **/ template -inline const DiagonalWrapper +EIGEN_DEVICE_FUNC inline const DiagonalWrapper MatrixBase::asDiagonal() const { return DiagonalWrapper(derived()); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DiagonalProduct.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DiagonalProduct.h index d372b938f6..7911d1cd17 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DiagonalProduct.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/DiagonalProduct.h @@ -17,7 +17,7 @@ namespace Eigen { */ template template -inline const Product +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const DiagonalBase &a_diagonal) const { return Product(derived(),a_diagonal.derived()); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Dot.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Dot.h index 1fe7a84a48..5c3441b926 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Dot.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Dot.h @@ -86,14 +86,14 @@ MatrixBase::dot(const MatrixBase& other) const //---------- implementation of L2 norm and related functions ---------- -/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm. +/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm. * In both cases, it consists in the sum of the square of all the matrix entries. * For vectors, this is also equals to the dot product of \c *this with itself. * * \sa dot(), norm(), lpNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const { return numext::real((*this).cwiseAbs2().sum()); } @@ -105,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -120,7 +120,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa norm(), normalize() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -142,7 +142,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -EIGEN_STRONG_INLINE void MatrixBase::normalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -163,7 +163,7 @@ EIGEN_STRONG_INLINE void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -188,7 +188,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); @@ -207,7 +207,7 @@ struct lpNorm_selector EIGEN_DEVICE_FUNC static inline RealScalar run(const MatrixBase& m) { - EIGEN_USING_STD_MATH(pow) + EIGEN_USING_STD(pow) return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p); } }; @@ -260,9 +260,9 @@ struct lpNorm_selector template template #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NumTraits::Scalar>::Real +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real #else -MatrixBase::RealScalar +EIGEN_DEVICE_FUNC MatrixBase::RealScalar #endif MatrixBase::lpNorm() const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/EigenBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/EigenBase.h index b195506a91..6b3c7d3745 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/EigenBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/EigenBase.h @@ -15,7 +15,7 @@ namespace Eigen { /** \class EigenBase * \ingroup Core_Module - * + * * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T). * * In other words, an EigenBase object is an object that can be copied into a MatrixBase. @@ -29,11 +29,12 @@ namespace Eigen { template struct EigenBase { // typedef typename internal::plain_matrix_type::type PlainObject; - + /** \brief The interface type of indices * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. - * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead. * \sa StorageIndex, \ref TopicPreprocessorDirectives. + * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead. + * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute. */ typedef Eigen::Index Index; @@ -55,15 +56,15 @@ template struct EigenBase { return *static_cast(this); } /** \returns the number of rows. \sa cols(), RowsAtCompileTime */ - EIGEN_DEVICE_FUNC - inline Index rows() const { return derived().rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/ - EIGEN_DEVICE_FUNC - inline Index cols() const { return derived().cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); } /** \returns the number of coefficients, which is rows()*cols(). * \sa rows(), cols(), SizeAtCompileTime. */ - EIGEN_DEVICE_FUNC - inline Index size() const { return rows() * cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); } /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */ template diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ForceAlignedAccess.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ForceAlignedAccess.h index 7b08b45e67..817a43afce 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ForceAlignedAccess.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ForceAlignedAccess.h @@ -41,10 +41,14 @@ template class ForceAlignedAccess EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Fuzzy.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Fuzzy.h index 3e403a09d9..43aa49b2bc 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Fuzzy.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Fuzzy.h @@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector */ template template -bool DenseBase::isApprox( +EIGEN_DEVICE_FUNC bool DenseBase::isApprox( const DenseBase& other, const RealScalar& prec ) const @@ -122,7 +122,7 @@ bool DenseBase::isApprox( * \sa isApprox(), isMuchSmallerThan(const DenseBase&, RealScalar) const */ template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const typename NumTraits::Real& other, const RealScalar& prec ) const @@ -142,7 +142,7 @@ bool DenseBase::isMuchSmallerThan( */ template template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const DenseBase& other, const RealScalar& prec ) const diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GeneralProduct.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GeneralProduct.h index 6f0cc80e94..6906aa75d1 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GeneralProduct.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GeneralProduct.h @@ -18,6 +18,16 @@ enum { Small = 3 }; +// Define the threshold value to fallback from the generic matrix-matrix product +// implementation (heavy) to the lightweight coeff-based product one. +// See generic_product_impl +// in products/GeneralMatrixMatrix.h for more details. +// TODO This threshold should also be used in the compile-time selector below. +#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD +// This default value has been obtained on a Haswell architecture. +#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20 +#endif + namespace internal { template struct product_type_selector; @@ -25,7 +35,7 @@ template struct product_type_selector; template struct product_size_category { enum { - #ifndef EIGEN_CUDA_ARCH + #ifndef EIGEN_GPU_COMPILE_PHASE is_large = MaxSize == Dynamic || Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), @@ -153,13 +163,13 @@ template struct gemv_static_vect template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } }; template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; } }; template @@ -218,8 +228,7 @@ template<> struct gemv_dense_selector ActualLhsType actualLhs = LhsBlasTraits::extract(lhs); ActualRhsType actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs); // make sure Dest is a compile-time vector type (bug 1166) typedef typename conditional::type ActualDest; @@ -229,7 +238,7 @@ template<> struct gemv_dense_selector // on, the other hand it is good for the cache to pack the vector anyways... EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1), ComplexByReal = (NumTraits::IsComplex) && (!NumTraits::IsComplex), - MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal + MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0) }; typedef const_blas_data_mapper LhsMapper; @@ -310,13 +319,12 @@ template<> struct gemv_dense_selector typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs); enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 // on, the other hand it is good for the cache to pack the vector anyways... - DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 + DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0 }; gemv_static_vector_if static_rhs; @@ -386,7 +394,8 @@ template<> struct gemv_dense_selector */ template template -inline const Product +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const Product MatrixBase::operator*(const MatrixBase &other) const { // A note regarding the function declaration: In MSVC, this function will sometimes @@ -428,6 +437,7 @@ MatrixBase::operator*(const MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Product MatrixBase::lazyProduct(const MatrixBase &other) const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GenericPacketMath.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GenericPacketMath.h index e594437791..cf677a1905 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GenericPacketMath.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GenericPacketMath.h @@ -44,23 +44,29 @@ struct default_packet_traits enum { HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, HasSetLinear = 1, - HasBlend = 0, + HasBlend = 0, + // This flag is used to indicate whether packet comparison is supported. + // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true. + HasCmp = 0, HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, + HasExpm1 = 0, HasLog = 0, HasLog1p = 0, HasLog10 = 0, @@ -81,14 +87,18 @@ struct default_packet_traits HasPolygamma = 0, HasErf = 0, HasErfc = 0, + HasNdtri = 0, + HasBessel = 0, HasIGamma = 0, + HasIGammaDerA = 0, + HasGammaSampleDerAlpha = 0, HasIGammac = 0, HasBetaInc = 0, HasRound = 0, + HasRint = 0, HasFloor = 0, HasCeil = 0, - HasSign = 0 }; }; @@ -119,6 +129,22 @@ template struct packet_traits : default_packet_traits template struct packet_traits : packet_traits { }; +template struct unpacket_traits +{ + typedef T type; + typedef T half; + enum + { + size = 1, + alignment = 1, + vectorizable = false, + masked_load_available=false, + masked_store_available=false + }; +}; + +template struct unpacket_traits : unpacket_traits { }; + template struct type_casting_traits { enum { VectorizedCast = 0, @@ -127,6 +153,34 @@ template struct type_casting_traits { }; }; +/** \internal Wrapper to ensure that multiple packet types can map to the same + same underlying vector type. */ +template +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; + + +/** \internal A convenience utility for determining if the type is a scalar. + * This is used to enable some generic packet implementations. + */ +template +struct is_scalar { + typedef typename unpacket_traits::type Scalar; + enum { + value = internal::is_same::value + }; +}; /** \internal \returns static_cast(a) (coeff-wise) */ template @@ -139,75 +193,406 @@ EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& /*b*/) { return static_cast(a); } - template EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) { return static_cast(a); } +template +EIGEN_DEVICE_FUNC inline TgtPacket +pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/, + const SrcPacket& /*e*/, const SrcPacket& /*f*/, const SrcPacket& /*g*/, const SrcPacket& /*h*/) { + return static_cast(a); +} + +/** \internal \returns reinterpret_cast(a) */ +template +EIGEN_DEVICE_FUNC inline Target +preinterpret(const Packet& a); /* { return reinterpret_cast(a); } */ /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -padd(const Packet& a, - const Packet& b) { return a+b; } +padd(const Packet& a, const Packet& b) { return a+b; } +// Avoid compiler warning for boolean algebra. +template<> EIGEN_DEVICE_FUNC inline bool +padd(const bool& a, const bool& b) { return a || b; } /** \internal \returns a - b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -psub(const Packet& a, - const Packet& b) { return a-b; } +psub(const Packet& a, const Packet& b) { return a-b; } /** \internal \returns -a (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pnegate(const Packet& a) { return -a; } -/** \internal \returns conj(a) (coeff-wise) */ +template<> EIGEN_DEVICE_FUNC inline bool +pnegate(const bool& a) { return !a; } +/** \internal \returns conj(a) (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pconj(const Packet& a) { return numext::conj(a); } /** \internal \returns a * b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmul(const Packet& a, - const Packet& b) { return a*b; } +pmul(const Packet& a, const Packet& b) { return a*b; } +// Avoid compiler warning for boolean algebra. +template<> EIGEN_DEVICE_FUNC inline bool +pmul(const bool& a, const bool& b) { return a && b; } /** \internal \returns a / b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pdiv(const Packet& a, - const Packet& b) { return a/b; } +pdiv(const Packet& a, const Packet& b) { return a/b; } -/** \internal \returns the min of \a a and \a b (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pmin(const Packet& a, - const Packet& b) { return numext::mini(a, b); } +// In the generic case, memset to all one bits. +template +struct ptrue_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){ + Packet b; + memset(static_cast(&b), 0xff, sizeof(Packet)); + return b; + } +}; -/** \internal \returns the max of \a a and \a b (coeff-wise) */ +// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value). +// Although this is technically not a valid bitmask, the scalar path for pselect +// uses a comparison to zero, so this should still work in most cases. We don't +// have another option, since the scalar type requires initialization. +template +struct ptrue_impl::value && NumTraits::RequireInitialization>::type > { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){ + return T(1); + } +}; + +/** \internal \returns one bits. */ template EIGEN_DEVICE_FUNC inline Packet -pmax(const Packet& a, - const Packet& b) { return numext::maxi(a, b); } +ptrue(const Packet& a) { + return ptrue_impl::run(a); +} + +// In the general case, memset to zero. +template +struct pzero_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) { + Packet b; + memset(static_cast(&b), 0x00, sizeof(Packet)); + return b; + } +}; + +// For scalars, explicitly set to Scalar(0), since the underlying representation +// for zero may not consist of all-zero bits. +template +struct pzero_impl::value>::type> { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { + return T(0); + } +}; + +/** \internal \returns packet of zeros */ +template EIGEN_DEVICE_FUNC inline Packet +pzero(const Packet& a) { + return pzero_impl::run(a); +} + +/** \internal \returns a <= b as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); } + +/** \internal \returns a < b as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_lt(const Packet& a, const Packet& b) { return a EIGEN_DEVICE_FUNC inline Packet +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } + +/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? pzero(a) : ptrue(a); } + +template +struct bit_and { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a & b; + } +}; + +template +struct bit_or { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a | b; + } +}; + +template +struct bit_xor { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a ^ b; + } +}; + +template +struct bit_not { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { + return ~a; + } +}; + +// Use operators &, |, ^, ~. +template +struct operator_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not()(a); } +}; + +// Apply binary operations byte-by-byte +template +struct bytewise_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { + return binary(a, b, bit_and()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { + return binary(a, b, bit_or()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { + return binary(a, b, bit_xor()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { + return unary(a,bit_not()); + } + + private: + template + EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++); + } + return c; + } + + template + EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + const unsigned char* b_ptr = reinterpret_cast(&b); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++, *b_ptr++); + } + return c; + } +}; + +// In the general case, use byte-by-byte manipulation. +template +struct bitwise_helper : public bytewise_bitwise_helper {}; + +// For integers or non-trivial scalars, use binary operators. +template +struct bitwise_helper::value && (NumTraits::IsInteger || NumTraits::RequireInitialization)>::type + > : public operator_bitwise_helper {}; + +/** \internal \returns the bitwise and of \a a and \a b */ +template EIGEN_DEVICE_FUNC inline Packet +pand(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_and(a, b); +} + +/** \internal \returns the bitwise or of \a a and \a b */ +template EIGEN_DEVICE_FUNC inline Packet +por(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_or(a, b); +} + +/** \internal \returns the bitwise xor of \a a and \a b */ +template EIGEN_DEVICE_FUNC inline Packet +pxor(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_xor(a, b); +} + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { + return bitwise_helper::bitwise_not(a); +} + +/** \internal \returns the bitwise and of \a a and not \a b */ +template EIGEN_DEVICE_FUNC inline Packet +pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); } + +// In the general case, use bitwise select. +template +struct pselect_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); + } +}; + +// For scalars, use ternary select. +template +struct pselect_impl::value>::type > { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return numext::equal_strict(mask, Packet(0)) ? b : a; + } +}; + +/** \internal \returns \a or \b for each field in packet according to \mask */ +template EIGEN_DEVICE_FUNC inline Packet +pselect(const Packet& mask, const Packet& a, const Packet& b) { + return pselect_impl::run(mask, a, b); +} + +template<> EIGEN_DEVICE_FUNC inline bool pselect( + const bool& cond, const bool& a, const bool& b) { + return cond ? a : b; +} + +/** \internal \returns the min or of \a a and \a b (coeff-wise) + If either \a a or \a b are NaN, the result is implementation defined. */ +template +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + return op(a,b); + } +}; + +/** \internal \returns the min or max of \a a and \a b (coeff-wise) + If either \a a or \a b are NaN, NaN is returned. */ +template<> +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet not_nan_mask_b = pcmp_eq(b, b); + return pselect(not_nan_mask_a, + pselect(not_nan_mask_b, op(a, b), b), + a); + } +}; + +/** \internal \returns the min or max of \a a and \a b (coeff-wise) + If both \a a and \a b are NaN, NaN is returned. + Equivalent to std::fmin(a, b). */ +template<> +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet not_nan_mask_b = pcmp_eq(b, b); + return pselect(not_nan_mask_a, + pselect(not_nan_mask_b, op(a, b), a), + b); + } +}; + + +#ifndef SYCL_DEVICE_ONLY +#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func +#else +#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) \ +[](const Type& a, const Type& b) { \ + return Func(a, b);} +#endif + +/** \internal \returns the min of \a a and \a b (coeff-wise). + If \a a or \b b is NaN, the return value is implementation defined. */ +template EIGEN_DEVICE_FUNC inline Packet +pmin(const Packet& a, const Packet& b) { return numext::mini(a,b); } + +/** \internal \returns the min of \a a and \a b (coeff-wise). + NaNPropagation determines the NaN propagation semantics. */ +template +EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) { + return pminmax_impl::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin))); +} + +/** \internal \returns the max of \a a and \a b (coeff-wise) + If \a a or \b b is NaN, the return value is implementation defined. */ +template EIGEN_DEVICE_FUNC inline Packet +pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); } + +/** \internal \returns the max of \a a and \a b (coeff-wise). + NaNPropagation determines the NaN propagation semantics. */ +template +EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) { + return pminmax_impl::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet,(pmax))); +} /** \internal \returns the absolute value of \a a */ template EIGEN_DEVICE_FUNC inline Packet -pabs(const Packet& a) { using std::abs; return abs(a); } +pabs(const Packet& a) { return numext::abs(a); } +template<> EIGEN_DEVICE_FUNC inline unsigned int +pabs(const unsigned int& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline unsigned long +pabs(const unsigned long& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline unsigned long long +pabs(const unsigned long long& a) { return a; } + +/** \internal \returns the addsub value of \a a,b */ +template EIGEN_DEVICE_FUNC inline Packet +paddsub(const Packet& a, const Packet& b) { + return pselect(peven_mask(a), padd(a, b), psub(a, b)); + } /** \internal \returns the phase angle of \a a */ template EIGEN_DEVICE_FUNC inline Packet parg(const Packet& a) { using numext::arg; return arg(a); } -/** \internal \returns the bitwise and of \a a and \a b */ -template EIGEN_DEVICE_FUNC inline Packet -pand(const Packet& a, const Packet& b) { return a & b; } -/** \internal \returns the bitwise or of \a a and \a b */ -template EIGEN_DEVICE_FUNC inline Packet -por(const Packet& a, const Packet& b) { return a | b; } +/** \internal \returns \a a logically shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +parithmetic_shift_right(const int& a) { return a >> N; } +template EIGEN_DEVICE_FUNC inline long int +parithmetic_shift_right(const long int& a) { return a >> N; } -/** \internal \returns the bitwise xor of \a a and \a b */ -template EIGEN_DEVICE_FUNC inline Packet -pxor(const Packet& a, const Packet& b) { return a ^ b; } +/** \internal \returns \a a arithmetically shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +plogical_shift_right(const int& a) { return static_cast(static_cast(a) >> N); } +template EIGEN_DEVICE_FUNC inline long int +plogical_shift_right(const long int& a) { return static_cast(static_cast(a) >> N); } -/** \internal \returns the bitwise andnot of \a a and \a b */ +/** \internal \returns \a a shifted by N bits to the left */ +template EIGEN_DEVICE_FUNC inline int +plogical_shift_left(const int& a) { return a << N; } +template EIGEN_DEVICE_FUNC inline long int +plogical_shift_left(const long int& a) { return a << N; } + +/** \internal \returns the significant and exponent of the underlying floating point numbers + * See https://en.cppreference.com/w/cpp/numeric/math/frexp + */ +template +EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) { + int exp; + EIGEN_USING_STD(frexp); + Packet result = static_cast(frexp(a, &exp)); + exponent = static_cast(exp); + return result; +} + +/** \internal \returns a * 2^((int)exponent) + * See https://en.cppreference.com/w/cpp/numeric/math/ldexp + */ template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (!b); } +pldexp(const Packet &a, const Packet &exponent) { + EIGEN_USING_STD(ldexp) + return static_cast(ldexp(a, static_cast(exponent))); +} + +/** \internal \returns the min of \a a and \a b (coeff-wise) */ +template EIGEN_DEVICE_FUNC inline Packet +pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -217,10 +602,22 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet version of \a *from, (un-aligned masked load) + * There is no generic implementation. We only have implementations for specialized + * cases. Generic case should not be called. + */ +template EIGEN_DEVICE_FUNC inline +typename enable_if::masked_load_available, Packet>::type +ploadu(const typename unpacket_traits::type* from, typename unpacket_traits::mask_t umask); + /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients set from bits */ +template EIGEN_DEVICE_FUNC inline Packet +pset1frombits(BitsType a); + /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } @@ -237,7 +634,7 @@ ploaddup(const typename unpacket_traits::type* from) { return *from; } * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]} * Currently, this function is only used in matrix products. - * For packet-size smaller or equal to 4, this function is equivalent to pload1 + * For packet-size smaller or equal to 4, this function is equivalent to pload1 */ template EIGEN_DEVICE_FUNC inline Packet ploadquad(const typename unpacket_traits::type* from) @@ -281,6 +678,20 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0), + where x is the value of all 1-bits. */ +template EIGEN_DEVICE_FUNC inline Packet +peven_mask(const Packet& /*a*/) { + typedef typename unpacket_traits::type Scalar; + const size_t n = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n]; + for(size_t i = 0; i < n; ++i) { + memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar)); + } + return ploadu(elements); +} + + /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from) { (*to) = from; } @@ -289,6 +700,15 @@ template EIGEN_DEVICE_FUNC inline void pstore( template EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) { (*to) = from; } +/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask) + * There is no generic implementation. We only have implementations for specialized + * cases. Generic case should not be called. + */ +template +EIGEN_DEVICE_FUNC inline +typename enable_if::masked_store_available, void>::type +pstoreu(Scalar* to, const Packet& from, typename unpacket_traits::mask_t umask); + template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) { return ploadu(from); } @@ -298,8 +718,10 @@ template EIGEN_DEVICE_FUNC inline void pstoreu /** \internal tries to do cache prefetching of \a addr */ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef __CUDA_ARCH__ -#if defined(__LP64__) +#if defined(EIGEN_HIP_DEVICE_COMPILE) + // do nothing +#elif defined(EIGEN_CUDA_ARCH) +#if defined(__LP64__) || EIGEN_OS_WIN64 // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); #else @@ -311,39 +733,6 @@ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a #endif } -/** \internal \returns the first element of a packet */ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type pfirst(const Packet& a) -{ return a; } - -/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */ -template EIGEN_DEVICE_FUNC inline Packet -preduxp(const Packet* vecs) { return vecs[0]; } - -/** \internal \returns the sum of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) -{ return a; } - -/** \internal \returns the sum of the elements of \a a by block of 4 elements. - * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} - * For packet-size smaller or equal to 4, this boils down to a noop. - */ -template EIGEN_DEVICE_FUNC inline -typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type -predux_downto4(const Packet& a) -{ return a; } - -/** \internal \returns the product of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) -{ return a; } - -/** \internal \returns the min of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min(const Packet& a) -{ return a; } - -/** \internal \returns the max of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max(const Packet& a) -{ return a; } - /** \internal \returns the reversed elements of \a a*/ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } @@ -351,7 +740,7 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) { - return Packet(a.imag(),a.real()); + return Packet(numext::imag(a),numext::real(a)); } /************************** @@ -360,47 +749,51 @@ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet /** \internal \returns the sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psin(const Packet& a) { using std::sin; return sin(a); } +Packet psin(const Packet& a) { EIGEN_USING_STD(sin); return sin(a); } /** \internal \returns the cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcos(const Packet& a) { using std::cos; return cos(a); } +Packet pcos(const Packet& a) { EIGEN_USING_STD(cos); return cos(a); } /** \internal \returns the tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptan(const Packet& a) { using std::tan; return tan(a); } +Packet ptan(const Packet& a) { EIGEN_USING_STD(tan); return tan(a); } /** \internal \returns the arc sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pasin(const Packet& a) { using std::asin; return asin(a); } +Packet pasin(const Packet& a) { EIGEN_USING_STD(asin); return asin(a); } /** \internal \returns the arc cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pacos(const Packet& a) { using std::acos; return acos(a); } +Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); } /** \internal \returns the arc tangent of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet patan(const Packet& a) { using std::atan; return atan(a); } +Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); } /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psinh(const Packet& a) { using std::sinh; return sinh(a); } +Packet psinh(const Packet& a) { EIGEN_USING_STD(sinh); return sinh(a); } /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); } +Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); } /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); } +Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); } /** \internal \returns the exp of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pexp(const Packet& a) { using std::exp; return exp(a); } +Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); } + +/** \internal \returns the expm1 of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pexpm1(const Packet& a) { return numext::expm1(a); } /** \internal \returns the log of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog(const Packet& a) { using std::log; return log(a); } +Packet plog(const Packet& a) { EIGEN_USING_STD(log); return log(a); } /** \internal \returns the log1p of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -408,16 +801,24 @@ Packet plog1p(const Packet& a) { return numext::log1p(a); } /** \internal \returns the log10 of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog10(const Packet& a) { using std::log10; return log10(a); } +Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); } + +/** \internal \returns the log10 of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet plog2(const Packet& a) { + typedef typename internal::unpacket_traits::type Scalar; + return pmul(pset1(Scalar(EIGEN_LOG2E)), plog(a)); +} /** \internal \returns the square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); } +Packet psqrt(const Packet& a) { return numext::sqrt(a); } /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet prsqrt(const Packet& a) { - return pdiv(pset1(1), psqrt(a)); + typedef typename internal::unpacket_traits::type Scalar; + return pdiv(pset1(Scalar(1)), psqrt(a)); } /** \internal \returns the rounded value of \a a (coeff-wise) */ @@ -428,15 +829,121 @@ Packet pround(const Packet& a) { using numext::round; return round(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pfloor(const Packet& a) { using numext::floor; return floor(a); } +/** \internal \returns the rounded value of \a a (coeff-wise) with current + * rounding mode */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet print(const Packet& a) { using numext::rint; return rint(a); } + /** \internal \returns the ceil of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } +/** \internal \returns the first element of a packet */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +pfirst(const Packet& a) +{ return a; } + +/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4. + * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} + * For packet-size smaller or equal to 4, this boils down to a noop. + */ +template +EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type +predux_half_dowto4(const Packet& a) +{ return a; } + +// Slow generic implementation of Packet reduction. +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_helper(const Packet& a, Op op) { + typedef typename unpacket_traits::type Scalar; + const size_t n = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n]; + pstoreu(elements, a); + for(size_t k = n / 2; k > 0; k /= 2) { + for(size_t i = 0; i < k; ++i) { + elements[i] = op(elements[i], elements[i + k]); + } + } + return elements[0]; +} + +/** \internal \returns the sum of the elements of \a a*/ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux(const Packet& a) +{ + return a; +} + +/** \internal \returns the product of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul))); +} + +/** \internal \returns the min of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( + const Packet &a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); +} + +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); +} + +/** \internal \returns the min of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( + const Packet &a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); +} + +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); +} + +#undef EIGEN_BINARY_OP_NAN_PROPAGATION + +/** \internal \returns true if all coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +// not needed yet +// template EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) +// { return bool(a); } + +/** \internal \returns true if any coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) +{ + // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. + // It is expected that "true" is either: + // - Scalar(1) + // - bits full of ones (NaN for floats), + // - or first bit equals to 1 (1 for ints, smallest denormal for floats). + // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. + typedef typename unpacket_traits::type Scalar; + return numext::not_equal_strict(predux(a), Scalar(0)); +} + /*************************************************************************** * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ -/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ +/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type) template inline void pstore1(typename unpacket_traits::type* to, const typename unpacket_traits::type& a) @@ -484,41 +991,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_t return ploadt(from); } -/** \internal default implementation of palign() allowing partial specialization */ -template -struct palign_impl -{ - // by default data are aligned, so there is nothing to be done :) - static inline void run(PacketType&, const PacketType&) {} -}; - -/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements - * of \a first and \a Offset first elements of \a second. - * - * This function is currently only used to optimize matrix-vector products on unligned matrices. - * It takes 2 packets that represent a contiguous memory array, and returns a packet starting - * at the position \a Offset. For instance, for packets of 4 elements, we have: - * Input: - * - first = {f0,f1,f2,f3} - * - second = {s0,s1,s2,s3} - * Output: - * - if Offset==0 then {f0,f1,f2,f3} - * - if Offset==1 then {f1,f2,f3,s0} - * - if Offset==2 then {f2,f3,s0,s1} - * - if Offset==3 then {f3,s0,s1,s3} - */ -template -inline void palign(PacketType& first, const PacketType& second) -{ - palign_impl::run(first,second); -} - /*************************************************************************** * Fast complex products (GCC generates a function call which is very slow) ***************************************************************************/ // Eigen+CUDA does not support complexes. -#ifndef __CUDACC__ +#if !defined(EIGEN_GPUCC) template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } @@ -555,34 +1033,6 @@ pblend(const Selector::size>& ifPacket, const Packet& th return ifPacket.select[0] ? thenPacket : elsePacket; } -/** \internal \returns \a a with the first coefficient replaced by the scalar b */ -template EIGEN_DEVICE_FUNC inline Packet -pinsertfirst(const Packet& a, typename unpacket_traits::type b) -{ - // Default implementation based on pblend. - // It must be specialized for higher performance. - Selector::size> mask; - mask.select[0] = true; - // This for loop should be optimized away by the compiler. - for(Index i=1; i::size; ++i) - mask.select[i] = false; - return pblend(mask, pset1(b), a); -} - -/** \internal \returns \a a with the last coefficient replaced by the scalar b */ -template EIGEN_DEVICE_FUNC inline Packet -pinsertlast(const Packet& a, typename unpacket_traits::type b) -{ - // Default implementation based on pblend. - // It must be specialized for higher performance. - Selector::size> mask; - // This for loop should be optimized away by the compiler. - for(Index i=0; i::size-1; ++i) - mask.select[i] = false; - mask.select[unpacket_traits::size-1] = true; - return pblend(mask, pset1(b), a); -} - } // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GlobalFunctions.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GlobalFunctions.h index 769dc255c2..629af94b99 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GlobalFunctions.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/GlobalFunctions.h @@ -66,21 +66,31 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) +#if EIGEN_HAS_CXX11_MATH + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh) +#endif + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2,scalar_log2_op,base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil) @@ -88,7 +98,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign) - + /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent. * * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar). @@ -102,17 +112,18 @@ namespace Eigen inline const CwiseBinaryOp,Derived,Constant > pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent), - const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type - pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) { - return x.derived().pow(exponent); - } - - template - inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow) - pow(const Eigen::ArrayBase& x, const typename Derived::Scalar& exponent) { - return x.derived().pow(exponent); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,pow)) + pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) + { + typedef typename internal::promote_scalar_arg::type PromotedExponent; + return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(), + typename internal::plain_constant_type::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op(exponent))); } #endif @@ -122,21 +133,21 @@ namespace Eigen * * Example: \include Cwise_array_power_array.cpp * Output: \verbinclude Cwise_array_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase */ template inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> - pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) + pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) { return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( x.derived(), exponents.derived() ); } - + /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents. * * This function computes the coefficient-wise power between a scalar and an array of exponents. @@ -145,7 +156,7 @@ namespace Eigen * * Example: \include Cwise_scalar_power_array.cpp * Output: \verbinclude Cwise_scalar_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase @@ -155,21 +166,17 @@ namespace Eigen inline const CwiseBinaryOp,Constant,Derived> pow(const Scalar& x,const Eigen::ArrayBase& x); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar), - const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type - pow(const Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); - } - - template - inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow) - pow(const typename Derived::Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,pow)) + pow(const Scalar& x, const Eigen::ArrayBase& exponents) { + typedef typename internal::promote_scalar_arg::type PromotedScalar; + return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)( + typename internal::plain_constant_type::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op(x)), exponents.derived()); } #endif diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/IO.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/IO.h index da7fd6cce2..e81c315216 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/IO.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/IO.h @@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& * - \b rowSuffix string printed at the end of each row * - \b matPrefix string printed at the beginning of the matrix * - \b matSuffix string printed at the end of the matrix + * - \b fill character printed to fill the empty space in aligned columns * * Example: \include IOFormat.cpp * Output: \verbinclude IOFormat.out @@ -53,9 +54,9 @@ struct IOFormat IOFormat(int _precision = StreamPrecision, int _flags = 0, const std::string& _coeffSeparator = " ", const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="", - const std::string& _matPrefix="", const std::string& _matSuffix="") + const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ') : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator), - rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags) + rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags) { // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline // don't add rowSpacer if columns are not to be aligned @@ -71,6 +72,7 @@ struct IOFormat std::string matPrefix, matSuffix; std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer; std::string coeffSeparator; + char fill; int precision; int flags; }; @@ -128,6 +130,9 @@ struct significant_decimals_impl template std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt) { + using internal::is_same; + using internal::conditional; + if(_m.size() == 0) { s << fmt.matPrefix << fmt.matSuffix; @@ -136,6 +141,22 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& typename Derived::Nested m = _m; typedef typename Derived::Scalar Scalar; + typedef typename + conditional< + is_same::value || + is_same::value || + is_same::value || + is_same::value, + int, + typename conditional< + is_same >::value || + is_same >::value || + is_same >::value || + is_same >::value, + std::complex, + const Scalar& + >::type + >::type PrintType; Index width = 0; @@ -172,23 +193,31 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& { std::stringstream sstr; sstr.copyfmt(s); - sstr << m.coeff(i,j); + sstr << static_cast(m.coeff(i,j)); width = std::max(width, Index(sstr.str().length())); } } + std::streamsize old_width = s.width(); + char old_fill_character = s.fill(); s << fmt.matPrefix; for(Index i = 0; i < m.rows(); ++i) { if (i) s << fmt.rowSpacer; s << fmt.rowPrefix; - if(width) s.width(width); - s << m.coeff(i, 0); + if(width) { + s.fill(fmt.fill); + s.width(width); + } + s << static_cast(m.coeff(i, 0)); for(Index j = 1; j < m.cols(); ++j) { s << fmt.coeffSeparator; - if (width) s.width(width); - s << m.coeff(i, j); + if(width) { + s.fill(fmt.fill); + s.width(width); + } + s << static_cast(m.coeff(i, j)); } s << fmt.rowSuffix; if( i < m.rows() - 1) @@ -196,6 +225,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& } s << fmt.matSuffix; if(explicit_precision) s.precision(old_precision); + if(width) { + s.fill(old_fill_character); + s.width(old_width); + } return s; } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/IndexedView.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/IndexedView.h new file mode 100644 index 0000000000..08476251d3 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/IndexedView.h @@ -0,0 +1,237 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_INDEXED_VIEW_H +#define EIGEN_INDEXED_VIEW_H + +namespace Eigen { + +namespace internal { + +template +struct traits > + : traits +{ + enum { + RowsAtCompileTime = int(array_size::value), + ColsAtCompileTime = int(array_size::value), + MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic, + MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic, + + XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0, + IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 + : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 + : XprTypeIsRowMajor, + + RowIncr = int(get_compile_time_incr::value), + ColIncr = int(get_compile_time_incr::value), + InnerIncr = IsRowMajor ? ColIncr : RowIncr, + OuterIncr = IsRowMajor ? RowIncr : ColIncr, + + HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor), + XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time::ret) : int(outer_stride_at_compile_time::ret), + XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), + + InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime, + IsBlockAlike = InnerIncr==1 && OuterIncr==1, + IsInnerPannel = HasSameStorageOrderAsXprType && is_same,typename conditional::type>::value, + + InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr, + OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr, + + ReturnAsScalar = is_same::value && is_same::value, + ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike, + ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock), + + // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag, + // but this is too strict regarding negative strides... + DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0, + FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, + FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, + Flags = (traits::Flags & (HereditaryBits | DirectAccessMask )) | FlagsLvalueBit | FlagsRowMajorBit | FlagsLinearAccessBit + }; + + typedef Block BlockType; +}; + +} + +template +class IndexedViewImpl; + + +/** \class IndexedView + * \ingroup Core_Module + * + * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices + * + * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns + * \tparam RowIndices the type of the object defining the sequence of row indices + * \tparam ColIndices the type of the object defining the sequence of column indices + * + * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection + * of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$ + * and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$ be the nested matrix, then the resulting matrix \f$ B \f$ has \c m + * rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$. + * + * The \c RowIndices and \c ColIndices types must be compatible with the following API: + * \code + * operator[](Index) const; + * Index size() const; + * \endcode + * + * Typical supported types thus include: + * - std::vector + * - std::valarray + * - std::array + * - Plain C arrays: int[N] + * - Eigen::ArrayXi + * - decltype(ArrayXi::LinSpaced(...)) + * - Any view/expressions of the previous types + * - Eigen::ArithmeticSequence + * - Eigen::internal::AllRange (helper for Eigen::all) + * - Eigen::internal::SingleRange (helper for single index) + * - etc. + * + * In typical usages of %Eigen, this class should never be used directly. It is the return type of + * DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * \sa class Block + */ +template +class IndexedView : public IndexedViewImpl::StorageKind> +{ +public: + typedef typename IndexedViewImpl::StorageKind>::Base Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView) + + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename internal::remove_all::type NestedExpression; + + template + IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices) + : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices) + {} + + /** \returns number of rows */ + Index rows() const { return internal::size(m_rowIndices); } + + /** \returns number of columns */ + Index cols() const { return internal::size(m_colIndices); } + + /** \returns the nested expression */ + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + /** \returns the nested expression */ + typename internal::remove_reference::type& + nestedExpression() { return m_xpr; } + + /** \returns a const reference to the object storing/generating the row indices */ + const RowIndices& rowIndices() const { return m_rowIndices; } + + /** \returns a const reference to the object storing/generating the column indices */ + const ColIndices& colIndices() const { return m_colIndices; } + +protected: + MatrixTypeNested m_xpr; + RowIndices m_rowIndices; + ColIndices m_colIndices; +}; + + +// Generic API dispatcher +template +class IndexedViewImpl + : public internal::generic_xpr_base >::type +{ +public: + typedef typename internal::generic_xpr_base >::type Base; +}; + +namespace internal { + + +template +struct unary_evaluator, IndexBased> + : evaluator_base > +{ + typedef IndexedView XprType; + + enum { + CoeffReadCost = evaluator::CoeffReadCost /* TODO + cost of row/col index */, + + FlagsLinearAccessBit = (traits::RowsAtCompileTime == 1 || traits::ColsAtCompileTime == 1) ? LinearAccessBit : 0, + + FlagsRowMajorBit = traits::FlagsRowMajorBit, + + Flags = (evaluator::Flags & (HereditaryBits & ~RowMajorBit /*| LinearAccessBit | DirectAccessBit*/)) | FlagsLinearAccessBit | FlagsRowMajorBit, + + Alignment = 0 + }; + + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index index) const + { + Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const CoeffReturnType coeff(Index index) const + { + Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + +protected: + + evaluator m_argImpl; + const XprType& m_xpr; + +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_INDEXED_VIEW_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Inverse.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Inverse.h index b76f0439d8..c514438c45 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Inverse.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Inverse.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2014 Gael Guennebaud +// Copyright (C) 2014-2019 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,7 +10,7 @@ #ifndef EIGEN_INVERSE_H #define EIGEN_INVERSE_H -namespace Eigen { +namespace Eigen { template class InverseImpl; @@ -44,19 +44,18 @@ class Inverse : public InverseImpl::S { public: typedef typename XprType::StorageIndex StorageIndex; - typedef typename XprType::PlainObject PlainObject; typedef typename XprType::Scalar Scalar; typedef typename internal::ref_selector::type XprTypeNested; typedef typename internal::remove_all::type XprTypeNestedCleaned; typedef typename internal::ref_selector::type Nested; typedef typename internal::remove_all::type NestedExpression; - + explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr) : m_xpr(xpr) {} - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); } EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; } @@ -82,7 +81,7 @@ namespace internal { /** \internal * \brief Default evaluator for Inverse expression. - * + * * This default evaluator for Inverse expression simply evaluate the inverse into a temporary * by a call to internal::call_assignment_no_alias. * Therefore, inverse implementers only have to specialize Assignment, ...> for @@ -97,7 +96,7 @@ struct unary_evaluator > typedef Inverse InverseType; typedef typename InverseType::PlainObject PlainObject; typedef evaluator Base; - + enum { Flags = Base::Flags | EvalBeforeNestingBit }; unary_evaluator(const InverseType& inv_xpr) @@ -106,11 +105,11 @@ struct unary_evaluator > ::new (static_cast(this)) Base(m_result); internal::call_assignment_no_alias(m_result, inv_xpr); } - + protected: PlainObject m_result; }; - + } // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Map.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Map.h index 548bf9a2d5..218cc157f3 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Map.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Map.h @@ -11,7 +11,7 @@ #ifndef EIGEN_MAP_H #define EIGEN_MAP_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -47,7 +47,7 @@ private: * \brief A matrix or vector expression mapping an existing array of data. * * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. + * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. * The default is \c #Unaligned. * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout * of an ordinary, contiguous array. This can be overridden by specifying strides. @@ -104,19 +104,19 @@ template class Ma EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { - return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer() - : int(internal::traits::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) + return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() + : internal::traits::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) : IsVectorAtCompileTime ? (this->size() * innerStride()) - : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride()) + : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) : (this->rows() * innerStride()); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MapBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MapBase.h index 92c3b2818a..d856447f03 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MapBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MapBase.h @@ -15,7 +15,7 @@ EIGEN_STATIC_ASSERT((int(internal::evaluator::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \ YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT) -namespace Eigen { +namespace Eigen { /** \ingroup Core_Module * @@ -87,9 +87,11 @@ template class MapBase typedef typename Base::CoeffReturnType CoeffReturnType; /** \copydoc DenseBase::rows() */ - EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); } /** \copydoc DenseBase::cols() */ - EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); } /** Returns a pointer to the first coefficient of the matrix or vector. * diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MathFunctions.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MathFunctions.h index 01736c2a06..61b78f4f20 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MathFunctions.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MathFunctions.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2006-2010 Benoit Jacob +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,10 +11,11 @@ #ifndef EIGEN_MATHFUNCTIONS_H #define EIGEN_MATHFUNCTIONS_H -// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html // TODO this should better be moved to NumTraits -#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L - +// Source: WolframAlpha +#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L +#define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L +#define EIGEN_LN2 0.693147180559945309417232121458176568075500134360255254120680009493393621L namespace Eigen { @@ -97,7 +99,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct real_impl > { @@ -145,7 +147,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct imag_impl > { @@ -213,12 +215,12 @@ struct imag_ref_default_impl template struct imag_ref_default_impl { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Scalar run(Scalar&) { return Scalar(0); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline const Scalar run(const Scalar&) { return Scalar(0); @@ -239,7 +241,7 @@ struct imag_ref_retval ****************************************************************************/ template::IsComplex> -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -249,7 +251,7 @@ struct conj_impl }; template -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -259,6 +261,9 @@ struct conj_impl } }; +template::IsComplex> +struct conj_impl : conj_default_impl {}; + template struct conj_retval { @@ -308,6 +313,65 @@ struct abs2_retval typedef typename NumTraits::Real type; }; +/**************************************************************************** +* Implementation of sqrt/rsqrt * +****************************************************************************/ + +template +struct sqrt_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x) + { + EIGEN_USING_STD(sqrt); + return sqrt(x); + } +}; + +// Complex sqrt defined in MathFunctionsImpl.h. +template EIGEN_DEVICE_FUNC std::complex complex_sqrt(const std::complex& a_x); + +// Custom implementation is faster than `std::sqrt`, works on +// GPU, and correctly handles special cases (unlike MSVC). +template +struct sqrt_impl > +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE std::complex run(const std::complex& x) + { + return complex_sqrt(x); + } +}; + +template +struct sqrt_retval +{ + typedef Scalar type; +}; + +// Default implementation relies on numext::sqrt, at bottom of file. +template +struct rsqrt_impl; + +// Complex rsqrt defined in MathFunctionsImpl.h. +template EIGEN_DEVICE_FUNC std::complex complex_rsqrt(const std::complex& a_x); + +template +struct rsqrt_impl > +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE std::complex run(const std::complex& x) + { + return complex_rsqrt(x); + } +}; + +template +struct rsqrt_retval +{ + typedef Scalar type; +}; + /**************************************************************************** * Implementation of norm1 * ****************************************************************************/ @@ -322,7 +386,7 @@ struct norm1_default_impl EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return abs(x.real()) + abs(x.imag()); } }; @@ -333,7 +397,7 @@ struct norm1_default_impl EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return abs(x); } }; @@ -363,7 +427,7 @@ struct hypot_retval * Implementation of cast * ****************************************************************************/ -template +template struct cast_impl { EIGEN_DEVICE_FUNC @@ -373,6 +437,22 @@ struct cast_impl } }; +// Casting from S -> Complex leads to an implicit conversion from S to T, +// generating warnings on clang. Here we explicitly cast the real component. +template +struct cast_impl::IsComplex && NumTraits::IsComplex + >::type> +{ + EIGEN_DEVICE_FUNC + static inline NewType run(const OldType& x) + { + typedef typename NumTraits::Real NewReal; + return static_cast(static_cast(x)); + } +}; + // here, for once, we're plainly returning NewType: we don't want cast to do weird things. template @@ -386,29 +466,59 @@ inline NewType cast(const OldType& x) * Implementation of round * ****************************************************************************/ -#if EIGEN_HAS_CXX11_MATH - template - struct round_impl { - static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) - using std::round; - return round(x); - } - }; -#else - template - struct round_impl +template +struct round_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) { - static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) - EIGEN_USING_STD_MATH(floor); - EIGEN_USING_STD_MATH(ceil); - return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5)); - } - }; + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) +#if EIGEN_HAS_CXX11_MATH + EIGEN_USING_STD(round); #endif + return Scalar(round(x)); + } +}; + +#if !EIGEN_HAS_CXX11_MATH +#if EIGEN_HAS_C99_MATH +// Use ::roundf for float. +template<> +struct round_impl { + EIGEN_DEVICE_FUNC + static inline float run(const float& x) + { + return ::roundf(x); + } +}; +#else +template +struct round_using_floor_ceil_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) + // Without C99 round/roundf, resort to floor/ceil. + EIGEN_USING_STD(floor); + EIGEN_USING_STD(ceil); + // If not enough precision to resolve a decimal at all, return the input. + // Otherwise, adding 0.5 can trigger an increment by 1. + const Scalar limit = Scalar(1ull << (NumTraits::digits() - 1)); + if (x >= limit || x <= -limit) { + return x; + } + return (x > Scalar(0)) ? Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5))); + } +}; + +template<> +struct round_impl : round_using_floor_ceil_impl {}; + +template<> +struct round_impl : round_using_floor_ceil_impl {}; +#endif // EIGEN_HAS_C99_MATH +#endif // !EIGEN_HAS_CXX11_MATH template struct round_retval @@ -416,44 +526,113 @@ struct round_retval typedef Scalar type; }; +/**************************************************************************** +* Implementation of rint * +****************************************************************************/ + +template +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) +#if EIGEN_HAS_CXX11_MATH + EIGEN_USING_STD(rint); +#endif + return rint(x); + } +}; + +#if !EIGEN_HAS_CXX11_MATH +template<> +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline double run(const double& x) + { + return ::rint(x); + } +}; +template<> +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline float run(const float& x) + { + return ::rintf(x); + } +}; +#endif + +template +struct rint_retval +{ + typedef Scalar type; +}; + /**************************************************************************** * Implementation of arg * ****************************************************************************/ -#if EIGEN_HAS_CXX11_MATH - template - struct arg_impl { - static inline Scalar run(const Scalar& x) - { - EIGEN_USING_STD_MATH(arg); - return arg(x); - } - }; +// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs. +// This seems to be fixed in VS 2019. +#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920) +// std::arg is only defined for types of std::complex, or integer types or float/double/long double +template::IsComplex || is_integral::value + || is_same::value || is_same::value + || is_same::value > +struct arg_default_impl; + +template +struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // HIP does not seem to have a native device side implementation for the math routine "arg" + using std::arg; + #else + EIGEN_USING_STD(arg); + #endif + return static_cast(arg(x)); + } +}; + +// Must be non-complex floating-point type (e.g. half/bfloat16). +template +struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); + } +}; #else - template::IsComplex> - struct arg_default_impl +template::IsComplex> +struct arg_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) - { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); } - }; + return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); + } +}; - template - struct arg_default_impl +template +struct arg_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) - { - EIGEN_USING_STD_MATH(arg); - return arg(x); - } - }; - - template struct arg_impl : arg_default_impl {}; + EIGEN_USING_STD(arg); + return arg(x); + } +}; #endif +template struct arg_impl : arg_default_impl {}; template struct arg_retval @@ -461,6 +640,80 @@ struct arg_retval typedef typename NumTraits::Real type; }; +/**************************************************************************** +* Implementation of expm1 * +****************************************************************************/ + +// This implementation is based on GSL Math's expm1. +namespace std_fallback { + // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar, + // or that there is no suitable std::expm1 function available. Implementation + // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php. + template + EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + typedef typename NumTraits::Real RealScalar; + + EIGEN_USING_STD(exp); + Scalar u = exp(x); + if (numext::equal_strict(u, Scalar(1))) { + return x; + } + Scalar um1 = u - RealScalar(1); + if (numext::equal_strict(um1, Scalar(-1))) { + return RealScalar(-1); + } + + EIGEN_USING_STD(log); + Scalar logu = log(u); + return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu; + } +} + +template +struct expm1_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + #if EIGEN_HAS_CXX11_MATH + using std::expm1; + #else + using std_fallback::expm1; + #endif + return expm1(x); + } +}; + +template +struct expm1_retval +{ + typedef Scalar type; +}; + +/**************************************************************************** +* Implementation of log * +****************************************************************************/ + +// Complex log defined in MathFunctionsImpl.h. +template EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z); + +template +struct log_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_USING_STD(log); + return static_cast(log(x)); + } +}; + +template +struct log_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run(const std::complex& z) + { + return complex_log(z); + } +}; + /**************************************************************************** * Implementation of log1p * ****************************************************************************/ @@ -472,25 +725,38 @@ namespace std_fallback { EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) typedef typename NumTraits::Real RealScalar; - EIGEN_USING_STD_MATH(log); + EIGEN_USING_STD(log); Scalar x1p = RealScalar(1) + x; - return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + Scalar log_1p = log_impl::run(x1p); + const bool is_small = numext::equal_strict(x1p, Scalar(1)); + const bool is_inf = numext::equal_strict(x1p, log_1p); + return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1))); } } template struct log1p_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH using std::log1p; - #endif + #else using std_fallback::log1p; + #endif return log1p(x); } }; +// Specialization for complex types that are not supported by std::log1p. +template +struct log1p_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run( + const std::complex& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) + return std_fallback::log1p(x); + } +}; template struct log1p_retval @@ -509,7 +775,7 @@ struct pow_impl typedef typename ScalarBinaryOpTraits >::ReturnType result_type; static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y) { - EIGEN_USING_STD_MATH(pow); + EIGEN_USING_STD(pow); return pow(x, y); } }; @@ -687,7 +953,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); } -// Implementatin of is* functions +// Implementation of is* functions // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. #if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) @@ -716,7 +982,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -731,7 +997,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -746,7 +1012,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -803,7 +1069,6 @@ template EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x) template EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x); template T generic_fast_tanh_float(const T& a_x); - } // end namespace internal /**************************************************************************** @@ -812,12 +1077,12 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#ifndef __CUDA_ARCH__ +#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) { - EIGEN_USING_STD_MATH(min); + EIGEN_USING_STD(min) return min EIGEN_NOT_A_MACRO (x,y); } @@ -825,7 +1090,7 @@ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) { - EIGEN_USING_STD_MATH(max); + EIGEN_USING_STD(max) return max EIGEN_NOT_A_MACRO (x,y); } #else @@ -841,6 +1106,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) { return fminf(x, y); } +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) +{ + return fmin(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) +{ +#if defined(EIGEN_HIPCC) + // no "fminl" on HIP yet + return (x < y) ? x : y; +#else + return fminl(x, y); +#endif +} + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) @@ -853,6 +1136,92 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) { return fmaxf(x, y); } +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) +{ + return fmax(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) +{ +#if defined(EIGEN_HIPCC) + // no "fmaxl" on HIP yet + return (x > y) ? x : y; +#else + return fmaxl(x, y); +#endif +} +#endif + +#if defined(SYCL_DEVICE_ONLY) + + +#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long) +#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long) +#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) +#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) +#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) +#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) +#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double) +#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double) +#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double) + +#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ +template<> \ + EIGEN_DEVICE_FUNC \ + EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \ + return cl::sycl::FUNC(x); \ + } + +#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE) + +#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \ + template<> \ + EIGEN_DEVICE_FUNC \ + EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \ + return cl::sycl::FUNC(x, y); \ + } + +#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ + SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE) + +#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \ + SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE) + +SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin) +SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax) + #endif @@ -922,6 +1291,34 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) EIGEN_DEVICE_FUNC inline bool abs2(bool x) { return x; } +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) +{ + return x > y ? x - y : y - x; +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y) +{ + return fabsf(x - y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y) +{ + return fabs(x - y); +} + +#if !defined(EIGEN_GPUCC) +// HIP and CUDA do not support long double. +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) { + return fabsl(x - y); +} +#endif + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) @@ -936,6 +1333,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y); } +#if defined(SYCL_DEVICE_ONLY) + SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot) +#endif + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) @@ -943,7 +1344,11 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -958,10 +1363,27 @@ inline typename internal::pow_impl::result_type pow(const Scala return internal::pow_impl::run(x, y); } +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow) +#endif + template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); } template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); } template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool) +#endif + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x); +} + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) @@ -969,15 +1391,23 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x); } +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round) +#endif + template EIGEN_DEVICE_FUNC T (floor)(const T& x) { - EIGEN_USING_STD_MATH(floor); + EIGEN_USING_STD(floor) return floor(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -989,11 +1419,15 @@ template EIGEN_DEVICE_FUNC T (ceil)(const T& x) { - EIGEN_USING_STD_MATH(ceil); + EIGEN_USING_STD(ceil); return ceil(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1026,22 +1460,42 @@ inline int log2(int x) * * It's usage is justified in performance critical functions, like norm/normalize. */ +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x); +} + +// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool). +template<> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +bool sqrt(const bool &x) { return x; } + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt) +#endif + +/** \returns the reciprocal square root of \a x. **/ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T sqrt(const T &x) +T rsqrt(const T& x) { - EIGEN_USING_STD_MATH(sqrt); - return sqrt(x); + return internal::rsqrt_impl::run(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T &x) { - EIGEN_USING_STD_MATH(log); - return log(x); + return internal::log_impl::run(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log) +#endif + + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1053,7 +1507,7 @@ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::IsSigned || NumTraits::IsComplex,typename NumTraits::Real>::type abs(const T &x) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return abs(x); } @@ -1064,12 +1518,12 @@ abs(const T &x) { return x; } -#if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } -EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } -#endif // defined(__SYCL_DEVICE_ONLY__) +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs) +#endif -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1090,26 +1544,69 @@ double abs(const std::complex& x) { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T &x) { - EIGEN_USING_STD_MATH(exp); + EIGEN_USING_STD(exp); return exp(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + float com = ::expf(x.real()); + float res_real = com * ::cosf(x.imag()); + float res_imag = com * ::sinf(x.imag()); + return std::complex(res_real, res_imag); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + double com = ::exp(x.real()); + double res_real = com * ::cos(x.imag()); + double res_imag = com * ::sin(x.imag()); + return std::complex(res_real, res_imag); +} +#endif + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x); +} + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1) +#endif + +#if defined(EIGEN_GPUCC) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float expm1(const float &x) { return ::expm1f(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double expm1(const double &x) { return ::expm1(x); } #endif template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cos(const T &x) { - EIGEN_USING_STD_MATH(cos); + EIGEN_USING_STD(cos); return cos(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1120,11 +1617,15 @@ double cos(const double &x) { return ::cos(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sin(const T &x) { - EIGEN_USING_STD_MATH(sin); + EIGEN_USING_STD(sin); return sin(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1135,11 +1636,15 @@ double sin(const double &x) { return ::sin(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T &x) { - EIGEN_USING_STD_MATH(tan); + EIGEN_USING_STD(tan); return tan(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1150,11 +1655,25 @@ double tan(const double &x) { return ::tan(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acos(const T &x) { - EIGEN_USING_STD_MATH(acos); + EIGEN_USING_STD(acos); return acos(x); } -#ifdef __CUDACC__ +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T acosh(const T &x) { + EIGEN_USING_STD(acosh); + return static_cast(acosh(x)); +} +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1165,11 +1684,25 @@ double acos(const double &x) { return ::acos(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asin(const T &x) { - EIGEN_USING_STD_MATH(asin); + EIGEN_USING_STD(asin); return asin(x); } -#ifdef __CUDACC__ +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T asinh(const T &x) { + EIGEN_USING_STD(asinh); + return static_cast(asinh(x)); +} +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1180,11 +1713,25 @@ double asin(const double &x) { return ::asin(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan(const T &x) { - EIGEN_USING_STD_MATH(atan); - return atan(x); + EIGEN_USING_STD(atan); + return static_cast(atan(x)); } -#ifdef __CUDACC__ +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T atanh(const T &x) { + EIGEN_USING_STD(atanh); + return static_cast(atanh(x)); +} +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1196,11 +1743,15 @@ double atan(const double &x) { return ::atan(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cosh(const T &x) { - EIGEN_USING_STD_MATH(cosh); - return cosh(x); + EIGEN_USING_STD(cosh); + return static_cast(cosh(x)); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1211,11 +1762,15 @@ double cosh(const double &x) { return ::cosh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sinh(const T &x) { - EIGEN_USING_STD_MATH(sinh); - return sinh(x); + EIGEN_USING_STD(sinh); + return static_cast(sinh(x)); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1226,16 +1781,20 @@ double sinh(const double &x) { return ::sinh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T &x) { - EIGEN_USING_STD_MATH(tanh); + EIGEN_USING_STD(tanh); return tanh(x); } -#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH +#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY) EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1246,11 +1805,15 @@ double tanh(const double &x) { return ::tanh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) { - EIGEN_USING_STD_MATH(fmod); + EIGEN_USING_STD(fmod); return fmod(a, b); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod) +#endif + +#if defined(EIGEN_GPUCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { @@ -1264,6 +1827,23 @@ double fmod(const double& a, const double& b) { } #endif +#if defined(SYCL_DEVICE_ONLY) +#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE +#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC +#undef SYCL_SPECIALIZE_UNARY_FUNC +#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC +#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC +#undef SYCL_SPECIALIZE_BINARY_FUNC +#endif + } // end namespace numext namespace internal { @@ -1387,18 +1967,23 @@ template<> struct random_impl { return random(0,1)==0 ? false : true; } + + static inline bool run(const bool& a, const bool& b) + { + return random(a, b)==0 ? false : true; + } }; template<> struct scalar_fuzzy_impl { typedef bool RealScalar; - + template EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&) { return !x; } - + EIGEN_DEVICE_FUNC static inline bool isApprox(bool x, bool y, bool) { @@ -1410,10 +1995,61 @@ template<> struct scalar_fuzzy_impl { return (!x) || y; } - + }; - +} // end namespace internal + +// Default implementations that rely on other numext implementations +namespace internal { + +// Specialization for complex types that are not supported by std::expm1. +template +struct expm1_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run( + const std::complex& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) + RealScalar xr = x.real(); + RealScalar xi = x.imag(); + // expm1(z) = exp(z) - 1 + // = exp(x + i * y) - 1 + // = exp(x) * (cos(y) + i * sin(y)) - 1 + // = exp(x) * cos(y) - 1 + i * exp(x) * sin(y) + // Imag(expm1(z)) = exp(x) * sin(y) + // Real(expm1(z)) = exp(x) * cos(y) - 1 + // = exp(x) * cos(y) - 1. + // = expm1(x) + exp(x) * (cos(y) - 1) + // = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2) + RealScalar erm1 = numext::expm1(xr); + RealScalar er = erm1 + RealScalar(1.); + RealScalar sin2 = numext::sin(xi / RealScalar(2.)); + sin2 = sin2 * sin2; + RealScalar s = numext::sin(xi); + RealScalar real_part = erm1 - RealScalar(2.) * er * sin2; + return std::complex(real_part, er * s); + } +}; + +template +struct rsqrt_impl { + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE T run(const T& x) { + return T(1)/numext::sqrt(x); + } +}; + +#if defined(EIGEN_GPU_COMPILE_PHASE) +template +struct conj_impl, true> +{ + EIGEN_DEVICE_FUNC + static inline std::complex run(const std::complex& x) + { + return std::complex(numext::real(x), -numext::imag(x)); + } +}; +#endif + } // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MathFunctionsImpl.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MathFunctionsImpl.h index 9c1ceb0eb0..4eaaaa7844 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MathFunctionsImpl.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MathFunctionsImpl.h @@ -17,24 +17,28 @@ namespace internal { /** \internal \returns the hyperbolic tan of \a a (coeff-wise) Doesn't do anything fancy, just a 13/6-degree rational interpolant which - is accurate up to a couple of ulp in the range [-9, 9], outside of which - the tanh(x) = +/-1. + is accurate up to a couple of ulps in the (approximate) range [-8, 8], + outside of which tanh(x) = +/-1 in single precision. The input is clamped + to the range [-c, c]. The value c is chosen as the smallest value where + the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004] + the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero. This implementation works on both scalars and packets. */ template T generic_fast_tanh_float(const T& a_x) { - // Clamp the inputs to the range [-9, 9] since anything outside - // this range is +/-1.0f in single-precision. - const T plus_9 = pset1(9.f); - const T minus_9 = pset1(-9.f); - // NOTE GCC prior to 6.3 might improperly optimize this max/min - // step such that if a_x is nan, x will be either 9 or -9, - // and tanh will return 1 or -1 instead of nan. - // This is supposed to be fixed in gcc6.3, - // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - const T x = pmax(minus_9,pmin(plus_9,a_x)); + // Clamp the inputs to the range [-c, c] +#ifdef EIGEN_VECTORIZE_FMA + const T plus_clamp = pset1(7.99881172180175781f); + const T minus_clamp = pset1(-7.99881172180175781f); +#else + const T plus_clamp = pset1(7.90531110763549805f); + const T minus_clamp = pset1(-7.90531110763549805f); +#endif + const T tiny = pset1(0.0004f); + const T x = pmax(pmin(a_x, plus_clamp), minus_clamp); + const T tiny_mask = pcmp_lt(pabs(a_x), tiny); // The monomial coefficients of the numerator polynomial (odd). const T alpha_1 = pset1(4.89352455891786e-03f); const T alpha_3 = pset1(6.37261928875436e-04f); @@ -62,24 +66,30 @@ T generic_fast_tanh_float(const T& a_x) p = pmadd(x2, p, alpha_1); p = pmul(x, p); - // Evaluate the denominator polynomial p. + // Evaluate the denominator polynomial q. T q = pmadd(x2, beta_6, beta_4); q = pmadd(x2, q, beta_2); q = pmadd(x2, q, beta_0); // Divide the numerator by the denominator. - return pdiv(p, q); + return pselect(tiny_mask, x, pdiv(p, q)); } template -EIGEN_STRONG_INLINE +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) { - EIGEN_USING_STD_MATH(sqrt); + // IEEE IEC 6059 special cases. + if ((numext::isinf)(x) || (numext::isinf)(y)) + return NumTraits::infinity(); + if ((numext::isnan)(x) || (numext::isnan)(y)) + return NumTraits::quiet_NaN(); + + EIGEN_USING_STD(sqrt); RealScalar p, qp; p = numext::maxi(x,y); if(p==RealScalar(0)) return RealScalar(0); - qp = numext::mini(y,x) / p; + qp = numext::mini(y,x) / p; return p * sqrt(RealScalar(1) + qp*qp); } @@ -87,13 +97,102 @@ template struct hypot_impl { typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) + static EIGEN_DEVICE_FUNC + inline RealScalar run(const Scalar& x, const Scalar& y) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return positive_real_hypot(abs(x), abs(y)); } }; +// Generic complex sqrt implementation that correctly handles corner cases +// according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt +template +EIGEN_DEVICE_FUNC std::complex complex_sqrt(const std::complex& z) { + // Computes the principal sqrt of the input. + // + // For a complex square root of the number x + i*y. We want to find real + // numbers u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*v. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = y / (2 * u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = y / (2 * v) + // + // Letting w = sqrt(0.5 * (|x| + |z|)), + // if x == 0: u = w, v = sign(y) * w + // if x > 0: u = w, v = y / (2 * w) + // if x < 0: u = |y| / (2 * w), v = sign(y) * w + + const T x = numext::real(z); + const T y = numext::imag(z); + const T zero = T(0); + const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y))); + + return + (numext::isinf)(y) ? std::complex(NumTraits::infinity(), y) + : x == zero ? std::complex(w, y < zero ? -w : w) + : x > zero ? std::complex(w, y / (2 * w)) + : std::complex(numext::abs(y) / (2 * w), y < zero ? -w : w ); +} + +// Generic complex rsqrt implementation. +template +EIGEN_DEVICE_FUNC std::complex complex_rsqrt(const std::complex& z) { + // Computes the principal reciprocal sqrt of the input. + // + // For a complex reciprocal square root of the number z = x + i*y. We want to + // find real numbers u and v such that + // (u + i*v)^2 = 1 / (x + i*y) <=> + // u^2 - v^2 + i*2*u*v = x/|z|^2 - i*v/|z|^2. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x/|z|^2 + // 2*u*v = y/|z|^2. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + |z|)) / |z| + // v = -y / (2 * u * |z|) + // and for x < 0, + // v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z| + // u = -y / (2 * v * |z|) + // + // Letting w = sqrt(0.5 * (|x| + |z|)), + // if x == 0: u = w / |z|, v = -sign(y) * w / |z| + // if x > 0: u = w / |z|, v = -y / (2 * w * |z|) + // if x < 0: u = |y| / (2 * w * |z|), v = -sign(y) * w / |z| + + const T x = numext::real(z); + const T y = numext::imag(z); + const T zero = T(0); + + const T abs_z = numext::hypot(x, y); + const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z)); + const T woz = w / abs_z; + // Corner cases consistent with 1/sqrt(z) on gcc/clang. + return + abs_z == zero ? std::complex(NumTraits::infinity(), NumTraits::quiet_NaN()) + : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex(zero, zero) + : x == zero ? std::complex(woz, y < zero ? woz : -woz) + : x > zero ? std::complex(woz, -y / (2 * w * abs_z)) + : std::complex(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz ); +} + +template +EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z) { + // Computes complex log. + T a = numext::abs(z); + EIGEN_USING_STD(atan2); + T b = atan2(z.imag(), z.real()); + return std::complex(numext::log(a), b); +} + } // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Matrix.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Matrix.h index 7f4a7af93c..f0e59a911d 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Matrix.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Matrix.h @@ -29,7 +29,7 @@ private: required_alignment = unpacket_traits::alignment, packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0 }; - + public: typedef _Scalar Scalar; typedef Dense StorageKind; @@ -44,7 +44,7 @@ public: Options = _Options, InnerStrideAtCompileTime = 1, OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, - + // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, Alignment = actual_alignment @@ -255,53 +255,93 @@ class Matrix * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Matrix() : Base() + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix() : Base() { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } // FIXME is it still needed - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Matrix(internal::constructor_without_unaligned_array_assert) : Base(internal::constructor_without_unaligned_array_assert()) { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } #if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) : Base(std::move(other)) { Base::_check_template_params(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { - other.swap(*this); + Base::operator=(std::move(other)); return *this; } #endif - #ifndef EIGEN_PARSED_BY_DOXYGEN +#if EIGEN_HAS_CXX11 + /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&... args) + * + * Example: \include Matrix_variadic_ctor_cxx11.cpp + * Output: \verbinclude Matrix_variadic_ctor_cxx11.out + * + * \sa Matrix(const std::initializer_list>&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) {} + + /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Matrix_initializer_list_23_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column vector, implicit transposition from a single row is allowed. + * Therefore VectorXd{{1,2,3,4,5}} is legal and the more verbose syntax + * RowVectorXd{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Matrix_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes, + * and implicit transposition is allowed for compile-time vectors only. + * + * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} +#endif // end EIGEN_HAS_CXX11 + +#ifndef EIGEN_PARSED_BY_DOXYGEN // This constructor is for both 1x1 matrices and dynamic vectors template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE explicit Matrix(const T& x) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit Matrix(const T& x) { Base::_check_template_params(); Base::template _init1(x); } template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const T0& x, const T1& y) { Base::_check_template_params(); Base::template _init2(x, y); } - #else + + +#else /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Matrix(const Scalar *data); @@ -311,7 +351,7 @@ class Matrix * This is useful for dynamic-size vectors. For fixed-size vectors, * it is redundant to pass these parameters, so one should use the default constructor * Matrix() instead. - * + * * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance, * calling Matrix(1) will call the initialization constructor: Matrix(const Scalar&). * For fixed-size \c 1x1 matrices it is therefore recommended to use the default @@ -319,14 +359,15 @@ class Matrix * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives). */ EIGEN_STRONG_INLINE explicit Matrix(Index dim); - /** \brief Constructs an initialized 1x1 matrix with the given coefficient */ + /** \brief Constructs an initialized 1x1 matrix with the given coefficient + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x); /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns. * * This is useful for dynamic-size matrices. For fixed-size matrices, * it is redundant to pass these parameters, so one should use the default constructor * Matrix() instead. - * + * * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance, * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y). * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default @@ -335,12 +376,15 @@ class Matrix */ EIGEN_DEVICE_FUNC Matrix(Index rows, Index cols); - - /** \brief Constructs an initialized 2D vector with given coefficients */ - Matrix(const Scalar& x, const Scalar& y); - #endif - /** \brief Constructs an initialized 3D vector with given coefficients */ + /** \brief Constructs an initialized 2D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ + Matrix(const Scalar& x, const Scalar& y); + #endif // end EIGEN_PARSED_BY_DOXYGEN + + /** \brief Constructs an initialized 3D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) { @@ -350,7 +394,9 @@ class Matrix m_storage.data()[1] = y; m_storage.data()[2] = z; } - /** \brief Constructs an initialized 4D vector with given coefficients */ + /** \brief Constructs an initialized 4D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) { @@ -377,8 +423,10 @@ class Matrix : Base(other.derived()) { } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); } /////////// Geometry module /////////// @@ -405,7 +453,7 @@ class Matrix * * \ingroup Core_Module * - * Eigen defines several typedef shortcuts for most common matrix and vector types. + * %Eigen defines several typedef shortcuts for most common matrix and vector types. * * The general patterns are the following: * @@ -418,6 +466,15 @@ class Matrix * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is * a fixed-size vector of 4 complex floats. * + * With \cpp11, template alias are also defined for common sizes. + * They follow the same pattern as above except that the scalar type suffix is replaced by a + * template parameter, i.e.: + * - `MatrixSize` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size. + * - `MatrixXSize` and `MatrixSizeX` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices. + * - `VectorSize` and `RowVectorSize` for column and row vectors. + * + * With \cpp11, you can also use fully generic column and row vector types: `Vector` and `RowVector`. + * * \sa class Matrix */ @@ -454,6 +511,55 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS +#if EIGEN_HAS_CXX11 + +#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Vector##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using RowVector##SizeSuffix = Matrix; + +#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##Size##X = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##X##Size = Matrix; + +EIGEN_MAKE_TYPEDEFS(2, 2) +EIGEN_MAKE_TYPEDEFS(3, 3) +EIGEN_MAKE_TYPEDEFS(4, 4) +EIGEN_MAKE_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_FIXED_TYPEDEFS(2) +EIGEN_MAKE_FIXED_TYPEDEFS(3) +EIGEN_MAKE_FIXED_TYPEDEFS(4) + +/** \ingroup matrixtypedefs + * \brief \cpp11 */ +template +using Vector = Matrix; + +/** \ingroup matrixtypedefs + * \brief \cpp11 */ +template +using RowVector = Matrix; + +#undef EIGEN_MAKE_TYPEDEFS +#undef EIGEN_MAKE_FIXED_TYPEDEFS + +#endif // EIGEN_HAS_CXX11 + } // end namespace Eigen #endif // EIGEN_MATRIX_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MatrixBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MatrixBase.h index f8bcc8c6f5..45c3a596ec 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MatrixBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/MatrixBase.h @@ -76,6 +76,7 @@ template class MatrixBase using Base::coeffRef; using Base::lazyAssign; using Base::eval; + using Base::operator-; using Base::operator+=; using Base::operator-=; using Base::operator*=; @@ -122,7 +123,6 @@ template class MatrixBase #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase #define EIGEN_DOC_UNARY_ADDONS(X,Y) -# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/CommonCwiseBinaryOps.h" # include "../plugins/MatrixCwiseUnaryOps.h" # include "../plugins/MatrixCwiseBinaryOps.h" @@ -268,6 +268,8 @@ template class MatrixBase Derived& setIdentity(); EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setUnit(Index i); + EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i); bool isIdentity(const RealScalar& prec = NumTraits::dummy_precision()) const; bool isDiagonal(const RealScalar& prec = NumTraits::dummy_precision()) const; @@ -296,7 +298,7 @@ template class MatrixBase EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } - NoAlias noalias(); + NoAlias EIGEN_DEVICE_FUNC noalias(); // TODO forceAlignedAccess is temporarily disabled // Need to find a nicer workaround. @@ -326,6 +328,7 @@ template class MatrixBase inline const PartialPivLU lu() const; + EIGEN_DEVICE_FUNC inline const Inverse inverse() const; template @@ -335,12 +338,15 @@ template class MatrixBase bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + template inline void computeInverseWithCheck( ResultType& inverse, bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + + EIGEN_DEVICE_FUNC Scalar determinant() const; /////////// Cholesky module /////////// @@ -412,15 +418,19 @@ template class MatrixBase ////////// Householder module /////////// + EIGEN_DEVICE_FUNC void makeHouseholderInPlace(Scalar& tau, RealScalar& beta); template + EIGEN_DEVICE_FUNC void makeHouseholder(EssentialPart& essential, Scalar& tau, RealScalar& beta) const; template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheLeft(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheRight(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); @@ -428,8 +438,10 @@ template class MatrixBase ///////// Jacobi module ///////// template + EIGEN_DEVICE_FUNC void applyOnTheLeft(Index p, Index q, const JacobiRotation& j); template + EIGEN_DEVICE_FUNC void applyOnTheRight(Index p, Index q, const JacobiRotation& j); ///////// SparseCore module ///////// @@ -456,6 +468,11 @@ template class MatrixBase const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) +#if EIGEN_HAS_CXX11_MATH + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine) +#endif EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NestByValue.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NestByValue.h index 13adf070e8..b4275768a0 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NestByValue.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NestByValue.h @@ -16,7 +16,11 @@ namespace Eigen { namespace internal { template struct traits > : public traits -{}; +{ + enum { + Flags = traits::Flags & ~NestByRefBit + }; +}; } /** \class NestByValue @@ -41,57 +45,13 @@ template class NestByValue EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const - { - return m_expression.coeff(row, col); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) - { - return m_expression.const_cast_derived().coeffRef(row, col); - } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const - { - return m_expression.coeff(index); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) - { - return m_expression.const_cast_derived().coeffRef(index); - } - - template - inline const PacketScalar packet(Index row, Index col) const - { - return m_expression.template packet(row, col); - } - - template - inline void writePacket(Index row, Index col, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket(row, col, x); - } - - template - inline const PacketScalar packet(Index index) const - { - return m_expression.template packet(index); - } - - template - inline void writePacket(Index index, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket(index, x); - } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; } + EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; } + protected: const ExpressionType m_expression; }; @@ -99,12 +59,27 @@ template class NestByValue /** \returns an expression of the temporary version of *this. */ template -inline const NestByValue +EIGEN_DEVICE_FUNC inline const NestByValue DenseBase::nestByValue() const { return NestByValue(derived()); } +namespace internal { + +// Evaluator of Solve -> eval into a temporary +template +struct evaluator > + : public evaluator +{ + typedef evaluator Base; + + EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue& xpr) + : Base(xpr.nestedExpression()) + {} +}; +} + } // end namespace Eigen #endif // EIGEN_NESTBYVALUE_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NoAlias.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NoAlias.h index 33908010b4..570283d90f 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NoAlias.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NoAlias.h @@ -33,6 +33,7 @@ class NoAlias public: typedef typename ExpressionType::Scalar Scalar; + EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} template @@ -74,10 +75,10 @@ class NoAlias * * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag. * Currently, even though several expressions may alias, only product - * expressions have this flag. Therefore, noalias() is only usefull when + * expressions have this flag. Therefore, noalias() is only useful when * the source expression contains a matrix product. * - * Here are some examples where noalias is usefull: + * Here are some examples where noalias is useful: * \code * D.noalias() = A * B; * D.noalias() += A.transpose() * B; @@ -98,7 +99,7 @@ class NoAlias * \sa class NoAlias */ template -NoAlias MatrixBase::noalias() +NoAlias EIGEN_DEVICE_FUNC MatrixBase::noalias() { return NoAlias(derived()); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NumTraits.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NumTraits.h index daf4898789..72eac5a93c 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NumTraits.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/NumTraits.h @@ -21,12 +21,14 @@ template< typename T, bool is_integer = NumTraits::IsInteger> struct default_digits10_impl { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits::digits10; } }; template struct default_digits10_impl // Floating point { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { using std::log10; using std::ceil; @@ -38,11 +40,64 @@ struct default_digits10_impl // Floating point template struct default_digits10_impl // Integer { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { return 0; } +}; + + +// default implementation of digits(), based on numeric_limits if specialized, +// 0 for integer types, and log2(epsilon()) otherwise. +template< typename T, + bool use_numeric_limits = std::numeric_limits::is_specialized, + bool is_integer = NumTraits::IsInteger> +struct default_digits_impl +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { return std::numeric_limits::digits; } +}; + +template +struct default_digits_impl // Floating point +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { + using std::log; + using std::ceil; + typedef typename NumTraits::Real Real; + return int(ceil(-log(NumTraits::epsilon())/log(static_cast(2)))); + } +}; + +template +struct default_digits_impl // Integer +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; } }; } // end namespace internal +namespace numext { +/** \internal bit-wise cast without changing the underlying bit representation. */ + +// TODO: Replace by std::bit_cast (available in C++20) +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { +#if EIGEN_HAS_TYPE_TRAITS + // The behaviour of memcpy is not specified for non-trivially copyable types + EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value, THIS_TYPE_IS_NOT_SUPPORTED); + EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value && std::is_default_constructible::value, + THIS_TYPE_IS_NOT_SUPPORTED); +#endif + + EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED); + Tgt tgt; + EIGEN_USING_STD(memcpy) + memcpy(&tgt, &src, sizeof(Tgt)); + return tgt; +} +} // namespace numext + /** \class NumTraits * \ingroup Core_Module * @@ -71,7 +126,7 @@ struct default_digits10_impl // Integer * and to \c 0 otherwise. * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed * to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers. - * Stay vague here. No need to do architecture-specific stuff. + * Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost. * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned. * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must * be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise. @@ -80,9 +135,18 @@ struct default_digits10_impl // Integer * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default * value by the fuzzy comparison operators. * \li highest() and lowest() functions returning the highest and lowest possible values respectively. + * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is + * the analogue of std::numeric_limits::digits + * which is used as the default implementation if specialized. * \li digits10() function returning the number of decimal digits that can be represented without change. This is * the analogue of std::numeric_limits::digits10 * which is used as the default implementation if specialized. + * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively, + * such that the radix raised to the power exponent-1 is a normalized floating-point number. These are equivalent to + * std::numeric_limits::min_exponent/ + * std::numeric_limits::max_exponent. + * \li infinity() function returning a representation of positive infinity, if available. + * \li quiet_NaN function returning a non-signaling "not-a-number", if available. */ template struct GenericNumTraits @@ -106,42 +170,60 @@ template struct GenericNumTraits typedef T Nested; typedef T Literal; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return numext::numeric_limits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return internal::default_digits10_impl::run(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int digits() + { + return internal::default_digits_impl::run(); + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int min_exponent() + { + return numext::numeric_limits::min_exponent; + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int max_exponent() + { + return numext::numeric_limits::max_exponent; + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { // make sure to override this for floating-point types return Real(0); } - - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T highest() { return (numext::numeric_limits::max)(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T lowest() { - return IsInteger ? (numext::numeric_limits::min)() : (-(numext::numeric_limits::max)()); + return IsInteger ? (numext::numeric_limits::min)() + : static_cast(-(numext::numeric_limits::max)()); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T infinity() { return numext::numeric_limits::infinity(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T quiet_NaN() { return numext::numeric_limits::quiet_NaN(); } @@ -153,19 +235,20 @@ template struct NumTraits : GenericNumTraits template<> struct NumTraits : GenericNumTraits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline float dummy_precision() { return 1e-5f; } }; template<> struct NumTraits : GenericNumTraits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline double dummy_precision() { return 1e-12; } }; template<> struct NumTraits : GenericNumTraits { + EIGEN_CONSTEXPR static inline long double dummy_precision() { return 1e-15l; } }; @@ -182,11 +265,11 @@ template struct NumTraits > MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost }; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { return NumTraits::dummy_precision(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return NumTraits::digits10(); } }; @@ -206,16 +289,17 @@ struct NumTraits > IsInteger = NumTraits::IsInteger, IsSigned = NumTraits::IsSigned, RequireInitialization = 1, - ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::ReadCost, - AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::AddCost, - MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::MulCost + ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::ReadCost), + AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::AddCost), + MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::MulCost) }; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } + EIGEN_CONSTEXPR static inline int digits10() { return NumTraits::digits10(); } }; @@ -229,6 +313,7 @@ template<> struct NumTraits MulCost = HugeCost }; + EIGEN_CONSTEXPR static inline int digits10() { return 0; } private: @@ -243,6 +328,8 @@ private: // Empty specialization for void to allow template specialization based on NumTraits::Real with T==void and SFINAE. template<> struct NumTraits {}; +template<> struct NumTraits : GenericNumTraits {}; + } // end namespace Eigen #endif // EIGEN_NUMTRAITS_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PartialReduxEvaluator.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PartialReduxEvaluator.h new file mode 100644 index 0000000000..29abf35b99 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PartialReduxEvaluator.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011-2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PARTIALREDUX_H +#define EIGEN_PARTIALREDUX_H + +namespace Eigen { + +namespace internal { + + +/*************************************************************************** +* +* This file provides evaluators for partial reductions. +* There are two modes: +* +* - scalar path: simply calls the respective function on the column or row. +* -> nothing special here, all the tricky part is handled by the return +* types of VectorwiseOp's members. They embed the functor calling the +* respective DenseBase's member function. +* +* - vectorized path: implements a packet-wise reductions followed by +* some (optional) processing of the outcome, e.g., division by n for mean. +* +* For the vectorized path let's observe that the packet-size and outer-unrolling +* are both decided by the assignement logic. So all we have to do is to decide +* on the inner unrolling. +* +* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h, +* but be need to be careful to specify correct increment. +* +***************************************************************************/ + + +/* logic deciding a strategy for unrolling of vectorized paths */ +template +struct packetwise_redux_traits +{ + enum { + OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime, + Cost = OuterSize == Dynamic ? HugeCost + : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits::Cost, + Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling + }; + +}; + +/* Value to be returned when size==0 , by default let's return 0 */ +template +EIGEN_DEVICE_FUNC +PacketType packetwise_redux_empty_value(const Func& ) { return pset1(0); } + +/* For products the default is 1 */ +template +EIGEN_DEVICE_FUNC +PacketType packetwise_redux_empty_value(const scalar_product_op& ) { return pset1(1); } + +/* Perform the actual reduction */ +template::Unrolling +> +struct packetwise_redux_impl; + +/* Perform the actual reduction with unrolling */ +template +struct packetwise_redux_impl +{ + typedef redux_novec_unroller Base; + typedef typename Evaluator::Scalar Scalar; + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + PacketType run(const Evaluator &eval, const Func& func, Index /*size*/) + { + return redux_vec_unroller::OuterSize>::template run(eval,func); + } +}; + +/* Add a specialization of redux_vec_unroller for size==0 at compiletime. + * This specialization is not required for general reductions, which is + * why it is defined here. + */ +template +struct redux_vec_unroller +{ + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f) + { + return packetwise_redux_empty_value(f); + } +}; + +/* Perform the actual reduction for dynamic sizes */ +template +struct packetwise_redux_impl +{ + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + template + EIGEN_DEVICE_FUNC + static PacketType run(const Evaluator &eval, const Func& func, Index size) + { + if(size==0) + return packetwise_redux_empty_value(func); + + const Index size4 = (size-1)&(~3); + PacketType p = eval.template packetByOuterInner(0,0); + Index i = 1; + // This loop is optimized for instruction pipelining: + // - each iteration generates two independent instructions + // - thanks to branch prediction and out-of-order execution we have independent instructions across loops + for(; i(i+0,0),eval.template packetByOuterInner(i+1,0)), + func.packetOp(eval.template packetByOuterInner(i+2,0),eval.template packetByOuterInner(i+3,0)))); + for(; i(i,0)); + return p; + } +}; + +template< typename ArgType, typename MemberOp, int Direction> +struct evaluator > + : evaluator_base > +{ + typedef PartialReduxExpr XprType; + typedef typename internal::nested_eval::type ArgTypeNested; + typedef typename internal::add_const_on_value_type::type ConstArgTypeNested; + typedef typename internal::remove_all::type ArgTypeNestedCleaned; + typedef typename ArgType::Scalar InputScalar; + typedef typename XprType::Scalar Scalar; + enum { + TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) + }; + typedef typename MemberOp::template Cost CostOpType; + enum { + CoeffReadCost = TraversalSize==Dynamic ? HugeCost + : TraversalSize==0 ? 1 + : int(TraversalSize) * int(evaluator::CoeffReadCost) + int(CostOpType::value), + + _ArgFlags = evaluator::Flags, + + _Vectorizable = bool(int(_ArgFlags)&PacketAccessBit) + && bool(MemberOp::Vectorizable) + && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0) + && (TraversalSize!=0), + + Flags = (traits::Flags&RowMajorBit) + | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) + | (_Vectorizable ? PacketAccessBit : 0) + | LinearAccessBit, + + Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized + }; + + EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) + : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value))); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const + { + return coeff(Direction==Vertical ? j : i); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const + { + return m_functor(m_arg.template subVector(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketType packet(Index i, Index j) const + { + return packet(Direction==Vertical ? j : i); + } + + template + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + PacketType packet(Index idx) const + { + enum { PacketSize = internal::unpacket_traits::size }; + typedef Block PanelType; + + PanelType panel(m_arg, + Direction==Vertical ? 0 : idx, + Direction==Vertical ? idx : 0, + Direction==Vertical ? m_arg.rows() : Index(PacketSize), + Direction==Vertical ? Index(PacketSize) : m_arg.cols()); + + // FIXME + // See bug 1612, currently if PacketSize==1 (i.e. complex with 128bits registers) then the storage-order of panel get reversed + // and methods like packetByOuterInner do not make sense anymore in this context. + // So let's just by pass "vectorization" in this case: + if(PacketSize==1) + return internal::pset1(coeff(idx)); + + typedef typename internal::redux_evaluator PanelEvaluator; + PanelEvaluator panel_eval(panel); + typedef typename MemberOp::BinaryOp BinaryOp; + PacketType p = internal::packetwise_redux_impl::template run(panel_eval,m_functor.binaryFunc(),m_arg.outerSize()); + return p; + } + +protected: + ConstArgTypeNested m_arg; + const MemberOp m_functor; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PARTIALREDUX_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PermutationMatrix.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PermutationMatrix.h index 47c06ba770..69401bf41e 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PermutationMatrix.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PermutationMatrix.h @@ -88,13 +88,13 @@ class PermutationBase : public EigenBase } /** \returns the number of rows */ - inline Index rows() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); } /** \returns the number of columns */ - inline Index cols() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); } /** \returns the size of a side of the respective square matrix, i.e., the number of indices */ - inline Index size() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); } #ifndef EIGEN_PARSED_BY_DOXYGEN template diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PlainObjectBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PlainObjectBase.h index 0f3632cfd1..e2ddbd1d52 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PlainObjectBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/PlainObjectBase.h @@ -13,10 +13,10 @@ #if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO) # define EIGEN_INITIALIZE_COEFFS -# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i::quiet_NaN(); +# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i::quiet_NaN(); #else # undef EIGEN_INITIALIZE_COEFFS # define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED @@ -104,7 +104,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Scalar Scalar; - + typedef typename internal::packet_traits::type PacketScalar; typedef typename NumTraits::Real RealScalar; typedef Derived DenseType; @@ -118,16 +118,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type using Base::IsVectorAtCompileTime; using Base::Flags; - template friend class Eigen::Map; - friend class Eigen::Map; typedef Eigen::Map MapType; - friend class Eigen::Map; typedef const Eigen::Map ConstMapType; -#if EIGEN_MAX_ALIGN_BYTES>0 - // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice. - friend class Eigen::Map; - friend class Eigen::Map; -#endif typedef Eigen::Map AlignedMapType; typedef const Eigen::Map ConstAlignedMapType; template struct StridedMapType { typedef Eigen::Map type; }; @@ -147,10 +139,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast(this); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); } /** This is an overloaded version of DenseCoeffsBase::coeff(Index,Index) const * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts. @@ -358,7 +350,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * remain row-vectors and vectors remain vectors. */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resizeLike(const EigenBase& _other) { const OtherDerived& other = _other.derived(); @@ -383,7 +375,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or * conservativeResize(Index, NoChange_t). * - * Matrices are resized relative to the top-left element. In case values need to be + * Matrices are resized relative to the top-left element. In case values need to be * appended to the matrix they will be uninitialized. */ EIGEN_DEVICE_FUNC @@ -440,7 +432,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or * conservativeResize(Index, NoChange_t). * - * Matrices are resized relative to the top-left element. In case values need to be + * Matrices are resized relative to the top-left element. In case values need to be * appended to the matrix they will copied from \c other. */ template @@ -508,8 +500,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT { - using std::swap; - swap(m_storage, other.m_storage); + _check_template_params(); + m_storage = std::move(other.m_storage); return *this; } #endif @@ -526,6 +518,71 @@ class PlainObjectBase : public internal::dense_xpr_base::type // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + #if EIGEN_HAS_CXX11 + /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 + * + * \only_for_vectors + * + * This constructor is for 1D array or vectors with more than 4 coefficients. + * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. + * + * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this + * constructor must match the the fixed number of rows (resp. columns) of \c *this. + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : m_storage() + { + _check_template_params(); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4); + m_storage.data()[0] = a0; + m_storage.data()[1] = a1; + m_storage.data()[2] = a2; + m_storage.data()[3] = a3; + Index i = 4; + auto x = {(m_storage.data()[i++] = args, 0)...}; + static_cast(x); + } + + /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list) + : m_storage() + { + _check_template_params(); + + size_t list_size = 0; + if (list.begin() != list.end()) { + list_size = list.begin()->size(); + } + + // This is to allow syntax like VectorXi {{1, 2, 3, 4}} + if (ColsAtCompileTime == 1 && list.size() == 1) { + eigen_assert(list_size == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + resize(list_size, ColsAtCompileTime); + std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data()); + } else { + eigen_assert(list.size() == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + eigen_assert(list_size == static_cast(ColsAtCompileTime) || ColsAtCompileTime == Dynamic); + resize(list.size(), list_size); + + Index row_index = 0; + for (const std::initializer_list& row : list) { + eigen_assert(list_size == row.size()); + Index col_index = 0; + for (const Scalar& e : row) { + coeffRef(row_index, col_index) = e; + ++col_index; + } + ++row_index; + } + } + } + #endif // end EIGEN_HAS_CXX11 + /** \sa PlainObjectBase::operator=(const EigenBase&) */ template EIGEN_DEVICE_FUNC @@ -564,7 +621,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \copydetails DenseBase::operator=(const EigenBase &other) */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase &other) { _resize_to_match(other); @@ -652,18 +709,26 @@ class PlainObjectBase : public internal::dense_xpr_base::type using Base::setConstant; EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val); EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val); using Base::setZero; EIGEN_DEVICE_FUNC Derived& setZero(Index size); EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols); + EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t); using Base::setOnes; EIGEN_DEVICE_FUNC Derived& setOnes(Index size); EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols); + EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t); using Base::setRandom; Derived& setRandom(Index size); Derived& setRandom(Index rows, Index cols); + Derived& setRandom(NoChange_t, Index cols); + Derived& setRandom(Index rows, NoChange_t); #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN #include EIGEN_PLAINOBJECTBASE_PLUGIN @@ -678,7 +743,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * remain row-vectors and vectors remain vectors. */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase& other) { #ifdef EIGEN_NO_AUTOMATIC_RESIZING @@ -705,10 +770,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \internal */ - // aliasing is dealt once in internall::call_assignment + // aliasing is dealt once in internal::call_assignment // so at this stage we have to assume aliasing... and resising has to be done later. template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& _set(const DenseBase& other) { internal::call_assignment(this->derived(), other.derived()); @@ -721,7 +786,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \sa operator=(const MatrixBase&), _set() */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase& other) { // I don't think we need this resize call since the lazyAssign will anyways resize @@ -744,18 +809,18 @@ class PlainObjectBase : public internal::dense_xpr_base::type FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(rows,cols); } - + template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if::type* = 0) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2) m_storage.data()[0] = Scalar(val0); m_storage.data()[1] = Scalar(val1); } - + template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1, typename internal::enable_if< (!internal::is_same::value) && (internal::is_same::value) @@ -781,8 +846,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(size); } - - // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted) + + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) @@ -790,7 +855,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1) m_storage.data()[0] = val0; } - + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type) template EIGEN_DEVICE_FUNC @@ -846,7 +911,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type { this->derived() = r; } - + // For fixed-size Array template EIGEN_DEVICE_FUNC @@ -858,7 +923,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type { Base::setConstant(val0); } - + // For fixed-size Array template EIGEN_DEVICE_FUNC @@ -872,38 +937,38 @@ class PlainObjectBase : public internal::dense_xpr_base::type { Base::setConstant(val0); } - + template friend struct internal::matrix_swap_impl; public: - + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal * \brief Override DenseBase::swap() since for dynamic-sized matrices * of same type it is enough to swap the data pointers. */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase & other) { enum { SwapPointers = internal::is_same::value && Base::SizeAtCompileTime==Dynamic }; internal::matrix_swap_impl::run(this->derived(), other.derived()); } - + /** \internal * \brief const version forwarded to DenseBase::swap */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase const & other) { Base::swap(other.derived()); } - - EIGEN_DEVICE_FUNC + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void _check_template_params() { - EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor) - && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0) + EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor) + && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0) && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0)) && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0)) && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0)) @@ -915,6 +980,17 @@ class PlainObjectBase : public internal::dense_xpr_base::type } enum { IsPlainObjectBase = 1 }; +#endif + public: + // These apparently need to be down here for nvcc+icc to prevent duplicate + // Map symbol. + template friend class Eigen::Map; + friend class Eigen::Map; + friend class Eigen::Map; +#if EIGEN_MAX_ALIGN_BYTES>0 + // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice. + friend class Eigen::Map; + friend class Eigen::Map; #endif }; @@ -923,13 +999,19 @@ namespace internal { template struct conservative_resize_like_impl { + #if EIGEN_HAS_TYPE_TRAITS + static const bool IsRelocatable = std::is_trivially_copyable::value; + #else + static const bool IsRelocatable = !NumTraits::RequireInitialization; + #endif static void run(DenseBase& _this, Index rows, Index cols) { if (_this.rows() == rows && _this.cols() == cols) return; EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) - if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns + if ( IsRelocatable + && (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns { internal::check_rows_cols_for_overflow::run(rows, cols); _this.derived().m_storage.conservativeResize(rows*cols,rows,cols); @@ -937,7 +1019,7 @@ struct conservative_resize_like_impl else { // The storage order does not allow us to use reallocation. - typename Derived::PlainObject tmp(rows,cols); + Derived tmp(rows,cols); const Index common_rows = numext::mini(rows, _this.rows()); const Index common_cols = numext::mini(cols, _this.cols()); tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); @@ -957,8 +1039,9 @@ struct conservative_resize_like_impl EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived) - if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns + if ( IsRelocatable && + (( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns { const Index new_rows = other.rows() - _this.rows(); const Index new_cols = other.cols() - _this.cols(); @@ -971,7 +1054,7 @@ struct conservative_resize_like_impl else { // The storage order does not allow us to use reallocation. - typename Derived::PlainObject tmp(other); + Derived tmp(other); const Index common_rows = numext::mini(tmp.rows(), _this.rows()); const Index common_cols = numext::mini(tmp.cols(), _this.cols()); tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); @@ -986,13 +1069,18 @@ template struct conservative_resize_like_impl : conservative_resize_like_impl { - using conservative_resize_like_impl::run; - + typedef conservative_resize_like_impl Base; + using Base::run; + using Base::IsRelocatable; + static void run(DenseBase& _this, Index size) { const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size; const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1; - _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + if(IsRelocatable) + _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + else + Base::run(_this.derived(), new_rows, new_cols); } static void run(DenseBase& _this, const DenseBase& other) @@ -1003,7 +1091,10 @@ struct conservative_resize_like_impl const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1; - _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + if(IsRelocatable) + _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + else + Base::run(_this.derived(), new_rows, new_cols); if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements); @@ -1014,7 +1105,7 @@ template struct matrix_swap_impl { EIGEN_DEVICE_FUNC - static inline void run(MatrixTypeA& a, MatrixTypeB& b) + static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b) { a.base().swap(b); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Product.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Product.h index 676c480277..70a6c10639 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Product.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Product.h @@ -23,25 +23,25 @@ struct traits > typedef typename remove_all::type RhsCleaned; typedef traits LhsTraits; typedef traits RhsTraits; - + typedef MatrixXpr XprKind; - + typedef typename ScalarBinaryOpTraits::Scalar, typename traits::Scalar>::ReturnType Scalar; typedef typename product_promote_storage_type::ret>::ret StorageKind; typedef typename promote_index_type::type StorageIndex; - + enum { RowsAtCompileTime = LhsTraits::RowsAtCompileTime, ColsAtCompileTime = RhsTraits::ColsAtCompileTime, MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime, MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime, - + // FIXME: only needed by GeneralMatrixMatrixTriangular InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime), - + // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator. Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0 @@ -74,10 +74,10 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, internal::product_type<_Lhs,_Rhs>::ret>::ret> { public: - + typedef _Lhs Lhs; typedef _Rhs Rhs; - + typedef typename ProductImpl< Lhs, Rhs, Option, typename internal::product_promote_storage_type::StorageKind, @@ -90,18 +90,23 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, typedef typename internal::remove_all::type LhsNestedCleaned; typedef typename internal::remove_all::type RhsNestedCleaned; - EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) { eigen_assert(lhs.cols() == rhs.rows() && "invalid matrix product" && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } - EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } - EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const LhsNestedCleaned& lhs() const { return m_lhs; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const RhsNestedCleaned& rhs() const { return m_rhs; } protected: @@ -110,13 +115,13 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, }; namespace internal { - + template::ret> class dense_product_base : public internal::dense_xpr_base >::type {}; -/** Convertion to scalar for inner-products */ +/** Conversion to scalar for inner-products */ template class dense_product_base : public internal::dense_xpr_base >::type @@ -126,8 +131,8 @@ class dense_product_base public: using Base::derived; typedef typename Base::Scalar Scalar; - - EIGEN_STRONG_INLINE operator const Scalar() const + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } @@ -148,25 +153,25 @@ class ProductImpl : public internal::dense_product_base { typedef Product Derived; - + public: - + typedef typename internal::dense_product_base Base; EIGEN_DENSE_PUBLIC_INTERFACE(Derived) protected: enum { - IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && + IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic), EnableCoeff = IsOneByOne || Option==LazyProduct }; - + public: - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); - + return internal::evaluator(derived()).coeff(row,col); } @@ -174,11 +179,11 @@ class ProductImpl { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); - + return internal::evaluator(derived()).coeff(i); } - - + + }; } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ProductEvaluators.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ProductEvaluators.h index bce1310c96..8cf294b287 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ProductEvaluators.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ProductEvaluators.h @@ -14,27 +14,27 @@ #define EIGEN_PRODUCTEVALUATORS_H namespace Eigen { - + namespace internal { /** \internal * Evaluator of a product expression. * Since products require special treatments to handle all possible cases, - * we simply deffer the evaluation logic to a product_evaluator class + * we simply defer the evaluation logic to a product_evaluator class * which offers more partial specialization possibilities. - * + * * \sa class product_evaluator */ template -struct evaluator > +struct evaluator > : public product_evaluator > { typedef Product XprType; typedef product_evaluator Base; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {} }; - + // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" // TODO we should apply that rule only if that's really helpful template @@ -62,12 +62,12 @@ struct evaluator, template -struct evaluator, DiagIndex> > +struct evaluator, DiagIndex> > : public evaluator, DiagIndex> > { typedef Diagonal, DiagIndex> XprType; typedef evaluator, DiagIndex> > Base; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(Diagonal, DiagIndex>( Product(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), @@ -108,27 +108,27 @@ struct product_evaluator, ProductTag, LhsShape, RhsSh : m_result(xpr.rows(), xpr.cols()) { ::new (static_cast(this)) Base(m_result); - + // FIXME shall we handle nested_eval here?, // if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.) // typedef typename internal::nested_eval::type LhsNested; // typedef typename internal::nested_eval::type RhsNested; // typedef typename internal::remove_all::type LhsNestedCleaned; // typedef typename internal::remove_all::type RhsNestedCleaned; -// +// // const LhsNested lhs(xpr.lhs()); // const RhsNested rhs(xpr.rhs()); -// +// // generic_product_impl::evalTo(m_result, lhs, rhs); generic_product_impl::evalTo(m_result, xpr.lhs(), xpr.rhs()); } - -protected: + +protected: PlainObject m_result; }; -// The following three shortcuts are enabled only if the scalar types match excatly. +// The following three shortcuts are enabled only if the scalar types match exactly. // TODO: we could enable them for different scalar types when the product is not vectorized. // Dense = Product @@ -137,7 +137,7 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -155,7 +155,7 @@ struct Assignment, internal::add_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -170,7 +170,7 @@ struct Assignment, internal::sub_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -190,7 +190,7 @@ struct Assignment, const CwiseNullaryOp,Plain>, const Product > SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) { call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func); @@ -217,7 +217,7 @@ template - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) { call_assignment_no_alias(dst, src.lhs(), Func1()); @@ -246,19 +246,19 @@ template struct generic_product_impl { template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); } - + template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } - + template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -269,10 +269,10 @@ struct generic_product_impl // Column major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - typename nested_eval::type actual_lhs(lhs); + ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs); // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too const Index cols = dst.cols(); @@ -282,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const // Row major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - typename nested_eval::type actual_rhs(rhs); + ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs); // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too const Index rows = dst.rows(); @@ -298,43 +298,43 @@ struct generic_product_impl { template struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; typedef typename Product::Scalar Scalar; - + // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose - struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; - struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; - struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; + struct set { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; + struct add { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; + struct sub { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; struct adds { Scalar m_scale; explicit adds(const Scalar& s) : m_scale(s) {} - template void operator()(const Dst& dst, const Src& src) const { + template void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += m_scale * src; } }; - + template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } - + template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); } - + template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); } - + template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); } - + }; @@ -343,21 +343,21 @@ template struct generic_product_impl_base { typedef typename Product::Scalar Scalar; - + template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } - + template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); } }; @@ -373,8 +373,13 @@ struct generic_product_impl typedef typename internal::remove_all::type>::type MatrixType; template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { + // Fallback to inner product if both the lhs and rhs is a runtime vector. + if (lhs.rows() == 1 && rhs.cols() == 1) { + dst.coeffRef(0,0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0)); + return; + } LhsNested actual_lhs(lhs); RhsNested actual_rhs(rhs); internal::gemv_dense_selector }; template -struct generic_product_impl +struct generic_product_impl { typedef typename Product::Scalar Scalar; - + template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); // but easier on the compiler side @@ -398,48 +403,71 @@ struct generic_product_impl } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() += lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op()); } - + template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } - // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: - // dst {,+,-}= s * (A.lazyProduct(B)) - // This is a huge benefit for heap-allocated matrix types as it save one costly allocation. - // For them, this strategy is also faster than simply by-passing the heap allocation through - // stack allocation. - // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, - // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only, - // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h - template + // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance: + // dst {,+,-}= (s1*A)*(B*s2) + // will be rewritten as: + // dst {,+,-}= (s1*s2) * (A.lazyProduct(B)) + // There are at least four benefits of doing so: + // 1 - huge performance gain for heap-allocated matrix types as it save costly allocations. + // 2 - it is faster than simply by-passing the heap allocation through stack allocation. + // 3 - it makes this fallback consistent with the heavy GEMM routine. + // 4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices. + // (see https://stackoverflow.com/questions/54738495) + // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently + // enabled only when falling back from the main GEMM. + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic(Dst& dst, const CwiseBinaryOp, - const CwiseNullaryOp, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) + void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func) { - call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); + enum { + HasScalarFactor = blas_traits::HasScalarFactor || blas_traits::HasScalarFactor, + ConjLhs = blas_traits::NeedToConjugate, + ConjRhs = blas_traits::NeedToConjugate + }; + // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto + // this is important for real*complex_mat + Scalar actualAlpha = combine_scalar_factors(lhs, rhs); + + eval_dynamic_impl(dst, + blas_traits::extract(lhs).template conjugateIf(), + blas_traits::extract(rhs).template conjugateIf(), + func, + actualAlpha, + typename conditional::type()); } - // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above - // overload more specialized. - template +protected: + + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s /* == 1 */, false_type) { - call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + EIGEN_UNUSED_VARIABLE(s); + eigen_internal_assert(s==Scalar(1)); + call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type) + { + call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func); } - - -// template -// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) -// { dst.noalias() += alpha * lhs.lazyProduct(rhs); } }; // This specialization enforces the use of a coefficient-based evaluation strategy @@ -497,7 +525,7 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename internal::nested_eval::type LhsNested; typedef typename internal::nested_eval::type RhsNested; - + typedef typename internal::remove_all::type LhsNestedCleaned; typedef typename internal::remove_all::type RhsNestedCleaned; @@ -516,19 +544,19 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename find_best_packet::type RhsVecPacketType; enum { - + LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, CoeffReadCost = InnerSize==0 ? NumTraits::ReadCost : InnerSize == Dynamic ? HugeCost - : InnerSize * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + : InnerSize * (NumTraits::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost)) + (InnerSize - 1) * NumTraits::AddCost, Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, - + LhsFlags = LhsEtorType::Flags, RhsFlags = RhsEtorType::Flags, - + LhsRowMajor = LhsFlags & RowMajorBit, RhsRowMajor = RhsFlags & RowMajorBit, @@ -538,7 +566,7 @@ struct product_evaluator, ProductTag, DenseShape, // Here, we don't care about alignment larger than the usable packet size. LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), - + SameType = is_same::value, CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1), @@ -548,12 +576,12 @@ struct product_evaluator, ProductTag, DenseShape, : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 : (bool(RhsRowMajor) && !CanVectorizeLhs), - Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) + Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit) | (EvalToRowMajor ? RowMajorBit : 0) // TODO enable vectorization for mixed types | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0), - + LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)), RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)), @@ -569,10 +597,10 @@ struct product_evaluator, ProductTag, DenseShape, CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) - && (LhsFlags & RhsFlags & ActualPacketAccessBit) - && (InnerSize % packet_traits::size == 0) + && (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit) + && (int(InnerSize) % packet_traits::size == 0) }; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const { return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum(); @@ -582,7 +610,8 @@ struct product_evaluator, ProductTag, DenseShape, * which is why we don't set the LinearAccessBit. * TODO: this seems possible when the result is a vector */ - EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const CoeffReturnType coeff(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0; @@ -590,6 +619,7 @@ struct product_evaluator, ProductTag, DenseShape, } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index row, Index col) const { PacketType res; @@ -601,6 +631,7 @@ struct product_evaluator, ProductTag, DenseShape, } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; @@ -611,7 +642,7 @@ struct product_evaluator, ProductTag, DenseShape, protected: typename internal::add_const_on_value_type::type m_lhs; typename internal::add_const_on_value_type::type m_rhs; - + LhsEtorType m_lhsImpl; RhsEtorType m_rhsImpl; @@ -629,7 +660,8 @@ struct product_evaluator, LazyCoeffBasedProduc enum { Flags = Base::Flags | EvalBeforeNestingBit }; - EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) : Base(BaseProduct(xpr.lhs(),xpr.rhs())) {} }; @@ -641,7 +673,7 @@ struct product_evaluator, LazyCoeffBasedProduc template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl::run(row, col, lhs, rhs, innerDim, res); res = pmadd(pset1(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet(Index(UnrollingIndex-1), col), res); @@ -651,7 +683,7 @@ struct etor_product_packet_impl struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl::run(row, col, lhs, rhs, innerDim, res); res = pmadd(lhs.template packet(row, Index(UnrollingIndex-1)), pset1(rhs.coeff(Index(UnrollingIndex-1), col)), res); @@ -661,7 +693,7 @@ struct etor_product_packet_impl struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) { res = pmul(pset1(lhs.coeff(row, Index(0))),rhs.template packet(Index(0), col)); } @@ -670,7 +702,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) { res = pmul(lhs.template packet(row, Index(0)), pset1(rhs.coeff(Index(0), col))); } @@ -679,7 +711,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) { res = pset1(typename unpacket_traits::type(0)); } @@ -688,7 +720,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) { res = pset1(typename unpacket_traits::type(0)); } @@ -697,7 +729,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { res = pset1(typename unpacket_traits::type(0)); for(Index i = 0; i < innerDim; ++i) @@ -708,7 +740,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { res = pset1(typename unpacket_traits::type(0)); for(Index i = 0; i < innerDim; ++i) @@ -730,7 +762,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -744,7 +776,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -765,9 +797,10 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template - static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { selfadjoint_product_impl::run(dst, lhs.nestedExpression(), rhs, alpha); } @@ -778,7 +811,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -790,7 +823,7 @@ struct generic_product_impl /*************************************************************************** * Diagonal products ***************************************************************************/ - + template struct diagonal_product_evaluator_base : evaluator_base @@ -798,17 +831,25 @@ struct diagonal_product_evaluator_base typedef typename ScalarBinaryOpTraits::ReturnType Scalar; public: enum { - CoeffReadCost = NumTraits::MulCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost, - + CoeffReadCost = int(NumTraits::MulCost) + int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost), + MatrixFlags = evaluator::Flags, DiagFlags = evaluator::Flags, - _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + + _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor + : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor + : MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor), + _ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft) ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)), _SameTypes = is_same::value, // FIXME currently we need same types, but in the future the next rule should be the one //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))), - _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), + _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) + && _SameTypes + && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit) + && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), Alignment = evaluator::Alignment, @@ -817,14 +858,14 @@ public: || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft) || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight) }; - - diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) + + EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) : m_diagImpl(diag), m_matImpl(mat) { EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::MulCost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const { if(AsScalarProduct) @@ -832,7 +873,7 @@ public: else return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); } - + protected: template EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const @@ -840,7 +881,7 @@ protected: return internal::pmul(m_matImpl.template packet(row, col), internal::pset1(m_diagImpl.coeff(id))); } - + template EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const { @@ -851,7 +892,7 @@ protected: return internal::pmul(m_matImpl.template packet(row, col), m_diagImpl.template packet(id)); } - + evaluator m_diagImpl; evaluator m_matImpl; }; @@ -866,25 +907,25 @@ struct product_evaluator, ProductTag, DiagonalSha using Base::m_matImpl; using Base::coeff; typedef typename Base::Scalar Scalar; - + typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - - enum { - StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor - }; + typedef typename Lhs::DiagonalVectorType DiagonalType; + + + enum { StorageOrder = Base::_StorageOrder }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const { return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } - -#ifndef __CUDACC__ + +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -893,7 +934,7 @@ struct product_evaluator, ProductTag, DiagonalSha return this->template packet_impl(row,col, row, typename internal::conditional::type()); } - + template EIGEN_STRONG_INLINE PacketType packet(Index idx) const { @@ -912,30 +953,30 @@ struct product_evaluator, ProductTag, DenseShape, using Base::m_matImpl; using Base::coeff; typedef typename Base::Scalar Scalar; - + typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - - enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor }; + + enum { StorageOrder = Base::_StorageOrder }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const { return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } - -#ifndef __CUDACC__ + +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return this->template packet_impl(row,col, col, typename internal::conditional::type()); } - + template EIGEN_STRONG_INLINE PacketType packet(Index idx) const { @@ -963,7 +1004,7 @@ struct permutation_matrix_product typedef typename remove_all::type MatrixTypeCleaned; template - static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { MatrixType mat(xpr); const Index n = Side==OnTheLeft ? mat.rows() : mat.cols(); @@ -1017,7 +1058,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, lhs, rhs); } @@ -1027,7 +1068,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, rhs, lhs); } @@ -1037,7 +1078,7 @@ template struct generic_product_impl, Rhs, PermutationShape, MatrixShape, ProductTag> { template - static void evalTo(Dest& dst, const Inverse& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, lhs.nestedExpression(), rhs); } @@ -1047,7 +1088,7 @@ template struct generic_product_impl, MatrixShape, PermutationShape, ProductTag> { template - static void evalTo(Dest& dst, const Lhs& lhs, const Inverse& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse& rhs) { permutation_matrix_product::run(dst, rhs.nestedExpression(), lhs); } @@ -1069,9 +1110,9 @@ struct transposition_matrix_product { typedef typename nested_eval::type MatrixType; typedef typename remove_all::type MatrixTypeCleaned; - + template - static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr) { MatrixType mat(xpr); typedef typename TranspositionType::StorageIndex StorageIndex; @@ -1094,7 +1135,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, lhs, rhs); } @@ -1104,7 +1145,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, rhs, lhs); } @@ -1115,7 +1156,7 @@ template struct generic_product_impl, Rhs, TranspositionsShape, MatrixShape, ProductTag> { template - static void evalTo(Dest& dst, const Transpose& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, lhs.nestedExpression(), rhs); } @@ -1125,7 +1166,7 @@ template struct generic_product_impl, MatrixShape, TranspositionsShape, ProductTag> { template - static void evalTo(Dest& dst, const Lhs& lhs, const Transpose& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose& rhs) { transposition_matrix_product::run(dst, rhs.nestedExpression(), lhs); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Random.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Random.h index 6faf789c76..dab2ac8e9e 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Random.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Random.h @@ -128,7 +128,7 @@ DenseBase::Random() * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) */ template -inline Derived& DenseBase::setRandom() +EIGEN_DEVICE_FUNC inline Derived& DenseBase::setRandom() { return *this = Random(rows(), cols()); } @@ -177,6 +177,42 @@ PlainObjectBase::setRandom(Index rows, Index cols) return setRandom(); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to random values. For the parameter of type + * NoChange_t, just pass the special value \c NoChange. + * + * Numbers are uniformly spread through their whole definition range for integer types, + * and in the [-1:1] range for floating point scalar types. + * + * \not_reentrant + * + * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random() + */ +template +EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setRandom(NoChange_t, Index cols) +{ + return setRandom(rows(), cols); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to random values. For the parameter of type + * NoChange_t, just pass the special value \c NoChange. + * + * Numbers are uniformly spread through their whole definition range for integer types, + * and in the [-1:1] range for floating point scalar types. + * + * \not_reentrant + * + * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random() + */ +template +EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setRandom(Index rows, NoChange_t) +{ + return setRandom(rows, cols()); +} + } // end namespace Eigen #endif // EIGEN_RANDOM_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Redux.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Redux.h index 760e9f8615..b6790d1105 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Redux.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Redux.h @@ -23,23 +23,29 @@ namespace internal { * Part 1 : the logic deciding a strategy for vectorization and unrolling ***************************************************************************/ -template +template struct redux_traits { public: - typedef typename find_best_packet::type PacketType; + typedef typename find_best_packet::type PacketType; enum { PacketSize = unpacket_traits::size, - InnerMaxSize = int(Derived::IsRowMajor) - ? Derived::MaxColsAtCompileTime - : Derived::MaxRowsAtCompileTime + InnerMaxSize = int(Evaluator::IsRowMajor) + ? Evaluator::MaxColsAtCompileTime + : Evaluator::MaxRowsAtCompileTime, + OuterMaxSize = int(Evaluator::IsRowMajor) + ? Evaluator::MaxRowsAtCompileTime + : Evaluator::MaxColsAtCompileTime, + SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic + : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0) + : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize) }; enum { - MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) + MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit) && (functor_traits::PacketAccess), - MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), - MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize + MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit), + MaySliceVectorize = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3) }; public: @@ -51,8 +57,8 @@ public: public: enum { - Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost - : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, + Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost + : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits::Cost, UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize)) }; @@ -64,18 +70,20 @@ public: #ifdef EIGEN_DEBUG_ASSIGN static void debug() { - std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; + std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl; std::cerr.setf(std::ios::hex, std::ios::basefield); - EIGEN_DEBUG_VAR(Derived::Flags) + EIGEN_DEBUG_VAR(Evaluator::Flags) std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(InnerMaxSize) + EIGEN_DEBUG_VAR(OuterMaxSize) + EIGEN_DEBUG_VAR(SliceVectorizedWork) EIGEN_DEBUG_VAR(PacketSize) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearVectorize) EIGEN_DEBUG_VAR(MaySliceVectorize) - EIGEN_DEBUG_VAR(Traversal) + std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(UnrollingLimit) - EIGEN_DEBUG_VAR(Unrolling) + std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl; std::cerr << std::endl; } #endif @@ -87,88 +95,86 @@ public: /*** no vectorization ***/ -template +template struct redux_novec_unroller { enum { HalfLength = Length/2 }; - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func) { - return func(redux_novec_unroller::run(mat,func), - redux_novec_unroller::run(mat,func)); + return func(redux_novec_unroller::run(eval,func), + redux_novec_unroller::run(eval,func)); } }; -template -struct redux_novec_unroller +template +struct redux_novec_unroller { enum { - outer = Start / Derived::InnerSizeAtCompileTime, - inner = Start % Derived::InnerSizeAtCompileTime + outer = Start / Evaluator::InnerSizeAtCompileTime, + inner = Start % Evaluator::InnerSizeAtCompileTime }; - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) + static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&) { - return mat.coeffByOuterInner(outer, inner); + return eval.coeffByOuterInner(outer, inner); } }; // This is actually dead code and will never be called. It is required // to prevent false warnings regarding failed inlining though // for 0 length run() will never be called at all. -template -struct redux_novec_unroller +template +struct redux_novec_unroller { - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } + static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); } }; /*** vectorization ***/ -template +template struct redux_vec_unroller { - enum { - PacketSize = redux_traits::PacketSize, - HalfLength = Length/2 - }; - - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func) { + enum { + PacketSize = unpacket_traits::size, + HalfLength = Length/2 + }; + return func.packetOp( - redux_vec_unroller::run(mat,func), - redux_vec_unroller::run(mat,func) ); + redux_vec_unroller::template run(eval,func), + redux_vec_unroller::template run(eval,func) ); } }; -template -struct redux_vec_unroller +template +struct redux_vec_unroller { - enum { - index = Start * redux_traits::PacketSize, - outer = index / int(Derived::InnerSizeAtCompileTime), - inner = index % int(Derived::InnerSizeAtCompileTime), - alignment = Derived::Alignment - }; - - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&) { - return mat.template packetByOuterInner(outer, inner); + enum { + PacketSize = unpacket_traits::size, + index = Start * PacketSize, + outer = index / int(Evaluator::InnerSizeAtCompileTime), + inner = index % int(Evaluator::InnerSizeAtCompileTime), + alignment = Evaluator::Alignment + }; + return eval.template packetByOuterInner(outer, inner); } }; @@ -176,53 +182,65 @@ struct redux_vec_unroller * Part 3 : implementation of all cases ***************************************************************************/ -template::Traversal, - int Unrolling = redux_traits::Unrolling +template::Traversal, + int Unrolling = redux_traits::Unrolling > struct redux_impl; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + typedef typename Evaluator::Scalar Scalar; + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); Scalar res; - res = mat.coeffByOuterInner(0, 0); - for(Index i = 1; i < mat.innerSize(); ++i) - res = func(res, mat.coeffByOuterInner(0, i)); - for(Index i = 1; i < mat.outerSize(); ++i) - for(Index j = 0; j < mat.innerSize(); ++j) - res = func(res, mat.coeffByOuterInner(i, j)); + res = eval.coeffByOuterInner(0, 0); + for(Index i = 1; i < xpr.innerSize(); ++i) + res = func(res, eval.coeffByOuterInner(0, i)); + for(Index i = 1; i < xpr.outerSize(); ++i) + for(Index j = 0; j < xpr.innerSize(); ++j) + res = func(res, eval.coeffByOuterInner(i, j)); return res; } }; -template -struct redux_impl - : public redux_novec_unroller -{}; - -template -struct redux_impl +template +struct redux_impl + : redux_novec_unroller { - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static Scalar run(const Derived &mat, const Func& func) + typedef redux_novec_unroller Base; + typedef typename Evaluator::Scalar Scalar; + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/) { - const Index size = mat.size(); + return Base::run(eval,func); + } +}; + +template +struct redux_impl +{ + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + template + static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) + { + const Index size = xpr.size(); - const Index packetSize = redux_traits::PacketSize; + const Index packetSize = redux_traits::PacketSize; const int packetAlignment = unpacket_traits::alignment; enum { - alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), - alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) + alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), + alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment) }; - const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); + const Index alignedStart = internal::first_default_aligned(xpr); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); const Index alignedEnd2 = alignedStart + alignedSize2; @@ -230,34 +248,34 @@ struct redux_impl Scalar res; if(alignedSize) { - PacketScalar packet_res0 = mat.template packet(alignedStart); + PacketScalar packet_res0 = eval.template packet(alignedStart); if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop { - PacketScalar packet_res1 = mat.template packet(alignedStart+packetSize); + PacketScalar packet_res1 = eval.template packet(alignedStart+packetSize); for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) { - packet_res0 = func.packetOp(packet_res0, mat.template packet(index)); - packet_res1 = func.packetOp(packet_res1, mat.template packet(index+packetSize)); + packet_res0 = func.packetOp(packet_res0, eval.template packet(index)); + packet_res1 = func.packetOp(packet_res1, eval.template packet(index+packetSize)); } packet_res0 = func.packetOp(packet_res0,packet_res1); if(alignedEnd>alignedEnd2) - packet_res0 = func.packetOp(packet_res0, mat.template packet(alignedEnd2)); + packet_res0 = func.packetOp(packet_res0, eval.template packet(alignedEnd2)); } res = func.predux(packet_res0); for(Index index = 0; index < alignedStart; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); for(Index index = alignedEnd; index < size; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); } else // too small to vectorize anything. // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. { - res = mat.coeff(0); + res = eval.coeff(0); for(Index index = 1; index < size; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); } return res; @@ -265,130 +283,108 @@ struct redux_impl }; // NOTE: for SliceVectorizedTraversal we simply bypass unrolling -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketType; + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketType; - EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) + template + EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - const Index innerSize = mat.innerSize(); - const Index outerSize = mat.outerSize(); + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); + const Index innerSize = xpr.innerSize(); + const Index outerSize = xpr.outerSize(); enum { - packetSize = redux_traits::PacketSize + packetSize = redux_traits::PacketSize }; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; if(packetedInnerSize) { - PacketType packet_res = mat.template packet(0,0); + PacketType packet_res = eval.template packet(0,0); for(Index j=0; j(j,i)); + packet_res = func.packetOp(packet_res, eval.template packetByOuterInner(j,i)); res = func.predux(packet_res); for(Index j=0; j::run(mat, func); + res = redux_impl::run(eval, func, xpr); } return res; } }; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename redux_traits::PacketType PacketType; enum { - PacketSize = redux_traits::PacketSize, - Size = Derived::SizeAtCompileTime, - VectorizedSize = (Size / PacketSize) * PacketSize + PacketSize = redux_traits::PacketSize, + Size = Evaluator::SizeAtCompileTime, + VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize) }; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + EIGEN_ONLY_USED_FOR_DEBUG(xpr) + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); if (VectorizedSize > 0) { - Scalar res = func.predux(redux_vec_unroller::run(mat,func)); + Scalar res = func.predux(redux_vec_unroller::template run(eval,func)); if (VectorizedSize != Size) - res = func(res,redux_novec_unroller::run(mat,func)); + res = func(res,redux_novec_unroller::run(eval,func)); return res; } else { - return redux_novec_unroller::run(mat,func); + return redux_novec_unroller::run(eval,func); } } }; // evaluator adaptor template -class redux_evaluator +class redux_evaluator : public internal::evaluator<_XprType> { + typedef internal::evaluator<_XprType> Base; public: typedef _XprType XprType; - EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit redux_evaluator(const XprType &xpr) : Base(xpr) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; - typedef typename XprType::PacketReturnType PacketReturnType; enum { MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime, MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator - Flags = evaluator::Flags & ~DirectAccessBit, + Flags = Base::Flags & ~DirectAccessBit, IsRowMajor = XprType::IsRowMajor, SizeAtCompileTime = XprType::SizeAtCompileTime, - InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, - CoeffReadCost = evaluator::CoeffReadCost, - Alignment = evaluator::Alignment + InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime }; - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } - EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); } - EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index row, Index col) const - { return m_evaluator.coeff(row, col); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index index) const - { return m_evaluator.coeff(index); } - - template - PacketType packet(Index row, Index col) const - { return m_evaluator.template packet(row, col); } - - template - PacketType packet(Index index) const - { return m_evaluator.template packet(index); } - - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const - { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const - { return m_evaluator.template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return Base::template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } - const XprType & nestedExpression() const { return m_xpr; } - -protected: - internal::evaluator m_evaluator; - const XprType &m_xpr; }; } // end namespace internal @@ -403,39 +399,53 @@ protected: * The template parameter \a BinaryOp is the type of the functor \a func which must be * an associative operator. Both current C++98 and C++11 functor styles are handled. * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise() */ template template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); typedef typename internal::redux_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - - return internal::redux_impl::run(thisEval, func); + + // The initial expression is passed to the reducer as an additional argument instead of + // passing it as a member of redux_evaluator to help + return internal::redux_impl::run(thisEval, func, derived()); } /** \returns the minimum of all coefficients of \c *this. - * \warning the result is undefined if \c *this contains NaN. + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::minCoeff() const { - return derived().redux(Eigen::internal::scalar_min_op()); + return derived().redux(Eigen::internal::scalar_min_op()); } -/** \returns the maximum of all coefficients of \c *this. - * \warning the result is undefined if \c *this contains NaN. +/** \returns the maximum of all coefficients of \c *this. + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::maxCoeff() const { - return derived().redux(Eigen::internal::scalar_max_op()); + return derived().redux(Eigen::internal::scalar_max_op()); } /** \returns the sum of all coefficients of \c *this @@ -445,7 +455,7 @@ DenseBase::maxCoeff() const * \sa trace(), prod(), mean() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::sum() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -458,7 +468,7 @@ DenseBase::sum() const * \sa trace(), prod(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::mean() const { #ifdef __INTEL_COMPILER @@ -479,7 +489,7 @@ DenseBase::mean() const * \sa sum(), mean(), trace() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::prod() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -494,7 +504,7 @@ DenseBase::prod() const * \sa diagonal(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar MatrixBase::trace() const { return derived().diagonal().sum(); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Ref.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Ref.h index 17a1496b84..c2a37eadbb 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Ref.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Ref.h @@ -10,7 +10,7 @@ #ifndef EIGEN_REF_H #define EIGEN_REF_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,7 +48,7 @@ struct traits > }; typedef typename internal::conditional::type type; }; - + }; template @@ -67,12 +67,12 @@ public: typedef MapBase Base; EIGEN_DENSE_PUBLIC_INTERFACE(RefBase) - EIGEN_DEVICE_FUNC inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() : IsVectorAtCompileTime ? this->size() @@ -86,36 +86,122 @@ public: m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime, StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime) {} - + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase) protected: typedef Stride StrideBase; - template - EIGEN_DEVICE_FUNC void construct(Expression& expr) - { - EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression); + // Resolves inner stride if default 0. + static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) { + return inner == 0 ? 1 : inner; + } + // Resolves outer stride if default 0. + static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) { + return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer; + } + + // Returns true if construction is valid, false if there is a stride mismatch, + // and fails if there is a size mismatch. + template + EIGEN_DEVICE_FUNC bool construct(Expression& expr) + { + // Check matrix sizes. If this is a compile-time vector, we do allow + // implicitly transposing. + EIGEN_STATIC_ASSERT( + EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression) + // If it is a vector, the transpose sizes might match. + || ( PlainObjectType::IsVectorAtCompileTime + && ((int(PlainObjectType::RowsAtCompileTime)==Eigen::Dynamic + || int(Expression::ColsAtCompileTime)==Eigen::Dynamic + || int(PlainObjectType::RowsAtCompileTime)==int(Expression::ColsAtCompileTime)) + && (int(PlainObjectType::ColsAtCompileTime)==Eigen::Dynamic + || int(Expression::RowsAtCompileTime)==Eigen::Dynamic + || int(PlainObjectType::ColsAtCompileTime)==int(Expression::RowsAtCompileTime)))), + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES + ) + + // Determine runtime rows and columns. + Index rows = expr.rows(); + Index cols = expr.cols(); if(PlainObjectType::RowsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); - ::new (static_cast(this)) Base(expr.data(), 1, expr.size()); + rows = 1; + cols = expr.size(); } else if(PlainObjectType::ColsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); - ::new (static_cast(this)) Base(expr.data(), expr.size(), 1); + rows = expr.size(); + cols = 1; } - else - ::new (static_cast(this)) Base(expr.data(), expr.rows(), expr.cols()); - - if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit))) - ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1); - else - ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), - StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); + // Verify that the sizes are valid. + eigen_assert( + (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows)); + eigen_assert( + (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols)); + + + // If this is a vector, we might be transposing, which means that stride should swap. + const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows()); + // If the storage format differs, we also need to swap the stride. + const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0; + const bool expr_row_major = (Expression::Flags&RowMajorBit) != 0; + const bool storage_differs = (row_major != expr_row_major); + + const bool swap_stride = (transpose != storage_differs); + + // Determine expr's actual strides, resolving any defaults if zero. + const Index expr_inner_actual = resolveInnerStride(expr.innerStride()); + const Index expr_outer_actual = resolveOuterStride(expr_inner_actual, + expr.outerStride(), + expr.rows(), + expr.cols(), + Expression::IsVectorAtCompileTime != 0, + expr_row_major); + + // If this is a column-major row vector or row-major column vector, the inner-stride + // is arbitrary, so set it to either the compile-time inner stride or 1. + const bool row_vector = (rows == 1); + const bool col_vector = (cols == 1); + const Index inner_stride = + ( (!row_major && row_vector) || (row_major && col_vector) ) ? + ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1) + : swap_stride ? expr_outer_actual : expr_inner_actual; + + // If this is a column-major column vector or row-major row vector, the outer-stride + // is arbitrary, so set it to either the compile-time outer stride or vector size. + const Index outer_stride = + ( (!row_major && col_vector) || (row_major && row_vector) ) ? + ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride) + : swap_stride ? expr_inner_actual : expr_outer_actual; + + // Check if given inner/outer strides are compatible with compile-time strides. + const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic) + || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride); + if (!inner_valid) { + return false; + } + + const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic) + || (resolveOuterStride( + inner_stride, + Index(StrideType::OuterStrideAtCompileTime), + rows, cols, PlainObjectType::IsVectorAtCompileTime != 0, + row_major) + == outer_stride); + if (!outer_valid) { + return false; + } + + ::new (static_cast(this)) Base(expr.data(), rows, cols); + ::new (&m_stride) StrideBase( + (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride, + (StrideType::InnerStrideAtCompileTime == 0) ? 0 : inner_stride ); + return true; } StrideBase m_stride; @@ -187,6 +273,8 @@ protected: * void foo(const Ref >& A) { foo_impl(A); } * \endcode * + * See also the following stackoverflow questions for further references: + * - Correct usage of the Eigen::Ref<> class * * \sa PlainObjectBase::Map(), \ref TopicStorageOrders */ @@ -210,7 +298,10 @@ template class Ref typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) { EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); - Base::construct(expr.derived()); + // Construction must pass since we will not create temprary storage in the non-const case. + const bool success = Base::construct(expr.derived()); + EIGEN_UNUSED_VARIABLE(success) + eigen_assert(success); } template EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, @@ -224,7 +315,10 @@ template class Ref EIGEN_STATIC_ASSERT(bool(internal::is_lvalue::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); - Base::construct(expr.const_cast_derived()); + // Construction must pass since we will not create temporary storage in the non-const case. + const bool success = Base::construct(expr.const_cast_derived()); + EIGEN_UNUSED_VARIABLE(success) + eigen_assert(success); } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref) @@ -265,7 +359,10 @@ template class Ref< template EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type) { - Base::construct(expr); + // Check if we can use the underlying expr's storage directly, otherwise call the copy version. + if (!Base::construct(expr)) { + construct(expr, internal::false_type()); + } } template diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Replicate.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Replicate.h index 9960ef884e..ab5be7e64b 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Replicate.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Replicate.h @@ -10,7 +10,7 @@ #ifndef EIGEN_REPLICATE_H #define EIGEN_REPLICATE_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -35,7 +35,7 @@ struct traits > IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1 : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0 : (MatrixType::Flags & RowMajorBit) ? 1 : 0, - + // FIXME enable DirectAccess with negative strides? Flags = IsRowMajor ? RowMajorBit : 0 }; @@ -88,15 +88,15 @@ template class Replicate THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE) } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); } EIGEN_DEVICE_FUNC const _MatrixTypeNested& nestedExpression() const - { - return m_matrix; + { + return m_matrix; } protected: @@ -115,7 +115,7 @@ template class Replicate */ template template -const Replicate +EIGEN_DEVICE_FUNC const Replicate DenseBase::replicate() const { return Replicate(derived()); @@ -130,7 +130,7 @@ DenseBase::replicate() const * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate */ template -const typename VectorwiseOp::ReplicateReturnType +EIGEN_DEVICE_FUNC const typename VectorwiseOp::ReplicateReturnType VectorwiseOp::replicate(Index factor) const { return typename VectorwiseOp::ReplicateReturnType diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Reshaped.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Reshaped.h new file mode 100644 index 0000000000..52de73b6fc --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Reshaped.h @@ -0,0 +1,454 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2017 Gael Guennebaud +// Copyright (C) 2014 yoco +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_RESHAPED_H +#define EIGEN_RESHAPED_H + +namespace Eigen { + +/** \class Reshaped + * \ingroup Core_Module + * + * \brief Expression of a fixed-size or dynamic-size reshape + * + * \tparam XprType the type of the expression in which we are taking a reshape + * \tparam Rows the number of rows of the reshape we are taking at compile time (optional) + * \tparam Cols the number of columns of the reshape we are taking at compile time (optional) + * \tparam Order can be ColMajor or RowMajor, default is ColMajor. + * + * This class represents an expression of either a fixed-size or dynamic-size reshape. + * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and + * most of the time this is the only way it is used. + * + * However, in C++98, if you want to directly maniputate reshaped expressions, + * for instance if you want to write a function returning such an expression, you + * will need to use this class. In C++11, it is advised to use the \em auto + * keyword for such use cases. + * + * Here is an example illustrating the dynamic case: + * \include class_Reshaped.cpp + * Output: \verbinclude class_Reshaped.out + * + * Here is an example illustrating the fixed-size case: + * \include class_FixedReshaped.cpp + * Output: \verbinclude class_FixedReshaped.out + * + * \sa DenseBase::reshaped(NRowsType,NColsType) + */ + +namespace internal { + +template +struct traits > : traits +{ + typedef typename traits::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::XprKind XprKind; + enum{ + MatrixRows = traits::RowsAtCompileTime, + MatrixCols = traits::ColsAtCompileTime, + RowsAtCompileTime = Rows, + ColsAtCompileTime = Cols, + MaxRowsAtCompileTime = Rows, + MaxColsAtCompileTime = Cols, + XpxStorageOrder = ((int(traits::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor, + ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? RowMajor + : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor + : XpxStorageOrder, + HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder), + InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime), + InnerStrideAtCompileTime = HasSameStorageOrderAsXprType + ? int(inner_stride_at_compile_time::ret) + : Dynamic, + OuterStrideAtCompileTime = Dynamic, + + HasDirectAccess = internal::has_direct_access::ret + && (Order==int(XpxStorageOrder)) + && ((evaluator::Flags&LinearAccessBit)==LinearAccessBit), + + MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits::size) == 0) + && (InnerStrideAtCompileTime == 1) + ? PacketAccessBit : 0, + //MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, + FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, + FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0, + FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0, + Flags0 = traits::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit), + + Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit) + }; +}; + +template class ReshapedImpl_dense; + +} // end namespace internal + +template class ReshapedImpl; + +template class Reshaped + : public ReshapedImpl::StorageKind> +{ + typedef ReshapedImpl::StorageKind> Impl; + public: + //typedef typename Impl::Base Base; + typedef Impl Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped) + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline Reshaped(XprType& xpr) + : Impl(xpr) + { + EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) + eigen_assert(Rows * Cols == xpr.rows() * xpr.cols()); + } + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline Reshaped(XprType& xpr, + Index reshapeRows, Index reshapeCols) + : Impl(xpr, reshapeRows, reshapeCols) + { + eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows) + && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols)); + eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols()); + } +}; + +// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense +// that must be specialized for direct and non-direct access... +template +class ReshapedImpl + : public internal::ReshapedImpl_dense >::HasDirectAccess> +{ + typedef internal::ReshapedImpl_dense >::HasDirectAccess> Impl; + public: + typedef Impl Base; + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl) + EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {} + EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols) + : Impl(xpr, reshapeRows, reshapeCols) {} +}; + +namespace internal { + +/** \internal Internal implementation of dense Reshaped in the general case. */ +template +class ReshapedImpl_dense + : public internal::dense_xpr_base >::type +{ + typedef Reshaped ReshapedType; + public: + + typedef typename internal::dense_xpr_base::type Base; + EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) + + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename internal::remove_all::type NestedExpression; + + class InnerIterator; + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr) + : m_xpr(xpr), m_rows(Rows), m_cols(Cols) + {} + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols) + : m_xpr(xpr), m_rows(nRows), m_cols(nCols) + {} + + EIGEN_DEVICE_FUNC Index rows() const { return m_rows; } + EIGEN_DEVICE_FUNC Index cols() const { return m_cols; } + + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** \sa MapBase::data() */ + EIGEN_DEVICE_FUNC inline const Scalar* data() const; + EIGEN_DEVICE_FUNC inline Index innerStride() const; + EIGEN_DEVICE_FUNC inline Index outerStride() const; + #endif + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + typename internal::remove_reference::type& + nestedExpression() { return m_xpr; } + + protected: + + MatrixTypeNested m_xpr; + const internal::variable_if_dynamic m_rows; + const internal::variable_if_dynamic m_cols; +}; + + +/** \internal Internal implementation of dense Reshaped in the direct access case. */ +template +class ReshapedImpl_dense + : public MapBase > +{ + typedef Reshaped ReshapedType; + typedef typename internal::ref_selector::non_const_type XprTypeNested; + public: + + typedef MapBase Base; + EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr) + : Base(xpr.data()), m_xpr(xpr) + {} + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols) + : Base(xpr.data(), nRows, nCols), + m_xpr(xpr) + {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& nestedExpression() const + { + return m_xpr; + } + + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } + + /** \sa MapBase::innerStride() */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const + { + return m_xpr.innerStride(); + } + + /** \sa MapBase::outerStride() */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const + { + return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows(); + } + + protected: + + XprTypeNested m_xpr; +}; + +// Evaluators +template struct reshaped_evaluator; + +template +struct evaluator > + : reshaped_evaluator >::HasDirectAccess> +{ + typedef Reshaped XprType; + typedef typename XprType::Scalar Scalar; + // TODO: should check for smaller packet types + typedef typename packet_traits::type PacketScalar; + + enum { + CoeffReadCost = evaluator::CoeffReadCost, + HasDirectAccess = traits::HasDirectAccess, + +// RowsAtCompileTime = traits::RowsAtCompileTime, +// ColsAtCompileTime = traits::ColsAtCompileTime, +// MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, +// MaxColsAtCompileTime = traits::MaxColsAtCompileTime, +// +// InnerStrideAtCompileTime = traits::HasSameStorageOrderAsXprType +// ? int(inner_stride_at_compile_time::ret) +// : Dynamic, +// OuterStrideAtCompileTime = Dynamic, + + FlagsLinearAccessBit = (traits::RowsAtCompileTime == 1 || traits::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0, + FlagsRowMajorBit = (traits::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0, + FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0, + Flags0 = evaluator::Flags & (HereditaryBits & ~RowMajorBit), + Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit, + + PacketAlignment = unpacket_traits::alignment, + Alignment = evaluator::Alignment + }; + typedef reshaped_evaluator reshaped_evaluator_type; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } +}; + +template +struct reshaped_evaluator + : evaluator_base > +{ + typedef Reshaped XprType; + + enum { + CoeffReadCost = evaluator::CoeffReadCost /* TODO + cost of index computations */, + + Flags = (evaluator::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)), + + Alignment = 0 + }; + + EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef std::pair RowCol; + + inline RowCol index_remap(Index rowId, Index colId) const + { + if(Order==ColMajor) + { + const Index nth_elem_idx = colId * m_xpr.rows() + rowId; + return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(), + nth_elem_idx / m_xpr.nestedExpression().rows()); + } + else + { + const Index nth_elem_idx = colId + rowId * m_xpr.cols(); + return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(), + nth_elem_idx % m_xpr.nestedExpression().cols()); + } + } + + EIGEN_DEVICE_FUNC + inline Scalar& coeffRef(Index rowId, Index colId) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeff(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline Scalar& coeffRef(Index index) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeffRef(row_col.first, row_col.second); + + } + + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index index) const + { + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline const CoeffReturnType coeff(Index index) const + { + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeff(row_col.first, row_col.second); + } +#if 0 + EIGEN_DEVICE_FUNC + template + inline PacketScalar packet(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.template packet(row_col.first, row_col.second); + + } + + template + EIGEN_DEVICE_FUNC + inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + { + const RowCol row_col = index_remap(rowId, colId); + m_argImpl.const_cast_derived().template writePacket + (row_col.first, row_col.second, val); + } + + template + EIGEN_DEVICE_FUNC + inline PacketScalar packet(Index index) const + { + const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); + return m_argImpl.template packet(row_col.first, row_col.second); + } + + template + EIGEN_DEVICE_FUNC + inline void writePacket(Index index, const PacketScalar& val) + { + const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); + return m_argImpl.template packet(row_col.first, row_col.second, val); + } +#endif +protected: + + evaluator m_argImpl; + const XprType& m_xpr; + +}; + +template +struct reshaped_evaluator +: mapbase_evaluator, + typename Reshaped::PlainObject> +{ + typedef Reshaped XprType; + typedef typename XprType::Scalar Scalar; + + EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) + : mapbase_evaluator(xpr) + { + // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime + eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_RESHAPED_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ReturnByValue.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ReturnByValue.h index c44b7673bb..4dad13ea11 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ReturnByValue.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/ReturnByValue.h @@ -60,8 +60,10 @@ template class ReturnByValue EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { static_cast(this)->evalTo(dst); } - EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast(this)->rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast(this)->cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return static_cast(this)->rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return static_cast(this)->cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT @@ -79,7 +81,7 @@ template class ReturnByValue template template -Derived& DenseBase::operator=(const ReturnByValue& other) +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const ReturnByValue& other) { other.evalTo(derived()); return derived(); @@ -90,7 +92,7 @@ namespace internal { // Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that // when a ReturnByValue expression is assigned, the evaluator is not constructed. // TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world - + template struct evaluator > : public evaluator::ReturnType> @@ -98,7 +100,7 @@ struct evaluator > typedef ReturnByValue XprType; typedef typename internal::traits::ReturnType PlainObject; typedef evaluator Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Reverse.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Reverse.h index 0640cda2a1..28cdd76aca 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Reverse.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Reverse.h @@ -12,7 +12,7 @@ #ifndef EIGEN_REVERSE_H #define EIGEN_REVERSE_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -44,7 +44,7 @@ template struct reverse_packet_cond static inline PacketType run(const PacketType& x) { return x; } }; -} // end namespace internal +} // end namespace internal /** \class Reverse * \ingroup Core_Module @@ -89,8 +89,10 @@ template class Reverse EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse) - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } EIGEN_DEVICE_FUNC inline Index innerStride() const { @@ -98,7 +100,7 @@ template class Reverse } EIGEN_DEVICE_FUNC const typename internal::remove_all::type& - nestedExpression() const + nestedExpression() const { return m_matrix; } @@ -114,7 +116,7 @@ template class Reverse * */ template -inline typename DenseBase::ReverseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ReverseReturnType DenseBase::reverse() { return ReverseReturnType(derived()); @@ -136,7 +138,7 @@ DenseBase::reverse() * * \sa VectorwiseOp::reverseInPlace(), reverse() */ template -inline void DenseBase::reverseInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::reverseInPlace() { if(cols()>rows()) { @@ -161,7 +163,7 @@ inline void DenseBase::reverseInPlace() } namespace internal { - + template struct vectorwise_reverse_inplace_impl; @@ -171,8 +173,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { + const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2; Index half = xpr.rows()/2; - xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse()); + xpr.topRows(fix(half)) + .swap(xpr.bottomRows(fix(half)).colwise().reverse()); } }; @@ -182,8 +186,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { + const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2; Index half = xpr.cols()/2; - xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse()); + xpr.leftCols(fix(half)) + .swap(xpr.rightCols(fix(half)).rowwise().reverse()); } }; @@ -201,9 +207,9 @@ struct vectorwise_reverse_inplace_impl * * \sa DenseBase::reverseInPlace(), reverse() */ template -void VectorwiseOp::reverseInPlace() +EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() { - internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); + internal::vectorwise_reverse_inplace_impl::run(m_matrix); } } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Select.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Select.h index 79eec1b5b0..7c86bf87c1 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Select.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Select.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SELECT_H #define EIGEN_SELECT_H -namespace Eigen { +namespace Eigen { /** \class Select * \ingroup Core_Module @@ -67,8 +67,10 @@ class Select : public internal::dense_xpr_base< Select template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const DenseBase& thenMatrix, const DenseBase& elseMatrix) const { @@ -134,7 +136,7 @@ DenseBase::select(const DenseBase& thenMatrix, */ template template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const DenseBase& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const { @@ -149,7 +151,7 @@ DenseBase::select(const DenseBase& thenMatrix, */ template template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const typename ElseDerived::Scalar& thenScalar, const DenseBase& elseMatrix) const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SelfAdjointView.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SelfAdjointView.h index b2e51f37ac..8ce3b372a0 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SelfAdjointView.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SelfAdjointView.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SELFADJOINTMATRIX_H #define EIGEN_SELFADJOINTMATRIX_H -namespace Eigen { +namespace Eigen { /** \class SelfAdjointView * \ingroup Core_Module @@ -58,14 +58,15 @@ template class SelfAdjointView typedef MatrixTypeNestedCleaned NestedExpression; /** \brief The type of coefficients in this matrix */ - typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::remove_all::type MatrixConjugateReturnType; + typedef SelfAdjointView::type, UpLo> ConstSelfAdjointView; enum { Mode = internal::traits::Mode, Flags = internal::traits::Flags, - TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0) + TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0) }; typedef typename MatrixType::PlainObject PlainObject; @@ -75,14 +76,14 @@ template class SelfAdjointView EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY); } - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_matrix.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_matrix.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); } /** \sa MatrixBase::coeff() * \warning the coordinates must fit into the referenced triangular part @@ -131,7 +132,7 @@ template class SelfAdjointView { return Product(lhs.derived(),rhs); } - + friend EIGEN_DEVICE_FUNC const SelfAdjointView operator*(const Scalar& s, const SelfAdjointView& mat) @@ -197,6 +198,18 @@ template class SelfAdjointView inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_matrix.template conjugateIf()); + } + typedef SelfAdjointView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC @@ -287,17 +300,17 @@ protected: using Base::m_src; using Base::m_functor; public: - + typedef typename Base::DstEvaluatorType DstEvaluatorType; typedef typename Base::SrcEvaluatorType SrcEvaluatorType; typedef typename Base::Scalar Scalar; typedef typename Base::AssignmentTraits AssignmentTraits; - - + + EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} - + EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col) { eigen_internal_assert(row!=col); @@ -305,12 +318,12 @@ public: m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp); m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp)); } - + EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id) { Base::assignCoeff(id,id); } - + EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index) { eigen_internal_assert(false && "should never be called"); } }; @@ -324,7 +337,7 @@ public: /** This is the const version of MatrixBase::selfadjointView() */ template template -typename MatrixBase::template ConstSelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstSelfAdjointViewReturnType::Type MatrixBase::selfadjointView() const { return typename ConstSelfAdjointViewReturnType::Type(derived()); @@ -341,7 +354,7 @@ MatrixBase::selfadjointView() const */ template template -typename MatrixBase::template SelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template SelfAdjointViewReturnType::Type MatrixBase::selfadjointView() { return typename SelfAdjointViewReturnType::Type(derived()); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Solve.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Solve.h index a8daea5113..23d5cb7072 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Solve.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Solve.h @@ -13,13 +13,13 @@ namespace Eigen { template class SolveImpl; - + /** \class Solve * \ingroup Core_Module * * \brief Pseudo expression representing a solving operation * - * \tparam Decomposition the type of the matrix or decomposion object + * \tparam Decomposition the type of the matrix or decomposition object * \tparam Rhstype the type of the right-hand side * * This class represents an expression of A.solve(B) @@ -64,13 +64,13 @@ class Solve : public SolveImpl::PlainObject PlainObject; typedef typename internal::traits::StorageIndex StorageIndex; - + Solve(const Decomposition &dec, const RhsType &rhs) : m_dec(dec), m_rhs(rhs) {} - - EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; } EIGEN_DEVICE_FUNC const RhsType& rhs() const { return m_rhs; } @@ -87,14 +87,14 @@ class SolveImpl : public MatrixBase > { typedef Solve Derived; - + public: - + typedef MatrixBase > Base; EIGEN_DENSE_PUBLIC_INTERFACE(Derived) private: - + Scalar coeff(Index row, Index col) const; Scalar coeff(Index i) const; }; @@ -119,15 +119,15 @@ struct evaluator > typedef evaluator Base; enum { Flags = Base::Flags | EvalBeforeNestingBit }; - + EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve) : m_result(solve.rows(), solve.cols()) { ::new (static_cast(this)) Base(m_result); solve.dec()._solve_impl(solve.rhs(), m_result); } - -protected: + +protected: PlainObject m_result; }; @@ -176,12 +176,12 @@ struct Assignment(src.rhs(), dst); } }; -} // end namepsace internal +} // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SolveTriangular.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SolveTriangular.h index fd0acb1a58..dfbf99523a 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SolveTriangular.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SolveTriangular.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SOLVETRIANGULAR_H #define EIGEN_SOLVETRIANGULAR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -54,7 +54,7 @@ struct triangular_solver_selector typedef blas_traits LhsProductTraits; typedef typename LhsProductTraits::ExtractType ActualLhsType; typedef Map, Aligned> MappedRhs; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { ActualLhsType actualLhs = LhsProductTraits::extract(lhs); @@ -64,7 +64,7 @@ struct triangular_solver_selector ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(), (useRhsDirectly ? rhs.data() : 0)); - + if(!useRhsDirectly) MappedRhs(actualRhs,rhs.size()) = rhs; @@ -85,7 +85,7 @@ struct triangular_solver_selector typedef blas_traits LhsProductTraits; typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { typename internal::add_const_on_value_type::type actualLhs = LhsProductTraits::extract(lhs); @@ -118,7 +118,7 @@ struct triangular_solver_unroller { DiagIndex = IsLower ? LoopIndex : Size - LoopIndex - 1, StartIndex = IsLower ? 0 : DiagIndex+1 }; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { if (LoopIndex>0) rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment(StartIndex).transpose() @@ -133,22 +133,22 @@ struct triangular_solver_unroller { template struct triangular_solver_unroller { - static void run(const Lhs&, Rhs&) {} + static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {} }; template struct triangular_solver_selector { - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { triangular_solver_unroller::run(lhs,rhs); } }; template struct triangular_solver_selector { - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { Transpose trLhs(lhs); Transpose trRhs(rhs); - + triangular_solver_unroller,Transpose, ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag), 0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs); @@ -164,11 +164,11 @@ struct triangular_solver_selector { #ifndef EIGEN_PARSED_BY_DOXYGEN template template -void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const +EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); - eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower))); + eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower)))); // If solving for a 0x0 matrix, nothing to do, simply return. if (derived().cols() == 0) return; @@ -213,8 +213,8 @@ template struct triangular_solv : m_triangularMatrix(tri), m_rhs(rhs) {} - inline Index rows() const { return m_rhs.rows(); } - inline Index cols() const { return m_rhs.cols(); } + inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); } + inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } template inline void evalTo(Dest& dst) const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SolverBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SolverBase.h index 8a4adc2297..5014610420 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SolverBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/SolverBase.h @@ -14,8 +14,35 @@ namespace Eigen { namespace internal { +template +struct solve_assertion { + template + static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion(b); } +}; +template +struct solve_assertion > +{ + typedef Transpose type; + template + static void run(const type& transpose, const Rhs& b) + { + internal::solve_assertion::type>::template run(transpose.nestedExpression(), b); + } +}; + +template +struct solve_assertion, const Transpose > > +{ + typedef CwiseUnaryOp, const Transpose > type; + + template + static void run(const type& adjoint, const Rhs& b) + { + internal::solve_assertion >::type>::template run(adjoint.nestedExpression(), b); + } +}; } // end namespace internal /** \class SolverBase @@ -35,7 +62,7 @@ namespace internal { * * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors. * - * \sa class PartialPivLU, class FullPivLU + * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase */ template class SolverBase : public EigenBase @@ -46,6 +73,9 @@ class SolverBase : public EigenBase typedef typename internal::traits::Scalar Scalar; typedef Scalar CoeffReturnType; + template + friend struct internal::solve_assertion; + enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, @@ -56,7 +86,8 @@ class SolverBase : public EigenBase MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime>::ret), IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1 + || internal::traits::MaxColsAtCompileTime == 1, + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2 }; /** Default constructor */ @@ -74,7 +105,7 @@ class SolverBase : public EigenBase inline const Solve solve(const MatrixBase& b) const { - eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b"); + internal::solve_assertion::type>::template run(derived(), b); return Solve(derived(), b.derived()); } @@ -112,6 +143,13 @@ class SolverBase : public EigenBase } protected: + + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(derived().m_isInitialized && "Solver is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b"); + } }; namespace internal { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/StableNorm.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/StableNorm.h index 88c8d98902..4a3f0cca8c 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/StableNorm.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/StableNorm.h @@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc ssq += (bl*invScale).squaredNorm(); } +template +void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) +{ + typedef typename VectorType::Scalar Scalar; + const Index blockSize = 4096; + + typedef typename internal::nested_eval::type VectorTypeCopy; + typedef typename internal::remove_all::type VectorTypeCopyClean; + const VectorTypeCopy copy(vec); + + enum { + CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization + }; + typedef typename internal::conditional, internal::evaluator::Alignment>, + typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper; + Index n = vec.size(); + + Index bi = internal::first_default_aligned(copy); + if (bi>0) + internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); + for (; bi +typename VectorType::RealScalar +stable_norm_impl(const VectorType &vec, typename enable_if::type* = 0 ) +{ + using std::sqrt; + using std::abs; + + Index n = vec.size(); + + if(n==1) + return abs(vec.coeff(0)); + + typedef typename VectorType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + stable_norm_impl_inner_step(vec, ssq, scale, invScale); + + return scale * sqrt(ssq); +} + +template +typename MatrixType::RealScalar +stable_norm_impl(const MatrixType &mat, typename enable_if::type* = 0 ) +{ + using std::sqrt; + + typedef typename MatrixType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + for(Index j=0; j inline typename NumTraits::Scalar>::Real blueNorm_impl(const EigenBase& _vec) @@ -58,52 +123,43 @@ blueNorm_impl(const EigenBase& _vec) using std::pow; using std::sqrt; using std::abs; + + // This program calculates the machine-dependent constants + // bl, b2, slm, s2m, relerr overfl + // from the "basic" machine-dependent numbers + // nbig, ibeta, it, iemin, iemax, rbig. + // The following define the basic machine-dependent constants. + // For portability, the PORT subprograms "ilmaeh" and "rlmach" + // are used. For any specific computer, each of the assignment + // statements can be replaced + static const int ibeta = std::numeric_limits::radix; // base for floating-point numbers + static const int it = NumTraits::digits(); // number of base-beta digits in mantissa + static const int iemin = NumTraits::min_exponent(); // minimum exponent + static const int iemax = NumTraits::max_exponent(); // maximum exponent + static const RealScalar rbig = NumTraits::highest(); // largest floating-point number + static const RealScalar b1 = RealScalar(pow(RealScalar(ibeta),RealScalar(-((1-iemin)/2)))); // lower boundary of midrange + static const RealScalar b2 = RealScalar(pow(RealScalar(ibeta),RealScalar((iemax + 1 - it)/2))); // upper boundary of midrange + static const RealScalar s1m = RealScalar(pow(RealScalar(ibeta),RealScalar((2-iemin)/2))); // scaling factor for lower range + static const RealScalar s2m = RealScalar(pow(RealScalar(ibeta),RealScalar(- ((iemax+it)/2)))); // scaling factor for upper range + static const RealScalar eps = RealScalar(pow(double(ibeta), 1-it)); + static const RealScalar relerr = sqrt(eps); // tolerance for neglecting asml + const Derived& vec(_vec.derived()); - static bool initialized = false; - static RealScalar b1, b2, s1m, s2m, rbig, relerr; - if(!initialized) - { - int ibeta, it, iemin, iemax, iexp; - RealScalar eps; - // This program calculates the machine-dependent constants - // bl, b2, slm, s2m, relerr overfl - // from the "basic" machine-dependent numbers - // nbig, ibeta, it, iemin, iemax, rbig. - // The following define the basic machine-dependent constants. - // For portability, the PORT subprograms "ilmaeh" and "rlmach" - // are used. For any specific computer, each of the assignment - // statements can be replaced - ibeta = std::numeric_limits::radix; // base for floating-point numbers - it = std::numeric_limits::digits; // number of base-beta digits in mantissa - iemin = std::numeric_limits::min_exponent; // minimum exponent - iemax = std::numeric_limits::max_exponent; // maximum exponent - rbig = (std::numeric_limits::max)(); // largest floating-point number - - iexp = -((1-iemin)/2); - b1 = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // lower boundary of midrange - iexp = (iemax + 1 - it)/2; - b2 = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // upper boundary of midrange - - iexp = (2-iemin)/2; - s1m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for lower range - iexp = - ((iemax+it)/2); - s2m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for upper range - - eps = RealScalar(pow(double(ibeta), 1-it)); - relerr = sqrt(eps); // tolerance for neglecting asml - initialized = true; - } Index n = vec.size(); RealScalar ab2 = b2 / RealScalar(n); RealScalar asml = RealScalar(0); RealScalar amed = RealScalar(0); RealScalar abig = RealScalar(0); - for(typename Derived::InnerIterator it(vec, 0); it; ++it) + + for(Index j=0; j ab2) abig += numext::abs2(ax*s2m); - else if(ax < b1) asml += numext::abs2(ax*s1m); - else amed += numext::abs2(ax); + for(typename Derived::InnerIterator iter(vec, j); iter; ++iter) + { + RealScalar ax = abs(iter.value()); + if(ax > ab2) abig += numext::abs2(ax*s2m); + else if(ax < b1) asml += numext::abs2(ax*s1m); + else amed += numext::abs2(ax); + } } if(amed!=amed) return amed; // we got a NaN @@ -156,36 +212,7 @@ template inline typename NumTraits::Scalar>::Real MatrixBase::stableNorm() const { - using std::sqrt; - using std::abs; - const Index blockSize = 4096; - RealScalar scale(0); - RealScalar invScale(1); - RealScalar ssq(0); // sum of square - - typedef typename internal::nested_eval::type DerivedCopy; - typedef typename internal::remove_all::type DerivedCopyClean; - const DerivedCopy copy(derived()); - - enum { - CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) - || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization - }; - typedef typename internal::conditional, internal::evaluator::Alignment>, - typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; - Index n = size(); - - if(n==1) - return abs(this->coeff(0)); - - Index bi = internal::first_default_aligned(copy); - if (bi>0) - internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); - for (; bi inline typename NumTraits::Scalar>::Real MatrixBase::hypotNorm() const { - return this->cwiseAbs().redux(internal::scalar_hypot_op()); + if(size()==1) + return numext::abs(coeff(0,0)); + else + return this->cwiseAbs().redux(internal::scalar_hypot_op()); } } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/StlIterators.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/StlIterators.h new file mode 100644 index 0000000000..09041db1d5 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/StlIterators.h @@ -0,0 +1,463 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_STLITERATORS_H +#define EIGEN_STLITERATORS_H + +namespace Eigen { + +namespace internal { + +template +struct indexed_based_stl_iterator_traits; + +template +class indexed_based_stl_iterator_base +{ +protected: + typedef indexed_based_stl_iterator_traits traits; + typedef typename traits::XprType XprType; + typedef indexed_based_stl_iterator_base non_const_iterator; + typedef indexed_based_stl_iterator_base const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class indexed_based_stl_iterator_base; + friend class indexed_based_stl_iterator_base; +public: + typedef Index difference_type; + typedef std::random_access_iterator_tag iterator_category; + + indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {} + indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {} + + indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW + : mp_xpr(other.mp_xpr), m_index(other.m_index) + {} + + indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) + { + mp_xpr = other.mp_xpr; + m_index = other.m_index; + return *this; + } + + Derived& operator++() { ++m_index; return derived(); } + Derived& operator--() { --m_index; return derived(); } + + Derived operator++(int) { Derived prev(derived()); operator++(); return prev;} + Derived operator--(int) { Derived prev(derived()); operator--(); return prev;} + + friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; } + friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; } + friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; } + friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; } + + Derived& operator+=(Index b) { m_index += b; return derived(); } + Derived& operator-=(Index b) { m_index -= b; return derived(); } + + difference_type operator-(const indexed_based_stl_iterator_base& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return m_index - other.m_index; + } + + difference_type operator-(const other_iterator& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return m_index - other.m_index; + } + + bool operator==(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator<=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + bool operator> (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator>=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + + bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + +protected: + + Derived& derived() { return static_cast(*this); } + const Derived& derived() const { return static_cast(*this); } + + XprType *mp_xpr; + Index m_index; +}; + +template +class indexed_based_stl_reverse_iterator_base +{ +protected: + typedef indexed_based_stl_iterator_traits traits; + typedef typename traits::XprType XprType; + typedef indexed_based_stl_reverse_iterator_base non_const_iterator; + typedef indexed_based_stl_reverse_iterator_base const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class indexed_based_stl_reverse_iterator_base; + friend class indexed_based_stl_reverse_iterator_base; +public: + typedef Index difference_type; + typedef std::random_access_iterator_tag iterator_category; + + indexed_based_stl_reverse_iterator_base() : mp_xpr(0), m_index(0) {} + indexed_based_stl_reverse_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {} + + indexed_based_stl_reverse_iterator_base(const non_const_iterator& other) + : mp_xpr(other.mp_xpr), m_index(other.m_index) + {} + + indexed_based_stl_reverse_iterator_base& operator=(const non_const_iterator& other) + { + mp_xpr = other.mp_xpr; + m_index = other.m_index; + return *this; + } + + Derived& operator++() { --m_index; return derived(); } + Derived& operator--() { ++m_index; return derived(); } + + Derived operator++(int) { Derived prev(derived()); operator++(); return prev;} + Derived operator--(int) { Derived prev(derived()); operator--(); return prev;} + + friend Derived operator+(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; } + friend Derived operator-(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; } + friend Derived operator+(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; } + friend Derived operator-(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; } + + Derived& operator+=(Index b) { m_index -= b; return derived(); } + Derived& operator-=(Index b) { m_index += b; return derived(); } + + difference_type operator-(const indexed_based_stl_reverse_iterator_base& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return other.m_index - m_index; + } + + difference_type operator-(const other_iterator& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return other.m_index - m_index; + } + + bool operator==(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator<=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + bool operator> (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator>=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + + bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + +protected: + + Derived& derived() { return static_cast(*this); } + const Derived& derived() const { return static_cast(*this); } + + XprType *mp_xpr; + Index m_index; +}; + +template +class pointer_based_stl_iterator +{ + enum { is_lvalue = internal::is_lvalue::value }; + typedef pointer_based_stl_iterator::type> non_const_iterator; + typedef pointer_based_stl_iterator::type> const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class pointer_based_stl_iterator::type>; + friend class pointer_based_stl_iterator::type>; +public: + typedef Index difference_type; + typedef typename XprType::Scalar value_type; + typedef std::random_access_iterator_tag iterator_category; + typedef typename internal::conditional::type pointer; + typedef typename internal::conditional::type reference; + + + pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {} + pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) + { + m_ptr = xpr.data() + index * m_incr.value(); + } + + pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW + : m_ptr(other.m_ptr), m_incr(other.m_incr) + {} + + pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW + { + m_ptr = other.m_ptr; + m_incr.setValue(other.m_incr); + return *this; + } + + reference operator*() const { return *m_ptr; } + reference operator[](Index i) const { return *(m_ptr+i*m_incr.value()); } + pointer operator->() const { return m_ptr; } + + pointer_based_stl_iterator& operator++() { m_ptr += m_incr.value(); return *this; } + pointer_based_stl_iterator& operator--() { m_ptr -= m_incr.value(); return *this; } + + pointer_based_stl_iterator operator++(int) { pointer_based_stl_iterator prev(*this); operator++(); return prev;} + pointer_based_stl_iterator operator--(int) { pointer_based_stl_iterator prev(*this); operator--(); return prev;} + + friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret += b; return ret; } + friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret -= b; return ret; } + friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret += a; return ret; } + friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret -= a; return ret; } + + pointer_based_stl_iterator& operator+=(Index b) { m_ptr += b*m_incr.value(); return *this; } + pointer_based_stl_iterator& operator-=(Index b) { m_ptr -= b*m_incr.value(); return *this; } + + difference_type operator-(const pointer_based_stl_iterator& other) const { + return (m_ptr - other.m_ptr)/m_incr.value(); + } + + difference_type operator-(const other_iterator& other) const { + return (m_ptr - other.m_ptr)/m_incr.value(); + } + + bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; } + bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; } + bool operator< (const pointer_based_stl_iterator& other) const { return m_ptr < other.m_ptr; } + bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; } + bool operator> (const pointer_based_stl_iterator& other) const { return m_ptr > other.m_ptr; } + bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; } + + bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; } + bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; } + bool operator< (const other_iterator& other) const { return m_ptr < other.m_ptr; } + bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; } + bool operator> (const other_iterator& other) const { return m_ptr > other.m_ptr; } + bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; } + +protected: + + pointer m_ptr; + internal::variable_if_dynamic m_incr; +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef generic_randaccess_stl_iterator::type> non_const_iterator; + typedef generic_randaccess_stl_iterator::type> const_iterator; +}; + +template +class generic_randaccess_stl_iterator : public indexed_based_stl_iterator_base > +{ +public: + typedef typename XprType::Scalar value_type; + +protected: + + enum { + has_direct_access = (internal::traits::Flags & DirectAccessBit) ? 1 : 0, + is_lvalue = internal::is_lvalue::value + }; + + typedef indexed_based_stl_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + // TODO currently const Transpose/Reshape expressions never returns const references, + // so lets return by value too. + //typedef typename internal::conditional::type read_only_ref_t; + typedef const value_type read_only_ref_t; + +public: + + typedef typename internal::conditional::type pointer; + typedef typename internal::conditional::type reference; + + generic_randaccess_stl_iterator() : Base() {} + generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {} + using Base::operator=; + + reference operator*() const { return (*mp_xpr)(m_index); } + reference operator[](Index i) const { return (*mp_xpr)(m_index+i); } + pointer operator->() const { return &((*mp_xpr)(m_index)); } +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef subvector_stl_iterator::type, Direction> non_const_iterator; + typedef subvector_stl_iterator::type, Direction> const_iterator; +}; + +template +class subvector_stl_iterator : public indexed_based_stl_iterator_base > +{ +protected: + + enum { is_lvalue = internal::is_lvalue::value }; + + typedef indexed_based_stl_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + typedef typename internal::conditional::type SubVectorType; + typedef typename internal::conditional::type ConstSubVectorType; + + +public: + typedef typename internal::conditional::type reference; + typedef typename reference::PlainObject value_type; + +private: + class subvector_stl_iterator_ptr + { + public: + subvector_stl_iterator_ptr(const reference &subvector) : m_subvector(subvector) {} + reference* operator->() { return &m_subvector; } + private: + reference m_subvector; + }; +public: + + typedef subvector_stl_iterator_ptr pointer; + + subvector_stl_iterator() : Base() {} + subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + + reference operator*() const { return (*mp_xpr).template subVector(m_index); } + reference operator[](Index i) const { return (*mp_xpr).template subVector(m_index+i); } + pointer operator->() const { return (*mp_xpr).template subVector(m_index); } +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef subvector_stl_reverse_iterator::type, Direction> non_const_iterator; + typedef subvector_stl_reverse_iterator::type, Direction> const_iterator; +}; + +template +class subvector_stl_reverse_iterator : public indexed_based_stl_reverse_iterator_base > +{ +protected: + + enum { is_lvalue = internal::is_lvalue::value }; + + typedef indexed_based_stl_reverse_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + typedef typename internal::conditional::type SubVectorType; + typedef typename internal::conditional::type ConstSubVectorType; + + +public: + typedef typename internal::conditional::type reference; + typedef typename reference::PlainObject value_type; + +private: + class subvector_stl_reverse_iterator_ptr + { + public: + subvector_stl_reverse_iterator_ptr(const reference &subvector) : m_subvector(subvector) {} + reference* operator->() { return &m_subvector; } + private: + reference m_subvector; + }; +public: + + typedef subvector_stl_reverse_iterator_ptr pointer; + + subvector_stl_reverse_iterator() : Base() {} + subvector_stl_reverse_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + + reference operator*() const { return (*mp_xpr).template subVector(m_index); } + reference operator[](Index i) const { return (*mp_xpr).template subVector(m_index+i); } + pointer operator->() const { return (*mp_xpr).template subVector(m_index); } +}; + +} // namespace internal + + +/** returns an iterator to the first element of the 1D vector or array + * \only_for_vectors + * \sa end(), cbegin() + */ +template +inline typename DenseBase::iterator DenseBase::begin() +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return iterator(derived(), 0); +} + +/** const version of begin() */ +template +inline typename DenseBase::const_iterator DenseBase::begin() const +{ + return cbegin(); +} + +/** returns a read-only const_iterator to the first element of the 1D vector or array + * \only_for_vectors + * \sa cend(), begin() + */ +template +inline typename DenseBase::const_iterator DenseBase::cbegin() const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return const_iterator(derived(), 0); +} + +/** returns an iterator to the element following the last element of the 1D vector or array + * \only_for_vectors + * \sa begin(), cend() + */ +template +inline typename DenseBase::iterator DenseBase::end() +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return iterator(derived(), size()); +} + +/** const version of end() */ +template +inline typename DenseBase::const_iterator DenseBase::end() const +{ + return cend(); +} + +/** returns a read-only const_iterator to the element following the last element of the 1D vector or array + * \only_for_vectors + * \sa begin(), cend() + */ +template +inline typename DenseBase::const_iterator DenseBase::cend() const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return const_iterator(derived(), size()); +} + +} // namespace Eigen + +#endif // EIGEN_STLITERATORS_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Stride.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Stride.h index 513742f34b..6494d51420 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Stride.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Stride.h @@ -10,7 +10,7 @@ #ifndef EIGEN_STRIDE_H #define EIGEN_STRIDE_H -namespace Eigen { +namespace Eigen { /** \class Stride * \ingroup Core_Module @@ -38,6 +38,10 @@ namespace Eigen { * \include Map_general_stride.cpp * Output: \verbinclude Map_general_stride.out * + * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime + * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were + * not allowed). + * * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders */ template @@ -55,6 +59,8 @@ class Stride Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) { + // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic. + // FIXME: for Eigen 4 we should also unify this API with fix<> eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic); } @@ -63,7 +69,6 @@ class Stride Stride(Index outerStride, Index innerStride) : m_outer(outerStride), m_inner(innerStride) { - eigen_assert(innerStride>=0 && outerStride>=0); } /** Copy constructor */ @@ -73,10 +78,10 @@ class Stride {} /** \returns the outer stride */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outer() const { return m_outer.value(); } /** \returns the inner stride */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index inner() const { return m_inner.value(); } protected: diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Swap.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Swap.h index d702009185..180a4e5adf 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Swap.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Swap.h @@ -30,12 +30,13 @@ public: typedef typename Base::DstXprType DstXprType; typedef swap_assign_op Functor; - EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} template - void assignPacket(Index row, Index col) + EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) { PacketType tmp = m_src.template packet(row,col); const_cast(m_src).template writePacket(row,col, m_dst.template packet(row,col)); @@ -43,7 +44,7 @@ public: } template - void assignPacket(Index index) + EIGEN_STRONG_INLINE void assignPacket(Index index) { PacketType tmp = m_src.template packet(index); const_cast(m_src).template writePacket(index, m_dst.template packet(index)); @@ -52,7 +53,7 @@ public: // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael) template - void assignPacketByOuterInner(Index outer, Index inner) + EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { Index row = Base::rowIndexByOuterInner(outer, inner); Index col = Base::colIndexByOuterInner(outer, inner); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Transpose.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Transpose.h index 960dc4510c..2bc658f40b 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Transpose.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Transpose.h @@ -11,7 +11,7 @@ #ifndef EIGEN_TRANSPOSE_H #define EIGEN_TRANSPOSE_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -61,24 +61,27 @@ template class Transpose typedef typename internal::remove_all::type NestedExpression; EIGEN_DEVICE_FUNC - explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {} + explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {} EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose) - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::remove_reference::type& nestedExpression() { return m_matrix; } /** \internal */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index nrows, Index ncols) { m_matrix.resize(ncols,nrows); } @@ -122,8 +125,10 @@ template class TransposeImpl EIGEN_DENSE_PUBLIC_INTERFACE(Transpose) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl) - EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index innerStride() const { return derived().nestedExpression().innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return derived().nestedExpression().outerStride(); } typedef typename internal::conditional< internal::is_lvalue::value, @@ -131,18 +136,20 @@ template class TransposeImpl const Scalar >::type ScalarWithConstIfNotLvalue; - EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } - EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar* data() const { return derived().nestedExpression().data(); } // FIXME: shall we keep the const version of coeffRef? - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index rowId, Index colId) const { return derived().nestedExpression().coeffRef(colId, rowId); } - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index index) const { return derived().nestedExpression().coeffRef(index); } @@ -170,7 +177,8 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -inline Transpose +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -182,7 +190,8 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -inline typename DenseBase::ConstTransposeReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); @@ -208,7 +217,7 @@ DenseBase::transpose() const * * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ template -inline const typename MatrixBase::AdjointReturnType +EIGEN_DEVICE_FUNC inline const typename MatrixBase::AdjointReturnType MatrixBase::adjoint() const { return AdjointReturnType(this->transpose()); @@ -230,11 +239,10 @@ struct inplace_transpose_selector; template struct inplace_transpose_selector { // square matrix static void run(MatrixType& m) { - m.matrix().template triangularView().swap(m.matrix().transpose()); + m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); } }; -// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. template struct inplace_transpose_selector { // PacketSize x PacketSize static void run(MatrixType& m) { @@ -251,16 +259,66 @@ struct inplace_transpose_selector { // PacketSize x Packet } }; + +template +void BlockedInPlaceTranspose(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + const Index PacketSize = internal::packet_traits::size; + eigen_assert(m.rows() == m.cols()); + int row_start = 0; + for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) { + for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) { + PacketBlock A; + if (row_start == col_start) { + for (Index i=0; i(row_start + i,col_start); + internal::ptranspose(A); + for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]); + } else { + PacketBlock B; + for (Index i=0; i(row_start + i,col_start); + B.packet[i] = m.template packetByOuterInner(col_start + i, row_start); + } + internal::ptranspose(A); + internal::ptranspose(B); + for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]); + m.template writePacket(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]); + } + } + } + } + for (Index row = row_start; row < m.rows(); ++row) { + m.matrix().row(row).head(row).swap( + m.matrix().col(row).head(row).transpose()); + } +} + template -struct inplace_transpose_selector { // non square matrix +struct inplace_transpose_selector { // non square or dynamic matrix static void run(MatrixType& m) { - if (m.rows()==m.cols()) - m.matrix().template triangularView().swap(m.matrix().transpose()); - else + typedef typename MatrixType::Scalar Scalar; + if (m.rows() == m.cols()) { + const Index PacketSize = internal::packet_traits::size; + if (!NumTraits::IsComplex && m.rows() >= PacketSize) { + if ((m.rows() % PacketSize) == 0) + BlockedInPlaceTranspose::Alignment>(m); + else + BlockedInPlaceTranspose(m); + } + else { + m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); + } + } else { m = m.transpose().eval(); + } } }; + } // end namespace internal /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose. @@ -278,12 +336,12 @@ struct inplace_transpose_selector { // non squ * Notice however that this method is only useful if you want to replace a matrix by its own transpose. * If you just need the transpose of a matrix, use transpose(). * - * \note if the matrix is not square, then \c *this must be a resizable matrix. + * \note if the matrix is not square, then \c *this must be a resizable matrix. * This excludes (non-square) fixed-size matrices, block-expressions and maps. * * \sa transpose(), adjoint(), adjointInPlace() */ template -inline void DenseBase::transposeInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::transposeInPlace() { eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) && "transposeInPlace() called on a non-square non-resizable matrix"); @@ -314,7 +372,7 @@ inline void DenseBase::transposeInPlace() * * \sa transpose(), adjoint(), transposeInPlace() */ template -inline void MatrixBase::adjointInPlace() +EIGEN_DEVICE_FUNC inline void MatrixBase::adjointInPlace() { derived() = adjoint().eval(); } @@ -393,7 +451,8 @@ struct checkTransposeAliasing_impl template void check_for_aliasing(const Dst &dst, const Src &src) { - internal::checkTransposeAliasing_impl::run(dst, src); + if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1) + internal::checkTransposeAliasing_impl::run(dst, src); } } // end namespace internal diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Transpositions.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Transpositions.h index 7718625e80..38a7b01cb5 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Transpositions.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Transpositions.h @@ -10,20 +10,22 @@ #ifndef EIGEN_TRANSPOSITIONS_H #define EIGEN_TRANSPOSITIONS_H -namespace Eigen { +namespace Eigen { template class TranspositionsBase { typedef internal::traits Traits; - + public: typedef typename Traits::IndicesType IndicesType; typedef typename IndicesType::Scalar StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 + EIGEN_DEVICE_FUNC Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC const Derived& derived() const { return *static_cast(this); } /** Copies the \a other transpositions into \c *this */ @@ -35,13 +37,17 @@ class TranspositionsBase } /** \returns the number of transpositions */ + EIGEN_DEVICE_FUNC Index size() const { return indices().size(); } /** \returns the number of rows of the equivalent permutation matrix */ + EIGEN_DEVICE_FUNC Index rows() const { return indices().size(); } /** \returns the number of columns of the equivalent permutation matrix */ + EIGEN_DEVICE_FUNC Index cols() const { return indices().size(); } /** Direct access to the underlying index vector */ + EIGEN_DEVICE_FUNC inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); } /** Direct access to the underlying index vector */ inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); } @@ -55,8 +61,10 @@ class TranspositionsBase inline StorageIndex& operator[](Index i) { return indices()(i); } /** const version of indices(). */ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return derived().indices(); } /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return derived().indices(); } /** Resizes to given size. */ @@ -73,7 +81,7 @@ class TranspositionsBase } // FIXME: do we want such methods ? - // might be usefull when the target matrix expression is complex, e.g.: + // might be useful when the target matrix expression is complex, e.g.: // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..); /* template @@ -178,8 +186,10 @@ class Transpositions : public TranspositionsBase,P #endif /** const version of indices(). */ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; } - + /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; } protected: @@ -279,9 +291,11 @@ class TranspositionsWrapper } /** const version of indices(). */ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; } /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; } protected: @@ -335,9 +349,12 @@ class Transpose > explicit Transpose(const TranspositionType& t) : m_transpositions(t) {} - Index size() const { return m_transpositions.size(); } - Index rows() const { return m_transpositions.size(); } - Index cols() const { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); } /** \returns the \a matrix with the inverse transpositions applied to the columns. */ @@ -356,7 +373,8 @@ class Transpose > { return Product(*this, matrix.derived()); } - + + EIGEN_DEVICE_FUNC const TranspositionType& nestedExpression() const { return m_transpositions; } protected: diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/TriangularMatrix.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/TriangularMatrix.h index 9abb7e31a8..fdb8bc15a5 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/TriangularMatrix.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/TriangularMatrix.h @@ -11,12 +11,12 @@ #ifndef EIGEN_TRIANGULARMATRIX_H #define EIGEN_TRIANGULARMATRIX_H -namespace Eigen { +namespace Eigen { namespace internal { - + template struct triangular_solve_retval; - + } /** \class TriangularBase @@ -34,16 +34,16 @@ template class TriangularBase : public EigenBase ColsAtCompileTime = internal::traits::ColsAtCompileTime, MaxRowsAtCompileTime = internal::traits::MaxRowsAtCompileTime, MaxColsAtCompileTime = internal::traits::MaxColsAtCompileTime, - + SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, internal::traits::ColsAtCompileTime>::ret), /**< This is equal to the number of coefficients, i.e. the number of * rows times the number of columns, or to \a Dynamic if this is not * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */ - + MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime>::ret) - + }; typedef typename internal::traits::Scalar Scalar; typedef typename internal::traits::StorageKind StorageKind; @@ -53,18 +53,19 @@ template class TriangularBase : public EigenBase typedef Derived const& Nested; EIGEN_DEVICE_FUNC - inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); } + inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); } - EIGEN_DEVICE_FUNC - inline Index rows() const { return derived().rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return derived().cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return derived().outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return derived().innerStride(); } - // dummy resize function + EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { EIGEN_UNUSED_VARIABLE(rows); @@ -155,7 +156,7 @@ template class TriangularBase : public EigenBase * \param MatrixType the type of the object in which we are taking the triangular part * \param Mode the kind of triangular matrix expression to construct. Can be #Upper, * #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower. - * This is in fact a bit field; it must have either #Upper or #Lower, + * This is in fact a bit field; it must have either #Upper or #Lower, * and additionally it may have #UnitDiag or #ZeroDiag or neither. * * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular @@ -197,7 +198,8 @@ template class TriangularView typedef typename internal::traits::MatrixTypeNestedNonRef MatrixTypeNestedNonRef; typedef typename internal::remove_all::type MatrixConjugateReturnType; - + typedef TriangularView::type, _Mode> ConstTriangularView; + public: typedef typename internal::traits::StorageKind StorageKind; @@ -216,15 +218,15 @@ template class TriangularView EIGEN_DEVICE_FUNC explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {} - + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView) /** \copydoc EigenBase::rows() */ - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } /** \copydoc EigenBase::cols() */ - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \returns a const reference to the nested expression */ EIGEN_DEVICE_FUNC @@ -233,13 +235,25 @@ template class TriangularView /** \returns a reference to the nested expression */ EIGEN_DEVICE_FUNC NestedExpression& nestedExpression() { return m_matrix; } - + typedef TriangularView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_matrix.template conjugateIf()); + } + typedef TriangularView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC @@ -255,7 +269,7 @@ template class TriangularView typename MatrixType::TransposeReturnType tmp(m_matrix); return TransposeReturnType(tmp); } - + typedef TriangularView ConstTransposeReturnType; /** \sa MatrixBase::transpose() const */ EIGEN_DEVICE_FUNC @@ -266,10 +280,10 @@ template class TriangularView template EIGEN_DEVICE_FUNC - inline const Solve + inline const Solve solve(const MatrixBase& other) const { return Solve(*this, other.derived()); } - + // workaround MSVC ICE #if EIGEN_COMP_MSVC template @@ -313,7 +327,7 @@ template class TriangularView else return m_matrix.diagonal().prod(); } - + protected: MatrixTypeNested m_matrix; @@ -375,7 +389,7 @@ template class TriangularViewImpl<_Mat internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op()); return derived(); } - + /** \sa MatrixBase::operator*=() */ EIGEN_DEVICE_FUNC TriangularViewType& operator*=(const typename internal::traits::Scalar& other) { return *this = derived().nestedExpression() * other; } @@ -433,14 +447,14 @@ template class TriangularViewImpl<_Mat TriangularViewType& operator=(const TriangularViewImpl& other) { return *this = other.derived().nestedExpression(); } - /** \deprecated */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const TriangularBase& other); - /** \deprecated */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const MatrixBase& other); #endif @@ -468,7 +482,7 @@ template class TriangularViewImpl<_Mat * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if * \a Side==OnTheRight. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this @@ -486,7 +500,6 @@ template class TriangularViewImpl<_Mat * \sa TriangularView::solveInPlace() */ template - EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval solve(const MatrixBase& other) const; @@ -495,7 +508,7 @@ template class TriangularViewImpl<_Mat * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. * This function will const_cast it, so constness isn't honored here. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * See TriangularView:solve() for the details. */ @@ -521,10 +534,10 @@ template class TriangularViewImpl<_Mat call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op()); } - /** \deprecated - * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */ + /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void swap(MatrixBase const & other) { EIGEN_STATIC_ASSERT_LVALUE(OtherDerived); @@ -556,7 +569,7 @@ template class TriangularViewImpl<_Mat // FIXME should we keep that possibility template template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op()); @@ -566,7 +579,7 @@ TriangularViewImpl::operator=(const MatrixBase template -void TriangularViewImpl::lazyAssign(const MatrixBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.template triangularView()); } @@ -575,7 +588,7 @@ void TriangularViewImpl::lazyAssign(const MatrixBase template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); @@ -585,7 +598,7 @@ TriangularViewImpl::operator=(const TriangularBase template -void TriangularViewImpl::lazyAssign(const TriangularBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); internal::call_assignment_no_alias(derived(), other.derived()); @@ -600,7 +613,7 @@ void TriangularViewImpl::lazyAssign(const TriangularBas * If the matrix is triangular, the opposite part is set to zero. */ template template -void TriangularBase::evalTo(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase &other) const { evalToLazy(other.derived()); } @@ -626,6 +639,7 @@ void TriangularBase::evalTo(MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template TriangularViewReturnType::Type MatrixBase::triangularView() { @@ -635,6 +649,7 @@ MatrixBase::triangularView() /** This is the const version of MatrixBase::triangularView() */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstTriangularViewReturnType::Type MatrixBase::triangularView() const { @@ -700,7 +715,7 @@ bool MatrixBase::isLowerTriangular(const RealScalar& prec) const namespace internal { - + // TODO currently a triangular expression has the form TriangularView<.,.> // in the future triangular-ness should be defined by the expression traits // such that Transpose > is valid. (currently TriangularBase::transpose() is overloaded to make it work) @@ -717,6 +732,7 @@ struct unary_evaluator, IndexBased> { typedef TriangularView XprType; typedef evaluator::type> Base; + EIGEN_DEVICE_FUNC unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {} }; @@ -728,7 +744,7 @@ struct Dense2Triangular {}; template struct triangular_assignment_loop; - + /** \internal Specialization of the dense assignment kernel for triangular matrices. * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions. * \tparam UpLo must be either Lower or Upper @@ -745,17 +761,17 @@ protected: using Base::m_src; using Base::m_functor; public: - + typedef typename Base::DstEvaluatorType DstEvaluatorType; typedef typename Base::SrcEvaluatorType SrcEvaluatorType; typedef typename Base::Scalar Scalar; typedef typename Base::AssignmentTraits AssignmentTraits; - - + + EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} - + #ifdef EIGEN_INTERNAL_DEBUGGING EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col) { @@ -765,16 +781,16 @@ public: #else using Base::assignCoeff; #endif - + EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id) { if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1)); else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0)); else if(Mode==0) Base::assignCoeff(id,id); } - + EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col) - { + { eigen_internal_assert(row!=col); if(SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0)); @@ -795,17 +811,17 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) dst.resize(dstRows, dstCols); DstEvaluatorType dstEvaluator(dst); - + typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite, DstEvaluatorType,SrcEvaluatorType,Functor> Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); - + enum { unroll = DstXprType::SizeAtCompileTime != Dynamic && SrcEvaluatorType::CoeffReadCost < HugeCost - && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT + && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT }; - + triangular_assignment_loop::run(kernel); } @@ -827,8 +843,8 @@ struct Assignment EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode)); - - call_triangular_assignment_loop(dst, src, func); + + call_triangular_assignment_loop(dst, src, func); } }; @@ -837,7 +853,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -846,7 +862,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -857,19 +873,19 @@ struct triangular_assignment_loop // FIXME: this is not very clean, perhaps this information should be provided by the kernel? typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; - + enum { col = (UnrollCount-1) / DstXprType::RowsAtCompileTime, row = (UnrollCount-1) % DstXprType::RowsAtCompileTime }; - + typedef typename Kernel::Scalar Scalar; EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) { triangular_assignment_loop::run(kernel); - + if(row==col) kernel.assignDiagonalCoeff(row); else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row } else i = maxi; - + if(i * If the matrix is triangular, the opposite part is set to zero. */ template template -void TriangularBase::evalToLazy(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); - internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); + internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); } namespace internal { - + // Triangular = Product template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> struct Assignment, internal::assign_op::Scalar>, Dense2Triangular> @@ -952,7 +968,7 @@ struct Assignment, internal::assign_ if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) dst.resize(dstRows, dstCols); - dst._assignProduct(src, 1, 0); + dst._assignProduct(src, Scalar(1), false); } }; @@ -963,7 +979,7 @@ struct Assignment, internal::add_ass typedef Product SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { - dst._assignProduct(src, 1, 1); + dst._assignProduct(src, Scalar(1), true); } }; @@ -974,7 +990,7 @@ struct Assignment, internal::sub_ass typedef Product SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { - dst._assignProduct(src, -1, 1); + dst._assignProduct(src, Scalar(-1), true); } }; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/VectorBlock.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/VectorBlock.h index d72fbf7e99..71c5b95eec 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/VectorBlock.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/VectorBlock.h @@ -35,7 +35,7 @@ struct traits > * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and * most of the time this is the only way it is used. * - * However, if you want to directly maniputate sub-vector expressions, + * However, if you want to directly manipulate sub-vector expressions, * for instance if you want to write a function returning such an expression, you * will need to use this class. * @@ -71,8 +71,8 @@ template class VectorBlock /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline VectorBlock(VectorType& vector, Index start, Index size) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + VectorBlock(VectorType& vector, Index start, Index size) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 1 : size) @@ -82,8 +82,8 @@ template class VectorBlock /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline VectorBlock(VectorType& vector, Index start) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + VectorBlock(VectorType& vector, Index start) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/VectorwiseOp.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/VectorwiseOp.h index 4fe267e9f1..870f4f1e40 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/VectorwiseOp.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/VectorwiseOp.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2010 Gael Guennebaud +// Copyright (C) 2008-2019 Gael Guennebaud // Copyright (C) 2006-2008 Benoit Jacob // // This Source Code Form is subject to the terms of the Mozilla @@ -65,10 +65,10 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr \ - struct member_##MEMBER { \ - EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ - typedef ResultType result_type; \ - template struct Cost \ - { enum { value = COST }; }; \ - template \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ - ResultType operator()(const XprType& mat) const \ - { return mat.MEMBER(); } \ +template struct partial_redux_dummy_func; + +#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP) \ + template \ + struct member_##MEMBER { \ + EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ + typedef ResultType result_type; \ + typedef BINARYOP BinaryOp; \ + template struct Cost { enum { value = COST }; }; \ + enum { Vectorizable = VECTORIZABLE }; \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ + ResultType operator()(const XprType& mat) const \ + { return mat.MEMBER(); } \ + BinaryOp binaryFunc() const { return BinaryOp(); } \ } +#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \ + EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func) + namespace internal { -EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits >::Cost ); -EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits::AddCost + NumTraits::MulCost); -EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits::MulCost); -template +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits::AddCost, 1, internal::scalar_sum_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_min_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_max_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits::MulCost, 1, internal::scalar_product_op); + +template struct member_lpnorm { typedef ResultType result_type; - template struct Cost + enum { Vectorizable = 0 }; + template struct Cost { enum { value = (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost }; }; EIGEN_DEVICE_FUNC member_lpnorm() {} template @@ -121,17 +128,20 @@ struct member_lpnorm { { return mat.template lpNorm

(); } }; -template +template struct member_redux { + typedef BinaryOpT BinaryOp; typedef typename result_of< BinaryOp(const Scalar&,const Scalar&) >::type result_type; - template struct Cost - { enum { value = (Size-1) * functor_traits::Cost }; }; + + enum { Vectorizable = functor_traits::PacketAccess }; + template struct Cost { enum { value = (Size-1) * functor_traits::Cost }; }; EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {} template EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase& mat) const { return mat.redux(m_functor); } + const BinaryOp& binaryFunc() const { return m_functor; } const BinaryOp m_functor; }; } @@ -139,18 +149,38 @@ struct member_redux { /** \class VectorwiseOp * \ingroup Core_Module * - * \brief Pseudo expression providing partial reduction operations + * \brief Pseudo expression providing broadcasting and partial reduction operations * * \tparam ExpressionType the type of the object on which to do partial reductions - * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal) + * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal) * - * This class represents a pseudo expression with partial reduction features. + * This class represents a pseudo expression with broadcasting and partial reduction features. * It is the return type of DenseBase::colwise() and DenseBase::rowwise() - * and most of the time this is the only way it is used. + * and most of the time this is the only way it is explicitly used. + * + * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()` + * where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each + * column of `A` and then re-assemble the outputs in a matrix expression: + * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode * * Example: \include MatrixBase_colwise.cpp * Output: \verbinclude MatrixBase_colwise.out * + * The begin() and end() methods are obviously exceptions to the previous rule as they + * return STL-compatible begin/end iterators to the rows or columns of the nested expression. + * Typical use cases include for-range-loop and calls to STL algorithms: + * + * Example: \include MatrixBase_colwise_iterator_cxx11.cpp + * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out + * + * For a partial reduction on an empty input, some rules apply. + * For the sake of clarity, let's consider a vertical reduction: + * - If the number of columns is zero, then a 1x0 row-major vector expression is returned. + * - Otherwise, if the number of rows is zero, then + * - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.) + * - a row vector of ones is returned for a product reduction (e.g., MatrixXd(n,0).colwise().prod()) + * - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op)) + * * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr */ template class VectorwiseOp @@ -163,11 +193,11 @@ template class VectorwiseOp typedef typename internal::ref_selector::non_const_type ExpressionTypeNested; typedef typename internal::remove_all::type ExpressionTypeNestedCleaned; - template class Functor, - typename Scalar_=Scalar> struct ReturnType + template class Functor, + typename ReturnScalar=Scalar> struct ReturnType { typedef PartialReduxExpr, + Functor, Direction > Type; }; @@ -187,23 +217,6 @@ template class VectorwiseOp protected: - typedef typename internal::conditional::type SubVector; - /** \internal - * \returns the i-th subvector according to the \c Direction */ - EIGEN_DEVICE_FUNC - SubVector subVector(Index i) - { - return SubVector(m_matrix.derived(),i); - } - - /** \internal - * \returns the number of subvectors in the direction \c Direction */ - EIGEN_DEVICE_FUNC - Index subVectors() const - { return isVertical?m_matrix.cols():m_matrix.rows(); } - template struct ExtendedType { typedef Replicate class VectorwiseOp EIGEN_DEVICE_FUNC inline const ExpressionType& _expression() const { return m_matrix; } + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** STL-like RandomAccessIterator + * iterator type over the columns or rows as returned by the begin() and end() methods. + */ + random_access_iterator_type iterator; + /** This is the const version of iterator (aka read-only) */ + random_access_iterator_type const_iterator; + #else + typedef internal::subvector_stl_iterator iterator; + typedef internal::subvector_stl_iterator const_iterator; + typedef internal::subvector_stl_reverse_iterator reverse_iterator; + typedef internal::subvector_stl_reverse_iterator const_reverse_iterator; + #endif + + /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression. + * \sa end(), cbegin() + */ + iterator begin() { return iterator (m_matrix, 0); } + /** const version of begin() */ + const_iterator begin() const { return const_iterator(m_matrix, 0); } + /** const version of begin() */ + const_iterator cbegin() const { return const_iterator(m_matrix, 0); } + + /** returns a reverse iterator to the last row (rowwise) or column (colwise) of the nested expression. + * \sa rend(), crbegin() + */ + reverse_iterator rbegin() { return reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + /** const version of rbegin() */ + const_reverse_iterator rbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + /** const version of rbegin() */ + const_reverse_iterator crbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + + /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression + * \sa begin(), cend() + */ + iterator end() { return iterator (m_matrix, m_matrix.template subVectors()); } + /** const version of end() */ + const_iterator end() const { return const_iterator(m_matrix, m_matrix.template subVectors()); } + /** const version of end() */ + const_iterator cend() const { return const_iterator(m_matrix, m_matrix.template subVectors()); } + + /** returns a reverse iterator to the row (resp. column) before the first row (resp. column) of the nested expression + * \sa begin(), cend() + */ + reverse_iterator rend() { return reverse_iterator (m_matrix, -1); } + /** const version of rend() */ + const_reverse_iterator rend() const { return const_reverse_iterator (m_matrix, -1); } + /** const version of rend() */ + const_reverse_iterator crend() const { return const_reverse_iterator (m_matrix, -1); } + /** \returns a row or column vector expression of \c *this reduxed by \a func * * The template parameter \a BinaryOp is the type of the functor * of the custom redux operator. Note that func must be an associative operator. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise() */ template EIGEN_DEVICE_FUNC const typename ReduxReturnType::Type redux(const BinaryOp& func = BinaryOp()) const - { return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); + } typedef typename ReturnType::Type MinCoeffReturnType; typedef typename ReturnType::Type MaxCoeffReturnType; - typedef typename ReturnType::Type SquaredNormReturnType; - typedef typename ReturnType::Type NormReturnType; + typedef PartialReduxExpr, const ExpressionTypeNestedCleaned>,internal::member_sum,Direction> SquaredNormReturnType; + typedef CwiseUnaryOp, const SquaredNormReturnType> NormReturnType; typedef typename ReturnType::Type BlueNormReturnType; typedef typename ReturnType::Type StableNormReturnType; typedef typename ReturnType::Type HypotNormReturnType; typedef typename ReturnType::Type SumReturnType; - typedef typename ReturnType::Type MeanReturnType; + typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType; typedef typename ReturnType::Type AllReturnType; typedef typename ReturnType::Type AnyReturnType; - typedef PartialReduxExpr, Direction> CountReturnType; + typedef PartialReduxExpr, Direction> CountReturnType; typedef typename ReturnType::Type ProdReturnType; typedef Reverse ConstReverseReturnType; typedef Reverse ReverseReturnType; template struct LpNormReturnType { - typedef PartialReduxExpr,Direction> Type; + typedef PartialReduxExpr,Direction> Type; }; /** \returns a row (or column) vector expression of the smallest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_minCoeff.cpp @@ -302,11 +374,17 @@ template class VectorwiseOp * \sa DenseBase::minCoeff() */ EIGEN_DEVICE_FUNC const MinCoeffReturnType minCoeff() const - { return MinCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MinCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the largest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_maxCoeff.cpp @@ -315,7 +393,10 @@ template class VectorwiseOp * \sa DenseBase::maxCoeff() */ EIGEN_DEVICE_FUNC const MaxCoeffReturnType maxCoeff() const - { return MaxCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MaxCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the squared norm * of each column (or row) of the referenced expression. @@ -327,7 +408,7 @@ template class VectorwiseOp * \sa DenseBase::squaredNorm() */ EIGEN_DEVICE_FUNC const SquaredNormReturnType squaredNorm() const - { return SquaredNormReturnType(_expression()); } + { return SquaredNormReturnType(m_matrix.cwiseAbs2()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -339,7 +420,7 @@ template class VectorwiseOp * \sa DenseBase::norm() */ EIGEN_DEVICE_FUNC const NormReturnType norm() const - { return NormReturnType(_expression()); } + { return NormReturnType(squaredNorm()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -404,7 +485,7 @@ template class VectorwiseOp * \sa DenseBase::mean() */ EIGEN_DEVICE_FUNC const MeanReturnType mean() const - { return MeanReturnType(_expression()); } + { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); } /** \returns a row (or column) vector expression representing * whether \b all coefficients of each respective column (or row) are \c true. @@ -500,7 +581,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME - return const_cast(m_matrix = extendedTo(other.derived())); + return m_matrix = extendedTo(other.derived()); } /** Adds the vector \a other to each subvector of \c *this */ @@ -510,7 +591,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast(m_matrix += extendedTo(other.derived())); + return m_matrix += extendedTo(other.derived()); } /** Substracts the vector \a other to each subvector of \c *this */ @@ -520,7 +601,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast(m_matrix -= extendedTo(other.derived())); + return m_matrix -= extendedTo(other.derived()); } /** Multiples each subvector of \c *this by the vector \a other */ @@ -532,7 +613,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix *= extendedTo(other.derived()); - return const_cast(m_matrix); + return m_matrix; } /** Divides each subvector of \c *this by the vector \a other */ @@ -544,7 +625,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix /= extendedTo(other.derived()); - return const_cast(m_matrix); + return m_matrix; } /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */ @@ -609,7 +690,7 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC CwiseBinaryOp, const ExpressionTypeNestedCleaned, - const typename OppositeExtendedType::Type>::Type> + const typename OppositeExtendedType::Type> normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); } @@ -658,7 +739,15 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC const HNormalizedReturnType hnormalized() const; +# ifdef EIGEN_VECTORWISEOP_PLUGIN +# include EIGEN_VECTORWISEOP_PLUGIN +# endif + protected: + Index redux_length() const + { + return Direction==Vertical ? m_matrix.rows() : m_matrix.cols(); + } ExpressionTypeNested m_matrix; }; @@ -670,7 +759,7 @@ template class VectorwiseOp * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::ColwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ColwiseReturnType DenseBase::colwise() { return ColwiseReturnType(derived()); @@ -684,7 +773,7 @@ DenseBase::colwise() * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::RowwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::RowwiseReturnType DenseBase::rowwise() { return RowwiseReturnType(derived()); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Visitor.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Visitor.h index 54c1883d98..00bcca8776 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Visitor.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/Visitor.h @@ -10,7 +10,7 @@ #ifndef EIGEN_VISITOR_H #define EIGEN_VISITOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -40,6 +40,14 @@ struct visitor_impl } }; +// This specialization enables visitors on empty matrices at compile-time +template +struct visitor_impl { + EIGEN_DEVICE_FUNC + static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/) + {} +}; + template struct visitor_impl { @@ -62,22 +70,22 @@ class visitor_evaluator public: EIGEN_DEVICE_FUNC explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - + enum { RowsAtCompileTime = XprType::RowsAtCompileTime, CoeffReadCost = internal::evaluator::CoeffReadCost }; - - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const { return m_evaluator.coeff(row, col); } - + protected: internal::evaluator m_evaluator; const XprType &m_xpr; @@ -99,6 +107,8 @@ protected: * \note compared to one or two \em for \em loops, visitors offer automatic * unrolling for small fixed size matrix. * + * \note if the matrix is empty, then the visitor is left unchanged. + * * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux() */ template @@ -106,12 +116,15 @@ template EIGEN_DEVICE_FUNC void DenseBase::visit(Visitor& visitor) const { + if(size()==0) + return; + typedef typename internal::visitor_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - + enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits::Cost <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits::Cost) <= EIGEN_UNROLLING_LIMIT }; return internal::visitor_impl::run(thisEval, visitor); } @@ -124,6 +137,9 @@ namespace internal { template struct coeff_visitor { + // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc + EIGEN_DEVICE_FUNC + coeff_visitor() : row(-1), col(-1), res(0) {} typedef typename Derived::Scalar Scalar; Index row, col; Scalar res; @@ -141,7 +157,7 @@ struct coeff_visitor * * \sa DenseBase::minCoeff(Index*, Index*) */ -template +template struct min_coeff_visitor : coeff_visitor { typedef typename Derived::Scalar Scalar; @@ -157,8 +173,40 @@ struct min_coeff_visitor : coeff_visitor } }; -template -struct functor_traits > { +template +struct min_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res)) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template +struct min_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(value) || value < this->res) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template + struct functor_traits > { enum { Cost = NumTraits::AddCost }; @@ -169,10 +217,10 @@ struct functor_traits > { * * \sa DenseBase::maxCoeff(Index*, Index*) */ -template +template struct max_coeff_visitor : coeff_visitor { - typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar Scalar; EIGEN_DEVICE_FUNC void operator() (const Scalar& value, Index i, Index j) { @@ -185,8 +233,40 @@ struct max_coeff_visitor : coeff_visitor } }; -template -struct functor_traits > { +template +struct max_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res)) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template +struct max_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(value) || value > this->res) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template +struct functor_traits > { enum { Cost = NumTraits::AddCost }; @@ -196,17 +276,24 @@ struct functor_traits > { /** \fn DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const * \returns the minimum of all coefficients of *this and puts in *row and *col its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const { - internal::min_coeff_visitor minVisitor; + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + + internal::min_coeff_visitor minVisitor; this->visit(minVisitor); *rowId = minVisitor.row; if (colId) *colId = minVisitor.col; @@ -214,18 +301,25 @@ DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const } /** \returns the minimum of all coefficients of *this and puts in *index its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - internal::min_coeff_visitor minVisitor; + internal::min_coeff_visitor minVisitor; this->visit(minVisitor); *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row); return minVisitor.res; @@ -233,17 +327,24 @@ DenseBase::minCoeff(IndexType* index) const /** \fn DenseBase::maxCoeff(IndexType* rowId, IndexType* colId) const * \returns the maximum of all coefficients of *this and puts in *row and *col its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const { - internal::max_coeff_visitor maxVisitor; + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + + internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); *rowPtr = maxVisitor.row; if (colPtr) *colPtr = maxVisitor.col; @@ -251,18 +352,25 @@ DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const } /** \returns the maximum of all coefficients of *this and puts in *index its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - internal::max_coeff_visitor maxVisitor; + internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row; return maxVisitor.res; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/Complex.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/Complex.h index 7fa61969dc..ab7bd6c651 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/Complex.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/Complex.h @@ -22,6 +22,7 @@ struct Packet4cf __m256 v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet4cf type; @@ -37,6 +38,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -44,8 +46,20 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet8f as_real; + enum { + size=4, + alignment=Aligned32, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -67,10 +81,17 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con return Packet4cf(result); } +template <> +EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { + __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu(&numext::real_ref(*from))); } @@ -140,70 +161,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe Packet2cf(_mm256_extractf128_ps(a.v,1)))); } -template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs) -{ - Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); - Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); - t0 = _mm256_hadd_ps(t0,t1); - Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); - Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); - t2 = _mm256_hadd_ps(t2,t3); - - t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); - t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); - - return Packet4cf(_mm256_add_ps(t1,t3)); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) { return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) @@ -228,6 +191,7 @@ struct Packet2cd __m256d v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet2cd type; @@ -243,6 +207,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -250,8 +215,20 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet4d as_real; + enum { + size=2, + alignment=Aligned32, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } @@ -272,10 +249,17 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con return Packet2cd(_mm256_addsub_pd(even, odd)); } +template <> +EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { + __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } @@ -327,63 +311,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template<> EIGEN_STRONG_INLINE Packet2cd preduxp(const Packet2cd* vecs) -{ - Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); - Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); - - return Packet2cd(_mm256_add_pd(t0,t1)); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd& a) { return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) @@ -424,24 +357,12 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0].v = tmp; } -template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex b) -{ - return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,1|2)); +template<> EIGEN_STRONG_INLINE Packet2cd psqrt(const Packet2cd& a) { + return psqrt_complex(a); } -template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex b) -{ - return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,1|2)); -} - -template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex b) -{ - return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,(1<<7)|(1<<6))); -} - -template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex b) -{ - return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,(1<<3)|(1<<2))); +template<> EIGEN_STRONG_INLINE Packet4cf psqrt(const Packet4cf& a) { + return psqrt_complex(a); } } // end namespace internal diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/MathFunctions.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/MathFunctions.h index 6af67ce2d6..67041c812c 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,7 +10,7 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -/* The sin, cos, exp, and log functions of this file are loosely derived from +/* The sin and cos functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -18,187 +18,50 @@ namespace Eigen { namespace internal { -inline Packet8i pshiftleft(Packet8i v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi32(v, n); -#else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); - __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif -} - -inline Packet8f pshiftright(Packet8f v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); - return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); -#endif -} - -// Sine function -// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and -// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants -// are (anti-)symmetric and thus have only odd/even coefficients template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f psin(const Packet8f& _x) { - Packet8f x = _x; - - // Some useful values. - _EIGEN_DECLARE_CONST_Packet8i(one, 1); - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); - _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); - - // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period. - Packet8f z = pmul(x, p8f_one_over_pi); - Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four)); - x = pmadd(shift, p8f_neg_pi_first, x); - x = pmadd(shift, p8f_neg_pi_second, x); - x = pmadd(shift, p8f_neg_pi_third, x); - z = pmul(x, p8f_four_over_pi); - - // Make a mask for the entries that need flipping, i.e. wherever the shift - // is odd. - Packet8i shift_ints = _mm256_cvtps_epi32(shift); - Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); - Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); - - // Create a mask for which interpolant to use, i.e. if z > 1, then the mask - // is set to ones for that entry. - Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ); - - // Evaluate the polynomial for the interval [1,3] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); - Packet8f z_minus_two = psub(z, p8f_two); - Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); - Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); - right = pmadd(right, z_minus_two2, p8f_coeff_right_2); - right = pmadd(right, z_minus_two2, p8f_coeff_right_0); - - // Evaluate the polynomial for the interval [-1,1] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); - Packet8f z2 = pmul(z, z); - Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); - left = pmadd(left, z2, p8f_coeff_left_3); - left = pmadd(left, z2, p8f_coeff_left_1); - left = pmul(left, z); - - // Assemble the results, i.e. select the left and right polynomials. - left = _mm256_andnot_ps(ival_mask, left); - right = _mm256_and_ps(ival_mask, right); - Packet8f res = _mm256_or_ps(left, right); - - // Flip the sign on the odd intervals and return the result. - res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); - return res; + return psin_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +pcos(const Packet8f& _x) { + return pcos_float(_x); } -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -// TODO(gonnet): Further reduce the interval allowing for lower-degree -// polynomial interpolants -> ... -> profit! template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f plog(const Packet8f& _x) { - Packet8f x = _x; - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); + return plog_float(_x); +} - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +plog(const Packet4d& _x) { + return plog_double(_x); +} - // The smallest non denormalized float number. - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +plog2(const Packet8f& _x) { + return plog2_float(_x); +} - // Polynomial coefficients. - _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +plog2(const Packet4d& _x) { + return plog2_double(_x); +} - Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN - Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f plog1p(const Packet8f& _x) { + return generic_plog1p(_x); +} - // Truncate input values to the minimum positive normal. - x = pmax(x, p8f_min_norm_pos); - - Packet8f emm0 = pshiftright(x,23); - Packet8f e = _mm256_sub_ps(emm0, p8f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm256_and_ps(x, p8f_inv_mant_mask); - x = _mm256_or_ps(x, p8f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); - Packet8f tmp = _mm256_and_ps(x, mask); - x = psub(x, p8f_1); - e = psub(e, _mm256_and_ps(p8f_1, mask)); - x = padd(x, tmp); - - Packet8f x2 = pmul(x, x); - Packet8f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet8f y, y1, y2; - y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); - y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); - y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); - y = pmadd(y, x, p8f_cephes_log_p2); - y1 = pmadd(y1, x, p8f_cephes_log_p5); - y2 = pmadd(y2, x, p8f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p8f_cephes_log_q1); - tmp = pmul(x2, p8f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p8f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm256_or_ps( - _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), - _mm256_and_ps(iszero_mask, p8f_minus_inf)); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f pexpm1(const Packet8f& _x) { + return generic_expm1(_x); } // Exponential function. Works by writing "x = m*log(2) + r" where @@ -207,149 +70,21 @@ plog(const Packet8f& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f pexp(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); - - _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); - - // Clamp x. - Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); - - // Express exp(x) as exp(m*ln(2) + r), start by extracting - // m = floor(x/ln(2) + 0.5). - Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); - -// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is -// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating -// truncation errors. Note that we don't use the "pmadd" function here to -// ensure that a precision-preserving FMA instruction is used. -#ifdef EIGEN_VECTORIZE_FMA - _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); - Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); -#else - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); - Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); - r = psub(r, pmul(m, p8f_cephes_exp_C2)); -#endif - - Packet8f r2 = pmul(r, r); - - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. - Packet8f y = p8f_cephes_exp_p0; - y = pmadd(y, r, p8f_cephes_exp_p1); - y = pmadd(y, r, p8f_cephes_exp_p2); - y = pmadd(y, r, p8f_cephes_exp_p3); - y = pmadd(y, r, p8f_cephes_exp_p4); - y = pmadd(y, r, p8f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p8f_1); - - // Build emm0 = 2^m. - Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); - emm0 = pshiftleft(emm0, 23); - - // Return 2^m * exp(r). - return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); + return pexp_float(_x); } // Hyperbolic Tangent function. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -ptanh(const Packet8f& x) { - return internal::generic_fast_tanh_float(x); +ptanh(const Packet8f& _x) { + return internal::generic_fast_tanh_float(_x); } +// Exponential function for doubles. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d pexp(const Packet4d& _x) { - Packet4d x = _x; - - _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); - _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); - _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); - - Packet4d tmp, fx; - - // clamp x - x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); - // Express exp(x) as exp(g + n*log(2)). - fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); - - // Get the integer modulus of log(2), i.e. the "n" described above. - fx = _mm256_floor_pd(fx); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - tmp = pmul(fx, p4d_cephes_exp_C1); - Packet4d z = pmul(fx, p4d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet4d x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. - Packet4d px = p4d_cephes_exp_p0; - px = pmadd(px, x2, p4d_cephes_exp_p1); - px = pmadd(px, x2, p4d_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet4d qx = p4d_cephes_exp_q0; - qx = pmadd(qx, x2, p4d_cephes_exp_q1); - qx = pmadd(qx, x2, p4d_cephes_exp_q2); - qx = pmadd(qx, x2, p4d_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm256_div_pd(px, psub(qx, px)); - x = pmadd(p4d_2, x, p4d_1); - - // Build e=2^n by constructing the exponents in a 128-bit vector and - // shifting them to where they belong in double-precision values. - __m128i emm0 = _mm256_cvtpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i lo = _mm_slli_epi64(emm0, 52); - __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); - __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); - e = _mm256_insertf128_si256(e, hi, 1); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); + return pexp_double(_x); } // Functions for sqrt. @@ -362,37 +97,39 @@ pexp(const Packet4d& _x) { // For detail see here: http://www.beyond3d.com/content/articles/8/ #if EIGEN_FAST_MATH template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -psqrt(const Packet8f& _x) { - Packet8f half = pmul(_x, pset1(.5f)); - Packet8f denormal_mask = _mm256_and_ps( - _mm256_cmp_ps(_x, pset1((std::numeric_limits::min)()), - _CMP_LT_OQ), - _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ)); +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f psqrt(const Packet8f& _x) { + Packet8f minus_half_x = pmul(_x, pset1(-0.5f)); + Packet8f denormal_mask = pandnot( + pcmp_lt(_x, pset1((std::numeric_limits::min)())), + pcmp_lt(_x, pzero(_x))); // Compute approximate reciprocal sqrt. Packet8f x = _mm256_rsqrt_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1(1.5f))); // Flush results for denormals to zero. - return _mm256_andnot_ps(denormal_mask, pmul(_x,x)); + return pandnot(pmul(_x,x), denormal_mask); } -#else -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f psqrt(const Packet8f& x) { - return _mm256_sqrt_ps(x); -} -#endif -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d psqrt(const Packet4d& x) { - return _mm256_sqrt_pd(x); -} -#if EIGEN_FAST_MATH +#else + +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f psqrt(const Packet8f& _x) { + return _mm256_sqrt_ps(_x); +} + +#endif + +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4d psqrt(const Packet4d& _x) { + return _mm256_sqrt_pd(_x); +} + +#if EIGEN_FAST_MATH template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f prsqrt(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); @@ -401,36 +138,88 @@ Packet8f prsqrt(const Packet8f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). - Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); - Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); + Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); + Packet8f inf_mask = _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ); + Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask); - // Fill in NaNs and Infs for the negative/zero entries. - Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); - Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); - Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), - _mm256_and_ps(zero_mask, p8f_inf)); + // Compute an approximate result using the rsqrt intrinsic. + Packet8f y_approx = _mm256_rsqrt_ps(_x); - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five)); - // Insert NaNs and Infs in all the right places. - return _mm256_or_ps(x, infs_and_nans); + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). + return pselect(not_normal_finite_mask, y_approx, y_newton); } #else template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f prsqrt(const Packet8f& x) { +Packet8f prsqrt(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x)); + return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x)); } #endif template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d prsqrt(const Packet4d& x) { +Packet4d prsqrt(const Packet4d& _x) { _EIGEN_DECLARE_CONST_Packet4d(one, 1.0); - return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x)); + return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x)); } +F16_PACKET_FUNCTION(Packet8f, Packet8h, psin) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp) +F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh) +F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt) +F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt) + +template <> +EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) { + Packet8f fexponent; + const Packet8h out = float2half(pfrexp(half2float(a), fexponent)); + exponent = float2half(fexponent); + return out; +} + +template <> +EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) { + return float2half(pldexp(half2float(a), half2float(exponent))); +} + +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt) + +template <> +EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) { + Packet8f fexponent; + const Packet8bf out = F32ToBf16(pfrexp(Bf16ToF32(a), fexponent)); + exponent = F32ToBf16(fexponent); + return out; +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) { + return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent))); +} } // end namespace internal diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/PacketMath.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/PacketMath.h index 923a124b20..7fc32fd719 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/PacketMath.h @@ -18,11 +18,11 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif @@ -31,10 +31,14 @@ namespace internal { typedef __m256 Packet8f; typedef __m256i Packet8i; typedef __m256d Packet4d; +typedef eigen_packet_wrapper<__m128i, 2> Packet8h; +typedef eigen_packet_wrapper<__m128i, 3> Packet8bf; template<> struct is_arithmetic<__m256> { enum { value = true }; }; template<> struct is_arithmetic<__m256i> { enum { value = true }; }; template<> struct is_arithmetic<__m256d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ const Packet8f p8f_##NAME = pset1(X) @@ -58,21 +62,28 @@ template<> struct packet_traits : default_packet_traits enum { Vectorizable = 1, AlignedOnScalar = 1, - size=8, + size = 8, HasHalfPacket = 1, - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = 0, - HasLog = 1, - HasExp = 1, + HasCmp = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasNdtri = 1, + HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, HasBlend = 1, HasRound = 1, HasFloor = 1, - HasCeil = 1 + HasCeil = 1, + HasRint = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -85,14 +96,104 @@ template<> struct packet_traits : default_packet_traits size=4, HasHalfPacket = 1, + HasCmp = 1, HasDiv = 1, + HasLog = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, HasBlend = 1, HasRound = 1, HasFloor = 1, - HasCeil = 1 + HasCeil = 1, + HasRint = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8h type; + // There is no half-size packet for Packet8h. + typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasBessel = 1, + HasNdtri = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8bf type; + // There is no half-size packet for current Packet8bf. + // TODO: support as SSE path. + typedef Packet8bf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasBessel = 1, + HasNdtri = 1 }; }; #endif @@ -113,14 +214,45 @@ template<> struct packet_traits : default_packet_traits }; */ -template<> struct unpacket_traits { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; +template<> struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet8i integer_packet; + typedef uint8_t mask_t; + enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true}; +}; +template<> struct unpacket_traits { + typedef double type; + typedef Packet2d half; + enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; }; +template<> struct unpacket_traits { typedef bfloat16 type; typedef Packet8bf half; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; + +// Helper function for bit packing snippet of low precision comparison. +// It packs the flags from 16x16 to 8x16. +EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) { + return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0), + _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); +} + template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet8f pset1frombits(unsigned int from) { return _mm256_castsi256_ps(pset1(from)); } +template<> EIGEN_STRONG_INLINE Packet4d pset1frombits(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); } + + +template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); } +template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); } +template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); } + template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } @@ -129,9 +261,27 @@ template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { retur template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_add_epi32(a,b); +#else + __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_sub_epi32(a,b); +#else + __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { @@ -148,7 +298,15 @@ template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8f pmul(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pmul(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } - +template<> EIGEN_STRONG_INLINE Packet8i pmul(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_mullo_epi32(a,b); +#else + const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pdiv(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pdiv(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } @@ -157,7 +315,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co return pset1(0); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { #if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, @@ -184,14 +342,112 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& } #endif -template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } -template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); } -template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } -template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } + +template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpeq_epi32(a,b); +#else + __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may flip + // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + Packet8f res; + asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::min. + return _mm256_min_ps(b,a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet4d res; + asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::min. + return _mm256_min_pd(b,a); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet8f res; + asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::max. + return _mm256_max_ps(b,a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet4d res; + asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::max. + return _mm256_max_pd(b,a); +#endif +} + +// Add specializations for min/max with prescribed NaN progation. +template<> +EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_nan(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_nan(a, b, pmax); +} + +template<> EIGEN_STRONG_INLINE Packet8f print(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet4d print(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet8f pceil(const Packet8f& a) { return _mm256_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { return _mm256_ceil_pd(a); } @@ -199,17 +455,124 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet8i ptrue(const Packet8i& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + return _mm256_cmpeq_epi32(a,a); +#else + const __m256 b = _mm256_castsi256_ps(a); + return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + const __m256i b = _mm256_castps_si256(a); + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b)); +#else + return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4d ptrue(const Packet4d& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqq has lower latency than the more general vcmppd + const __m256i b = _mm256_castpd_si256(a); + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b)); +#else + return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ); +#endif +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_and_si256(a,b); +#else + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i por(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_or_si256(a,b); +#else + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pxor(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_xor_si256(a,b); +#else + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} -template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet8i pandnot(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_andnot_si256(b,a); +#else + return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a))); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) +{ + const Packet8f mask = pset1frombits(static_cast(0x80000000u)); + const Packet8f prev0dot5 = pset1frombits(static_cast(0x3EFFFFFFu)); + return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} +template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) +{ + const Packet4d mask = pset1frombits(static_cast(0x8000000000000000ull)); + const Packet4d prev0dot5 = pset1frombits(static_cast(0x3FDFFFFFFFFFFFFFull)); + return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) +{ return _mm256_blendv_ps(b,a,mask); } +template<> EIGEN_STRONG_INLINE Packet4d pselect(const Packet4d& mask, const Packet4d& a, const Packet4d& b) +{ return _mm256_blendv_pd(b,a,mask); } + +template EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srai_epi32(a, N); +#else + __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srli_epi32(a, N); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(a, N); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } @@ -219,6 +582,14 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { EI template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from, uint8_t umask) { + Packet8i mask = _mm256_set1_epi8(static_cast(umask)); + const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + mask = por(mask, bit_mask); + mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask); +} + // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { @@ -226,7 +597,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); // return _mm256_unpacklo_ps(tmp,tmp); - + // _mm256_insertf128_ps is very slow on Haswell, thus: Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); // mimic an "inplace" permutation of the lower 128bits using a blend @@ -256,6 +627,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { + Packet8i mask = _mm256_set1_epi8(static_cast(umask)); + const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + mask = por(mask, bit_mask); + mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); + EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from); +} + // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); template<> EIGEN_DEVICE_FUNC inline Packet8f pgather(const float* from, Index stride) @@ -354,47 +733,66 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) return _mm256_and_pd(a,mask); } -// preduxp should be ok -// FIXME: why is this ok? why isn't the simply implementation working as expected? -template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) -{ - __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); - __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); - __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); - __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); - - __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); - __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); - __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); - __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - __m256 sum1 = _mm256_add_ps(perm1, hsum5); - __m256 sum2 = _mm256_add_ps(perm2, hsum6); - __m256 sum3 = _mm256_add_ps(perm3, hsum7); - __m256 sum4 = _mm256_add_ps(perm4, hsum8); - - __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); - return final; +template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Packet8f& exponent) { + return pfrexp_generic(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) -{ - Packet4d tmp0, tmp1; - tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); +// Extract exponent without existence of Packet4l. +template<> +EIGEN_STRONG_INLINE +Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) { + const Packet4d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); + __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask)); +#ifdef EIGEN_VECTORIZE_AVX2 + a_expo = _mm256_srli_epi64(a_expo, 52); + __m128i lo = _mm256_extractf128_si256(a_expo, 0); + __m128i hi = _mm256_extractf128_si256(a_expo, 1); +#else + __m128i lo = _mm256_extractf128_si256(a_expo, 0); + __m128i hi = _mm256_extractf128_si256(a_expo, 1); + lo = _mm_srli_epi64(lo, 52); + hi = _mm_srli_epi64(hi, 52); +#endif + Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3)); + Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3)); + Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0); + exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1); + return exponent; +} - tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - return _mm256_blend_pd(tmp0, tmp1, 0xC); +template<> EIGEN_STRONG_INLINE Packet4d pfrexp(const Packet4d& a, Packet4d& exponent) { + return pfrexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet8f pldexp(const Packet8f& a, const Packet8f& exponent) { + return pldexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet4d pldexp(const Packet4d& a, const Packet4d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet4d max_exponent = pset1(2099.0); + const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + + // Split 2^e into four factors and multiply. + const Packet4i bias = pset1(1023); + Packet4i b = parithmetic_shift_right<2>(e); // floor(e/4) + + // 2^b + Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3); + Packet4i lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + Packet4d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + + // 2^(e - 3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3); + lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + out = pmul(out, c); // a * 2^e + return out; } template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) @@ -406,7 +804,7 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); } -template<> EIGEN_STRONG_INLINE Packet4f predux_downto4(const Packet8f& a) +template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); } @@ -450,93 +848,16 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x) +// { +// return _mm256_movemask_ps(x)==0xFF; +// } -template -struct palign_impl +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) { - static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) - { - if (Offset==1) - { - first = _mm256_blend_ps(first, second, 1); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0x88); - } - else if (Offset==2) - { - first = _mm256_blend_ps(first, second, 3); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xcc); - } - else if (Offset==3) - { - first = _mm256_blend_ps(first, second, 7); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xee); - } - else if (Offset==4) - { - first = _mm256_blend_ps(first, second, 15); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); - } - else if (Offset==5) - { - first = _mm256_blend_ps(first, second, 31); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0x88); - } - else if (Offset==6) - { - first = _mm256_blend_ps(first, second, 63); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xcc); - } - else if (Offset==7) - { - first = _mm256_blend_ps(first, second, 127); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xee); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) - { - if (Offset==1) - { - first = _mm256_blend_pd(first, second, 1); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 0xA); - } - else if (Offset==2) - { - first = _mm256_blend_pd(first, second, 3); - first = _mm256_permute2f128_pd(first, first, 1); - } - else if (Offset==3) - { - first = _mm256_blend_pd(first, second, 7); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 5); - } - } -}; + return _mm256_movemask_ps(x)!=0; +} EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -610,24 +931,640 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); } -template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) -{ - return _mm256_blend_ps(a,pset1(b),1); +// Packet math for Eigen::half + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { + return _mm_set1_epi16(numext::bit_cast(from)); } -template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b) -{ - return _mm256_blend_pd(a,pset1(b),1); +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { + return numext::bit_cast(static_cast(_mm_extract_epi16(from, 0))); } -template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) -{ - return _mm256_blend_ps(a,pset1(b),(1<<7)); +template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { + return _mm_load_si128(reinterpret_cast(from)); } -template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) +template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { + return _mm_loadu_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploaddup(const Eigen::half* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + const numext::uint16_t c = numext::bit_cast(from[2]); + const numext::uint16_t d = numext::bit_cast(from[3]); + return _mm_set_epi16(d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad(const Eigen::half* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + return _mm_set_epi16(b, b, b, b, a, a, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { + return _mm_cmpeq_epi32(a, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) { + const __m128i sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_andnot_si128(sign_mask, a); +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + const numext::uint16_t s0 = numext::bit_cast(Eigen::half(aux[0])); + const numext::uint16_t s1 = numext::bit_cast(Eigen::half(aux[1])); + const numext::uint16_t s2 = numext::bit_cast(Eigen::half(aux[2])); + const numext::uint16_t s3 = numext::bit_cast(Eigen::half(aux[3])); + const numext::uint16_t s4 = numext::bit_cast(Eigen::half(aux[4])); + const numext::uint16_t s5 = numext::bit_cast(Eigen::half(aux[5])); + const numext::uint16_t s6 = numext::bit_cast(Eigen::half(aux[6])); + const numext::uint16_t s7 = numext::bit_cast(Eigen::half(aux[7])); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmin(const Packet8h& a, + const Packet8h& b) { + return float2half(pmin(half2float(a), half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmax(const Packet8h& a, + const Packet8h& b) { + return float2half(pmax(half2float(a), half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h plset(const half& a) { + return float2half(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { + // in some cases Packet4i is a wrapper around __m128i, so we either need to + // cast to Packet4i to directly call the intrinsics as below: + return _mm_or_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { + return _mm_xor_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { + return _mm_and_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { + return _mm_andnot_si128(b,a); +} + +template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { + return _mm_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet8h pround(const Packet8h& a) { + return float2half(pround(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h print(const Packet8h& a) { + return float2half(print(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pceil(const Packet8h& a) { + return float2half(pceil(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pfloor(const Packet8h& a) { + return float2half(pfloor(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_eq(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_le(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_lt(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + Packet8h sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_xor_si128(a, sign_mask); +} + +template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) { - return _mm256_blend_pd(a,pset1(b),(1<<3)); + const numext::uint16_t s0 = numext::bit_cast(from[0*stride]); + const numext::uint16_t s1 = numext::bit_cast(from[1*stride]); + const numext::uint16_t s2 = numext::bit_cast(from[2*stride]); + const numext::uint16_t s3 = numext::bit_cast(from[3*stride]); + const numext::uint16_t s4 = numext::bit_cast(from[4*stride]); + const numext::uint16_t s5 = numext::bit_cast(from[5*stride]); + const numext::uint16_t s6 = numext::bit_cast(from[6*stride]); + const numext::uint16_t s7 = numext::bit_cast(from[7*stride]); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) +{ + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + return _mm_shuffle_epi8(a,m); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + __m128i e = kernel.packet[4]; + __m128i f = kernel.packet[5]; + __m128i g = kernel.packet[6]; + __m128i h = kernel.packet[7]; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0] = a0b0c0d0e0f0g0h0; + kernel.packet[1] = a1b1c1d1e1f1g1h1; + kernel.packet[2] = a2b2c2d2e2f2g2h2; + kernel.packet[3] = a3b3c3d3e3f3g3h3; + kernel.packet[4] = a4b4c4d4e4f4g4h4; + kernel.packet[5] = a5b5c5d5e5f5g5h5; + kernel.packet[6] = a6b6c6d6e6f6g6h6; + kernel.packet[7] = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + +// BFloat16 implementation. + +EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + __m256i extend = _mm256_cvtepu16_epi32(a); + return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16)); +#else + __m128i lo = _mm_cvtepu16_epi32(a); + __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8)); + __m128i lo_shift = _mm_slli_epi32(lo, 16); + __m128i hi_shift = _mm_slli_epi32(hi, 16); + return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1)); +#endif +} + +// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm. +EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { + Packet8bf r; + + __m256i input = _mm256_castps_si256(a); + +#ifdef EIGEN_VECTORIZE_AVX2 + // uint32_t lsb = (input >> 16); + __m256i t = _mm256_srli_epi32(input, 16); + // uint32_t lsb = lsb & 1; + t = _mm256_and_si256(t, _mm256_set1_epi32(1)); + // uint32_t rounding_bias = 0x7fff + lsb; + t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff)); + // input += rounding_bias; + t = _mm256_add_epi32(t, input); + // input = input >> 16; + t = _mm256_srli_epi32(t, 16); + // Check NaN before converting back to bf16 + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); + __m256i nan = _mm256_set1_epi32(0x7fc0); + t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask)); + // output = numext::bit_cast(input); + return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), + _mm256_extractf128_si256(t, 1)); +#else + // uint32_t lsb = (input >> 16); + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16); + // uint32_t lsb = lsb & 1; + lo = _mm_and_si128(lo, _mm_set1_epi32(1)); + hi = _mm_and_si128(hi, _mm_set1_epi32(1)); + // uint32_t rounding_bias = 0x7fff + lsb; + lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff)); + hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff)); + // input += rounding_bias; + lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0)); + hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1)); + // input = input >> 16; + lo = _mm_srli_epi32(lo, 16); + hi = _mm_srli_epi32(hi, 16); + // Check NaN before converting back to bf16 + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); + __m128i nan = _mm_set1_epi32(0x7fc0); + lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask))); + hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1))); + // output = numext::bit_cast(input); + return _mm_packus_epi32(lo, hi); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8bf pset1(const bfloat16& from) { + return _mm_set1_epi16(numext::bit_cast(from)); +} + +template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& from) { + return numext::bit_cast(static_cast(_mm_extract_epi16(from, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pload(const bfloat16* from) { + return _mm_load_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ploadu(const bfloat16* from) { + return _mm_loadu_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet8bf& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet8bf& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet8bf +ploaddup(const bfloat16* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + const numext::uint16_t c = numext::bit_cast(from[2]); + const numext::uint16_t d = numext::bit_cast(from[3]); + return _mm_set_epi16(d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf +ploadquad(const bfloat16* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + return _mm_set_epi16(b, b, b, b, a, a, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) { + return _mm_cmpeq_epi32(a, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) { + const __m128i sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_andnot_si128(sign_mask, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pmin(const Packet8bf& a, + const Packet8bf& b) { + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pmax(const Packet8bf& a, + const Packet8bf& b) { + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf plset(const bfloat16& a) { + return F32ToBf16(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) { + return _mm_or_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) { + return _mm_xor_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) { + return _mm_and_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) { + return _mm_andnot_si128(b,a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) { + return _mm_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pround(const Packet8bf& a) +{ + return F32ToBf16(pround(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf print(const Packet8bf& a) { + return F32ToBf16(print(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pceil(const Packet8bf& a) { + return F32ToBf16(pceil(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pfloor(const Packet8bf& a) { + return F32ToBf16(pfloor(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) { + Packet8bf sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_xor_si128(a, sign_mask); +} + +template<> EIGEN_STRONG_INLINE Packet8bf padd(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(psub(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmul(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pdiv(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(pdiv(Bf16ToF32(a), Bf16ToF32(b))); +} + + +template<> EIGEN_STRONG_INLINE Packet8bf pgather(const bfloat16* from, Index stride) +{ + const numext::uint16_t s0 = numext::bit_cast(from[0*stride]); + const numext::uint16_t s1 = numext::bit_cast(from[1*stride]); + const numext::uint16_t s2 = numext::bit_cast(from[2*stride]); + const numext::uint16_t s3 = numext::bit_cast(from[3*stride]); + const numext::uint16_t s4 = numext::bit_cast(from[4*stride]); + const numext::uint16_t s5 = numext::bit_cast(from[5*stride]); + const numext::uint16_t s6 = numext::bit_cast(from[6*stride]); + const numext::uint16_t s7 = numext::bit_cast(from[7*stride]); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); +} + +template<> EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet8bf& from, Index stride) +{ + EIGEN_ALIGN32 bfloat16 aux[8]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) { + return static_cast(predux(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) { + return static_cast(predux_max(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) { + return static_cast(predux_min(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) { + return static_cast(predux_mul(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + return _mm_shuffle_epi8(a,m); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + __m128i e = kernel.packet[4]; + __m128i f = kernel.packet[5]; + __m128i g = kernel.packet[6]; + __m128i h = kernel.packet[7]; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + + __m128i ab_03 = _mm_unpacklo_epi16(a, b); + __m128i cd_03 = _mm_unpacklo_epi16(c, d); + __m128i ab_47 = _mm_unpackhi_epi16(a, b); + __m128i cd_47 = _mm_unpackhi_epi16(c, d); + + kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03); + kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03); + kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47); + kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47); } } // end namespace internal diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/TypeCasting.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/TypeCasting.h index 83bfdc604b..d507fb67b2 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -35,15 +35,79 @@ struct type_casting_traits { }; +#ifndef EIGEN_VECTORIZE_AVX512 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +#endif // EIGEN_VECTORIZE_AVX512 template<> EIGEN_STRONG_INLINE Packet8i pcast(const Packet8f& a) { - return _mm256_cvtps_epi32(a); + return _mm256_cvttps_epi32(a); } template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8i& a) { return _mm256_cvtepi32_ps(a); } +template<> EIGEN_STRONG_INLINE Packet8i preinterpret(const Packet8f& a) { + return _mm256_castps_si256(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Packet8i& a) { + return _mm256_castsi256_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { + return half2float(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8bf& a) { + return Bf16ToF32(a); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { + return float2half(a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet8f& a) { + return F32ToBf16(a); +} + } // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/BFloat16.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/BFloat16.h new file mode 100644 index 0000000000..1c28f4f95e --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/BFloat16.h @@ -0,0 +1,700 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef EIGEN_BFLOAT16_H +#define EIGEN_BFLOAT16_H + +#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD) \ + template <> \ + EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED \ + PACKET_BF16 METHOD(const PACKET_BF16& _x) { \ + return F32ToBf16(METHOD(Bf16ToF32(_x))); \ + } + +namespace Eigen { + +struct bfloat16; + +namespace bfloat16_impl { + +// Make our own __bfloat16_raw definition. +struct __bfloat16_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {} + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {} + unsigned short value; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value); +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +// Forward declarations of template specializations, to avoid Visual C++ 2019 errors, saying: +// > error C2908: explicit specialization; 'float_to_bfloat16_rtne' has already been instantiated +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h); + +struct bfloat16_base : public __bfloat16_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base() {} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {} +}; + +} // namespace bfloat16_impl + +// Class definition. +struct bfloat16 : public bfloat16_impl::bfloat16_base { + + typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {} + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const __bfloat16_raw& h) : bfloat16_impl::bfloat16_base(h) {} + + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b) + : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {} + + template + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne::value>(static_cast(val))) {} + + explicit EIGEN_DEVICE_FUNC bfloat16(float f) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne(f)) {} + + // Following the convention of numpy, converting between complex and + // float will lead to loss of imag value. + template + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex& val) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne(static_cast(val.real()))) {} + + EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless. + return bfloat16_impl::bfloat16_to_float(*this); + } +}; +} // namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = std::denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = numeric_limits::round_style; + static const bool is_iec559 = false; + static const bool is_bounded = true; + static const bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 4; + static const int radix = 2; + static const int min_exponent = numeric_limits::min_exponent; + static const int min_exponent10 = numeric_limits::min_exponent10; + static const int max_exponent = numeric_limits::max_exponent; + static const int max_exponent10 = numeric_limits::max_exponent10; + static const bool traps = numeric_limits::traps; + static const bool tinyness_before = numeric_limits::tinyness_before; + + static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); } + static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); } + static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); } + static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); } + static Eigen::bfloat16 round_error() { return Eigen::bfloat16(0x3f00); } + static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); } + static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); } + static Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); } + static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // namespace std + +namespace Eigen { + +namespace bfloat16_impl { + +// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, +// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation +// of the functions, while the latter can only deal with one of them. +#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +// We need to provide emulated *host-side* BF16 operators for clang. +#pragma push_macro("EIGEN_DEVICE_FUNC") +#undef EIGEN_DEVICE_FUNC +#if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16) +#define EIGEN_DEVICE_FUNC __host__ +#else // both host and device need emulated ops. +#define EIGEN_DEVICE_FUNC __host__ __device__ +#endif +#endif + +// Definitions for CPUs, mostly working through conversion +// to/from fp32. + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) { + return bfloat16(float(a) + static_cast(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) { + return bfloat16(static_cast(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) { + bfloat16 result; + result.value = a.value ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) { + a += bfloat16(1); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) { + a -= bfloat16(1); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) { + bfloat16 original_value = a; + ++a; + return original_value; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) { + bfloat16 original_value = a; + --a; + return original_value; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) { + return numext::equal_strict(float(a),float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) { + return numext::not_equal_strict(float(a), float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) { + return float(a) >= float(b); +} + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +#pragma pop_macro("EIGEN_DEVICE_FUNC") +#endif +#endif // Emulate support for bfloat16 floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to bfloat16. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) { + __bfloat16_raw output; + if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) { + output.value = std::signbit(v) ? 0xFFC0: 0x7FC0; + return output; + } + const uint16_t* p = reinterpret_cast(&v); +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + output.value = p[0]; +#else + output.value = p[1]; +#endif + return output; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) { + return __bfloat16_raw(value); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) { + return bf.value; +} + +// float_to_bfloat16_rtne template specialization that does not make any +// assumption about the value of its function argument (ff). +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) + // Nothing to do here +#else + __bfloat16_raw output; + + if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff)) { + // If the value is a NaN, squash it to a qNaN with msb of fraction set, + // this makes sure after truncation we don't end up with an inf. + // + // qNaN magic: All exponent bits set + most significant bit of fraction + // set. + output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0; + } else { + // Fast rounding algorithm that rounds a half value to nearest even. This + // reduces expected error when we convert a large number of floats. Here + // is how it works: + // + // Definitions: + // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits + // with the following tags: + // + // Sign | Exp (8 bits) | Frac (23 bits) + // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT + // + // S: Sign bit. + // E: Exponent bits. + // F: First 6 bits of fraction. + // L: Least significant bit of resulting bfloat16 if we truncate away the + // rest of the float32. This is also the 7th bit of fraction + // R: Rounding bit, 8th bit of fraction. + // T: Sticky bits, rest of fraction, 15 bits. + // + // To round half to nearest even, there are 3 cases where we want to round + // down (simply truncate the result of the bits away, which consists of + // rounding bit and sticky bits) and two cases where we want to round up + // (truncate then add one to the result). + // + // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of + // 1s) as the rounding bias, adds the rounding bias to the input, then + // truncates the last 16 bits away. + // + // To understand how it works, we can analyze this algorithm case by case: + // + // 1. L = 0, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input may create any carry, depending on + // whether there is any value set to 1 in T bits. + // - R may be set to 1 if there is a carry. + // - L remains 0. + // - Note that this case also handles Inf and -Inf, where all fraction + // bits, including L, R and Ts are all 0. The output remains Inf after + // this algorithm. + // + // 2. L = 1, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits but + // adds 1 to rounding bit. + // - L remains 1. + // + // 3. L = 0, R = 1, all of T are 0: + // Expect: round down, this is exactly at half, the result is already + // even (L=0). + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input sets all sticky bits to 1, but + // doesn't create a carry. + // - R remains 1. + // - L remains 0. + // + // 4. L = 1, R = 1: + // Expect: round up, this is exactly at half, the result needs to be + // round to the next even number. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits, but + // creates a carry from rounding bit. + // - The carry sets L to 0, creates another carry bit and propagate + // forward to F bits. + // - If all the F bits are 1, a carry then propagates to the exponent + // bits, which then creates the minimum value with the next exponent + // value. Note that we won't have the case where exponents are all 1, + // since that's either a NaN (handled in the other if condition) or inf + // (handled in case 1). + // + // 5. L = 0, R = 1, any of T is 1: + // Expect: round up, this is greater than half. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input creates a carry from sticky bits, + // sets rounding bit to 0, then create another carry. + // - The second carry sets L to 1. + // + // Examples: + // + // Exact half value that is already even: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 + // + // This falls into case 3. We truncate the rest of 16 bits and no + // carry is created into F and L: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // Exact half value, round to next even number: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 + // + // This falls into case 4. We create a carry from R and T, + // which then propagates into L and F: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // + // Max denormal value round to min normal value: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 + // + // Max normal value round to Inf: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 + + // At this point, ff must be either a normal float, or +/-infinity. + output = float_to_bfloat16_rtne(ff); + } + return output; +#endif +} + +// float_to_bfloat16_rtne template specialization that assumes that its function +// argument (ff) is either a normal floating point number, or +/-infinity, or +// zero. Used to improve the runtime performance of conversion from an integer +// type to bfloat16. +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) + // Nothing to do here +#else + numext::uint32_t input = numext::bit_cast(ff); + __bfloat16_raw output; + + // Least significant bit of resulting bfloat. + numext::uint32_t lsb = (input >> 16) & 1; + numext::uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + output.value = static_cast(input >> 16); + return output; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) { + float result = 0; + unsigned short* q = reinterpret_cast(&result); +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = h.value; +#else + q[1] = h.value; +#endif + return result; +} +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) { + EIGEN_USING_STD(isinf); + return (isinf)(float(a)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) { + EIGEN_USING_STD(isnan); + return (isnan)(float(a)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) { + bfloat16 result; + result.value = a.value & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) { + return bfloat16(::expf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) { + return bfloat16(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) { + return bfloat16(::logf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) { + return bfloat16(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) { + return bfloat16(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) { + return bfloat16(static_cast(EIGEN_LOG2E) * ::logf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) { + return bfloat16(::sqrtf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) { + return bfloat16(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) { + return bfloat16(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) { + return bfloat16(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) { + return bfloat16(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) { + return bfloat16(::asinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) { + return bfloat16(::acosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) { + return bfloat16(::atanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) { + return bfloat16(::sinhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) { + return bfloat16(::coshf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) { + return bfloat16(::tanhf(float(a))); +} +#if EIGEN_HAS_CXX11_MATH +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) { + return bfloat16(::asinhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) { + return bfloat16(::acoshf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) { + return bfloat16(::atanhf(float(a))); +} +#endif +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) { + return bfloat16(::floorf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) { + return bfloat16(::ceilf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) { + return bfloat16(::rintf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) { + return bfloat16(::roundf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) { + return bfloat16(::fmodf(float(a), float(b))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f2 < f1 ? b : a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f1 < f2 ? b : a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return bfloat16(::fminf(f1, f2)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return bfloat16(::fmaxf(f1, f2)); +} + +#ifndef EIGEN_NO_IO +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) { + os << static_cast(v); + return os; +} +#endif + +} // namespace bfloat16_impl + +namespace internal { + +template<> +struct random_default_impl +{ + static inline bfloat16 run(const bfloat16& x, const bfloat16& y) + { + return x + (y-x) * bfloat16(float(std::rand()) / float(RAND_MAX)); + } + static inline bfloat16 run() + { + return run(bfloat16(-1.f), bfloat16(1.f)); + } +}; + +template<> struct is_arithmetic { enum { value = true }; }; + +} // namespace internal + +template<> struct NumTraits + : GenericNumTraits +{ + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D); // bfloat16(5e-2f); + + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 lowest() { + return bfloat16_impl::raw_uint16_to_bfloat16(0xFF7F); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 infinity() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 quiet_NaN() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); + } +}; + +} // namespace Eigen + +namespace Eigen { +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isnan)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isnan)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isinf)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isinf)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isfinite)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isfinite)(h); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast(const uint16_t& src) { + return Eigen::bfloat16(Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src)); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast(const Eigen::bfloat16& src) { + return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src); +} + +} // namespace numext +} // namespace Eigen + +#if EIGEN_HAS_STD_HASH +namespace std { +template <> +struct hash { + EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const { + return static_cast(Eigen::numext::bit_cast(a)); + } +}; +} // namespace std +#endif + + +#endif // EIGEN_BFLOAT16_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/ConjHelper.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/ConjHelper.h index 4cfe34e052..53830b5a27 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/ConjHelper.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/ConjHelper.h @@ -11,19 +11,107 @@ #ifndef EIGEN_ARCH_CONJ_HELPER_H #define EIGEN_ARCH_CONJ_HELPER_H -#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ - }; \ - \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, \ + const PACKET_CPLX& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, \ + const PACKET_CPLX& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); \ + } \ + }; \ + \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, \ + const PACKET_REAL& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, \ + const PACKET_REAL& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); \ + } \ }; -#endif // EIGEN_ARCH_CONJ_HELPER_H +namespace Eigen { +namespace internal { + +template struct conj_if; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); } +}; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; } +}; + +// Generic Implementation, assume scalars since the packet-version is +// specialized below. +template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsType& x, const RhsType& y) const + { return conj_if()(x) * conj_if()(y); } +}; + +template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsScalar& x, const RhsScalar& y) const + { return numext::conj(x * y); } +}; + +// Implementation with equal type, use packet operations. +template +struct conj_helper +{ + typedef Packet ResultType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } +}; + +template +struct conj_helper +{ + typedef Packet ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } + // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return pconj(Eigen::internal::pmul(x, y)); } +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h new file mode 100644 index 0000000000..c9fbaf68b4 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -0,0 +1,1649 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// Copyright (C) 2009-2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The exp and log functions of this file initially come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Creates a Scalar integer type with same bit-width. +template struct make_integer; +template<> struct make_integer { typedef numext::int32_t type; }; +template<> struct make_integer { typedef numext::int64_t type; }; +template<> struct make_integer { typedef numext::int16_t type; }; +template<> struct make_integer { typedef numext::int16_t type; }; + +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic_get_biased_exponent(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::integer_packet PacketI; + enum { mantissa_bits = numext::numeric_limits::digits - 1}; + return pcast(plogical_shift_right(preinterpret(pabs(a)))); +} + +// Safely applies frexp, correctly handles denormals. +// Assumes IEEE floating point format. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic(const Packet& a, Packet& exponent) { + typedef typename unpacket_traits::type Scalar; + typedef typename make_unsigned::type>::type ScalarUI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = + ~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << int(MantissaBits)); // ~0x7f800000 + const Packet sign_mantissa_mask = pset1frombits(static_cast(scalar_sign_mantissa_mask)); + const Packet half = pset1(Scalar(0.5)); + const Packet zero = pzero(a); + const Packet normal_min = pset1((numext::numeric_limits::min)()); // Minimum normal value, 2^-126 + + // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1). + const Packet is_denormal = pcmp_lt(pabs(a), normal_min); + EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(int(MantissaBits) + 1); // 24 + // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr. + const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24 + const Packet normalization_factor = pset1(scalar_normalization_factor); + const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a); + + // Determine exponent offset: -126 if normal, -126-24 if denormal + const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(int(ExponentBits)-1)) - ScalarUI(2)); // -126 + Packet exponent_offset = pset1(scalar_exponent_offset); + const Packet normalization_offset = pset1(-Scalar(scalar_normalization_offset)); // -24 + exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset); + + // Determine exponent and mantissa from normalized_a. + exponent = pfrexp_generic_get_biased_exponent(normalized_a); + // Zero, Inf and NaN return 'a' unmodified, exponent is zero + // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero) + const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)); // 255 + const Packet non_finite_exponent = pset1(scalar_non_finite_exponent); + const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent)); + const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half)); + exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset)); + return m; +} + +// Safely applies ldexp, correctly handles overflows, underflows and denormals. +// Assumes IEEE floating point format. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pldexp_generic(const Packet& a, const Packet& exponent) { + // We want to return a * 2^exponent, allowing for all possible integer + // exponents without overflowing or underflowing in intermediate + // computations. + // + // Since 'a' and the output can be denormal, the maximum range of 'exponent' + // to consider for a float is: + // -255-23 -> 255+23 + // Below -278 any finite float 'a' will become zero, and above +278 any + // finite float will become inf, including when 'a' is the smallest possible + // denormal. + // + // Unfortunately, 2^(278) cannot be represented using either one or two + // finite normal floats, so we must split the scale factor into at least + // three parts. It turns out to be faster to split 'exponent' into four + // factors, since [exponent>>2] is much faster to compute that [exponent/3]. + // + // Set e = min(max(exponent, -278), 278); + // b = floor(e/4); + // out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b)) + // + // This will avoid any intermediate overflows and correctly handle 0, inf, + // NaN cases. + typedef typename unpacket_traits::integer_packet PacketI; + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + const Packet max_exponent = pset1(Scalar((ScalarI(1)<((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)); // 127 + const PacketI e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + PacketI b = parithmetic_shift_right<2>(e); // floor(e/4); + Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b + Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) + out = pmul(out, c); + return out; +} + +// Explicitly multiplies +// a * (2^e) +// clamping e to the range +// [NumTraits::min_exponent()-2, NumTraits::max_exponent()] +// +// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow +// if 2^e doesn't fit into a normal floating-point Scalar. +// +// Assumes IEEE floating point format +template +struct pldexp_fast_impl { + typedef typename unpacket_traits::integer_packet PacketI; + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + Packet run(const Packet& a, const Packet& exponent) { + const Packet bias = pset1(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1))); // 127 + const Packet limit = pset1(Scalar((ScalarI(1)<(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 + // return a * (2^e) + return pmul(a, preinterpret(plogical_shift_left(e))); + } +}; + +// Natural or base 2 logarithm. +// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) +// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can +// be easily approximated by a polynomial centered on m=1 for stability. +// TODO(gonnet): Further reduce the interval allowing for lower-degree +// polynomial interpolants -> ... -> profit! +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_impl_float(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0f); + const Packet cst_neg_half = pset1(-0.5f); + // The smallest non denormalized float number. + const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); + const Packet cst_minus_inf = pset1frombits( 0xff800000u); + const Packet cst_pos_inf = pset1frombits( 0x7f800000u); + + // Polynomial coefficients. + const Packet cst_cephes_SQRTHF = pset1(0.707106781186547524f); + const Packet cst_cephes_log_p0 = pset1(7.0376836292E-2f); + const Packet cst_cephes_log_p1 = pset1(-1.1514610310E-1f); + const Packet cst_cephes_log_p2 = pset1(1.1676998740E-1f); + const Packet cst_cephes_log_p3 = pset1(-1.2420140846E-1f); + const Packet cst_cephes_log_p4 = pset1(+1.4249322787E-1f); + const Packet cst_cephes_log_p5 = pset1(-1.6668057665E-1f); + const Packet cst_cephes_log_p6 = pset1(+2.0000714765E-1f); + const Packet cst_cephes_log_p7 = pset1(-2.4999993993E-1f); + const Packet cst_cephes_log_p8 = pset1(+3.3333331174E-1f); + + // Truncate input values to the minimum positive normal. + x = pmax(x, cst_min_norm_pos); + + Packet e; + // extract significant in the range [0.5,1) and exponent + x = pfrexp(x,e); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); + Packet tmp = pand(x, mask); + x = psub(x, cst_1); + e = psub(e, pand(cst_1, mask)); + x = padd(x, tmp); + + Packet x2 = pmul(x, x); + Packet x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet y, y1, y2; + y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); + y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); + y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7); + y = pmadd(y, x, cst_cephes_log_p2); + y1 = pmadd(y1, x, cst_cephes_log_p5); + y2 = pmadd(y2, x, cst_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y = pmadd(cst_neg_half, x2, y); + x = padd(x, y); + + // Add the logarithm of the exponent back to the result of the interpolation. + if (base2) { + const Packet cst_log2e = pset1(static_cast(EIGEN_LOG2E)); + x = pmadd(x, cst_log2e, e); + } else { + const Packet cst_ln2 = pset1(static_cast(EIGEN_LN2)); + x = pmadd(e, cst_ln2, x); + } + + Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); + Packet iszero_mask = pcmp_eq(_x,pzero(_x)); + Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN + // - 0 will be -INF + // - +INF will be +INF + return pselect(iszero_mask, cst_minus_inf, + por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_float(const Packet _x) +{ + return plog_impl_float(_x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_float(const Packet _x) +{ + return plog_impl_float(_x); +} + +/* Returns the base e (2.718...) or base 2 logarithm of x. + * The argument is separated into its exponent and fractional parts. + * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)], + * is approximated by + * + * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x). + * + * for more detail see: http://www.netlib.org/cephes/ + */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_impl_double(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0); + const Packet cst_neg_half = pset1(-0.5); + // The smallest non denormalized double. + const Packet cst_min_norm_pos = pset1frombits( static_cast(0x0010000000000000ull)); + const Packet cst_minus_inf = pset1frombits( static_cast(0xfff0000000000000ull)); + const Packet cst_pos_inf = pset1frombits( static_cast(0x7ff0000000000000ull)); + + + // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x) + // 1/sqrt(2) <= x < sqrt(2) + const Packet cst_cephes_SQRTHF = pset1(0.70710678118654752440E0); + const Packet cst_cephes_log_p0 = pset1(1.01875663804580931796E-4); + const Packet cst_cephes_log_p1 = pset1(4.97494994976747001425E-1); + const Packet cst_cephes_log_p2 = pset1(4.70579119878881725854E0); + const Packet cst_cephes_log_p3 = pset1(1.44989225341610930846E1); + const Packet cst_cephes_log_p4 = pset1(1.79368678507819816313E1); + const Packet cst_cephes_log_p5 = pset1(7.70838733755885391666E0); + + const Packet cst_cephes_log_q0 = pset1(1.0); + const Packet cst_cephes_log_q1 = pset1(1.12873587189167450590E1); + const Packet cst_cephes_log_q2 = pset1(4.52279145837532221105E1); + const Packet cst_cephes_log_q3 = pset1(8.29875266912776603211E1); + const Packet cst_cephes_log_q4 = pset1(7.11544750618563894466E1); + const Packet cst_cephes_log_q5 = pset1(2.31251620126765340583E1); + + // Truncate input values to the minimum positive normal. + x = pmax(x, cst_min_norm_pos); + + Packet e; + // extract significant in the range [0.5,1) and exponent + x = pfrexp(x,e); + + // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); + Packet tmp = pand(x, mask); + x = psub(x, cst_1); + e = psub(e, pand(cst_1, mask)); + x = padd(x, tmp); + + Packet x2 = pmul(x, x); + Packet x3 = pmul(x2, x); + + // Evaluate the polynomial approximant , probably to improve instruction-level parallelism. + // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) ); + Packet y, y1, y_; + y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); + y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); + y = pmadd(y, x, cst_cephes_log_p2); + y1 = pmadd(y1, x, cst_cephes_log_p5); + y_ = pmadd(y, x3, y1); + + y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1); + y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4); + y = pmadd(y, x, cst_cephes_log_q2); + y1 = pmadd(y1, x, cst_cephes_log_q5); + y = pmadd(y, x3, y1); + + y_ = pmul(y_, x3); + y = pdiv(y_, y); + + y = pmadd(cst_neg_half, x2, y); + x = padd(x, y); + + // Add the logarithm of the exponent back to the result of the interpolation. + if (base2) { + const Packet cst_log2e = pset1(static_cast(EIGEN_LOG2E)); + x = pmadd(x, cst_log2e, e); + } else { + const Packet cst_ln2 = pset1(static_cast(EIGEN_LN2)); + x = pmadd(e, cst_ln2, x); + } + + Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); + Packet iszero_mask = pcmp_eq(_x,pzero(_x)); + Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN + // - 0 will be -INF + // - +INF will be +INF + return pselect(iszero_mask, cst_minus_inf, + por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_double(const Packet _x) +{ + return plog_impl_double(_x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_double(const Packet _x) +{ + return plog_impl_double(_x); +} + +/** \internal \returns log(1 + x) computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_plog1p(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + Packet xp1 = padd(x, one); + Packet small_mask = pcmp_eq(xp1, one); + Packet log1 = plog(xp1); + Packet inf_mask = pcmp_eq(xp1, log1); + Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); + return pselect(por(small_mask, inf_mask), x, log_large); +} + +/** \internal \returns exp(x)-1 computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_expm1(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + const Packet neg_one = pset1(ScalarType(-1)); + Packet u = pexp(x); + Packet one_mask = pcmp_eq(u, one); + Packet u_minus_one = psub(u, one); + Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one); + Packet logu = plog(u); + // The following comparison is to catch the case where + // exp(x) = +inf. It is written in this way to avoid having + // to form the constant +inf, which depends on the packet + // type. + Packet pos_inf_mask = pcmp_eq(logu, u); + Packet expm1 = pmul(u_minus_one, pdiv(x, logu)); + expm1 = pselect(pos_inf_mask, u, expm1); + return pselect(one_mask, + x, + pselect(neg_one_mask, + neg_one, + expm1)); +} + + +// Exponential function. Works by writing "x = m*log(2) + r" where +// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then +// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_float(const Packet _x) +{ + const Packet cst_1 = pset1(1.0f); + const Packet cst_half = pset1(0.5f); + const Packet cst_exp_hi = pset1( 88.723f); + const Packet cst_exp_lo = pset1(-88.723f); + + const Packet cst_cephes_LOG2EF = pset1(1.44269504088896341f); + const Packet cst_cephes_exp_p0 = pset1(1.9875691500E-4f); + const Packet cst_cephes_exp_p1 = pset1(1.3981999507E-3f); + const Packet cst_cephes_exp_p2 = pset1(8.3334519073E-3f); + const Packet cst_cephes_exp_p3 = pset1(4.1665795894E-2f); + const Packet cst_cephes_exp_p4 = pset1(1.6666665459E-1f); + const Packet cst_cephes_exp_p5 = pset1(5.0000001201E-1f); + + // Clamp x. + Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo); + + // Express exp(x) as exp(m*ln(2) + r), start by extracting + // m = floor(x/ln(2) + 0.5). + Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half)); + + // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is + // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating + // truncation errors. + const Packet cst_cephes_exp_C1 = pset1(-0.693359375f); + const Packet cst_cephes_exp_C2 = pset1(2.12194440e-4f); + Packet r = pmadd(m, cst_cephes_exp_C1, x); + r = pmadd(m, cst_cephes_exp_C2, r); + + Packet r2 = pmul(r, r); + Packet r3 = pmul(r2, r); + + // Evaluate the polynomial approximant,improved by instruction-level parallelism. + Packet y, y1, y2; + y = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1); + y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4); + y2 = padd(r, cst_1); + y = pmadd(y, r, cst_cephes_exp_p2); + y1 = pmadd(y1, r, cst_cephes_exp_p5); + y = pmadd(y, r3, y1); + y = pmadd(y, r2, y2); + + // Return 2^m * exp(r). + // TODO: replace pldexp with faster implementation since y in [-1, 1). + return pmax(pldexp(y,m), _x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_double(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0); + const Packet cst_2 = pset1(2.0); + const Packet cst_half = pset1(0.5); + + const Packet cst_exp_hi = pset1(709.784); + const Packet cst_exp_lo = pset1(-709.784); + + const Packet cst_cephes_LOG2EF = pset1(1.4426950408889634073599); + const Packet cst_cephes_exp_p0 = pset1(1.26177193074810590878e-4); + const Packet cst_cephes_exp_p1 = pset1(3.02994407707441961300e-2); + const Packet cst_cephes_exp_p2 = pset1(9.99999999999999999910e-1); + const Packet cst_cephes_exp_q0 = pset1(3.00198505138664455042e-6); + const Packet cst_cephes_exp_q1 = pset1(2.52448340349684104192e-3); + const Packet cst_cephes_exp_q2 = pset1(2.27265548208155028766e-1); + const Packet cst_cephes_exp_q3 = pset1(2.00000000000000000009e0); + const Packet cst_cephes_exp_C1 = pset1(0.693145751953125); + const Packet cst_cephes_exp_C2 = pset1(1.42860682030941723212e-6); + + Packet tmp, fx; + + // clamp x + x = pmax(pmin(x, cst_exp_hi), cst_exp_lo); + // Express exp(x) as exp(g + n*log(2)). + fx = pmadd(cst_cephes_LOG2EF, x, cst_half); + + // Get the integer modulus of log(2), i.e. the "n" described above. + fx = pfloor(fx); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. + tmp = pmul(fx, cst_cephes_exp_C1); + Packet z = pmul(fx, cst_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet px = cst_cephes_exp_p0; + px = pmadd(px, x2, cst_cephes_exp_p1); + px = pmadd(px, x2, cst_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. + Packet qx = cst_cephes_exp_q0; + qx = pmadd(qx, x2, cst_cephes_exp_q1); + qx = pmadd(qx, x2, cst_cephes_exp_q2); + qx = pmadd(qx, x2, cst_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = pdiv(px, psub(qx, px)); + x = pmadd(cst_2, x, cst_1); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + // TODO: replace pldexp with faster implementation since x in [-1, 1). + return pmax(pldexp(x,fx), _x); +} + +// The following code is inspired by the following stack-overflow answer: +// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 +// It has been largely optimized: +// - By-pass calls to frexp. +// - Aligned loads of required 96 bits of 2/pi. This is accomplished by +// (1) balancing the mantissa and exponent to the required bits of 2/pi are +// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. +// - Avoid a branch in rounding and extraction of the remaining fractional part. +// Overall, I measured a speed up higher than x2 on x86-64. +inline float trig_reduce_huge (float xf, int *quadrant) +{ + using Eigen::numext::int32_t; + using Eigen::numext::uint32_t; + using Eigen::numext::int64_t; + using Eigen::numext::uint64_t; + + const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62 + const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt + + // 192 bits of 2/pi for Payne-Hanek reduction + // Bits are introduced by packet of 8 to enable aligned reads. + static const uint32_t two_over_pi [] = + { + 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, + 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, + 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, + 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, + 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566, + 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, + 0x10e41000, 0xe4100000 + }; + + uint32_t xi = numext::bit_cast(xf); + // Below, -118 = -126 + 8. + // -126 is to get the exponent, + // +8 is to enable alignment of 2/pi's bits on 8 bits. + // This is possible because the fractional part of x as only 24 meaningful bits. + uint32_t e = (xi >> 23) - 118; + // Extract the mantissa and shift it to align it wrt the exponent + xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7); + + uint32_t i = e >> 3; + uint32_t twoopi_1 = two_over_pi[i-1]; + uint32_t twoopi_2 = two_over_pi[i+3]; + uint32_t twoopi_3 = two_over_pi[i+7]; + + // Compute x * 2/pi in 2.62-bit fixed-point format. + uint64_t p; + p = uint64_t(xi) * twoopi_3; + p = uint64_t(xi) * twoopi_2 + (p >> 32); + p = (uint64_t(xi * twoopi_1) << 32) + p; + + // Round to nearest: add 0.5 and extract integral part. + uint64_t q = (p + zero_dot_five) >> 62; + *quadrant = int(q); + // Now it remains to compute "r = x - q*pi/2" with high accuracy, + // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as: + // r = (p-q)*pi/2, + // where the product can be be carried out with sufficient accuracy using double precision. + p -= q<<62; + return float(double(int64_t(p)) * pio2_62); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT +__attribute__((optimize("-fno-unsafe-math-optimizations"))) +#endif +Packet psincos_float(const Packet& _x) +{ + typedef typename unpacket_traits::integer_packet PacketI; + + const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI + const Packet cst_rounding_magic = pset1(12582912); // 2^23 for rounding + const PacketI csti_1 = pset1(1); + const Packet cst_sign_mask = pset1frombits(0x80000000u); + + Packet x = pabs(_x); + + // Scale x by 2/Pi to find x's octant. + Packet y = pmul(x, cst_2oPI); + + // Rounding trick: + Packet y_round = padd(y, cst_rounding_magic); + EIGEN_OPTIMIZATION_BARRIER(y_round) + PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) + y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // using "Extended precision modular arithmetic" + #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) + // This version requires true FMA for high accuracy + // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): + const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; + x = pmadd(y, pset1(-1.57079601287841796875f), x); + x = pmadd(y, pset1(-3.1391647326017846353352069854736328125e-07f), x); + x = pmadd(y, pset1(-5.390302529957764765544681040410068817436695098876953125e-15f), x); + #else + // Without true FMA, the previous set of coefficients maintain 1ULP accuracy + // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7. + // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs. + + // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively. + // and 2 ULP up to: + const float huge_th = ComputeSine ? 25966.f : 18838.f; + x = pmadd(y, pset1(-1.5703125), x); // = 0xbfc90000 + EIGEN_OPTIMIZATION_BARRIER(x) + x = pmadd(y, pset1(-0.000483989715576171875), x); // = 0xb9fdc000 + EIGEN_OPTIMIZATION_BARRIER(x) + x = pmadd(y, pset1(1.62865035235881805419921875e-07), x); // = 0x342ee000 + x = pmadd(y, pset1(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee + + // For the record, the following set of coefficients maintain 2ULP up + // to a slightly larger range: + // const float huge_th = ComputeSine ? 51981.f : 39086.125f; + // but it slightly fails to maintain 1ULP for two values of sin below pi. + // x = pmadd(y, pset1(-3.140625/2.), x); + // x = pmadd(y, pset1(-0.00048351287841796875), x); + // x = pmadd(y, pset1(-3.13855707645416259765625e-07), x); + // x = pmadd(y, pset1(-6.0771006282767103812147979624569416046142578125e-11), x); + + // For the record, with only 3 iterations it is possible to maintain + // 1 ULP up to 3PI (maybe more) and 2ULP up to 255. + // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee + #endif + + if(predux_any(pcmp_le(pset1(huge_th),pabs(_x)))) + { + const int PacketSize = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize]; + pstoreu(vals, pabs(_x)); + pstoreu(x_cpy, x); + pstoreu(y_int2, y_int); + for(int k=0; k=huge_th && (numext::isfinite)(val)) + x_cpy[k] = trig_reduce_huge(val,&y_int2[k]); + } + x = ploadu(x_cpy); + y_int = ploadu(y_int2); + } + + // Compute the sign to apply to the polynomial. + // sin: sign = second_bit(y_int) xor signbit(_x) + // cos: sign = second_bit(y_int+1) + Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(plogical_shift_left<30>(y_int))) + : preinterpret(plogical_shift_left<30>(padd(y_int,csti_1))); + sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit + + // Get the polynomial selection mask from the second bit of y_int + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); + + Packet x2 = pmul(x,x); + + // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4) + Packet y1 = pset1(2.4372266125283204019069671630859375e-05f); + y1 = pmadd(y1, x2, pset1(-0.00138865201734006404876708984375f )); + y1 = pmadd(y1, x2, pset1(0.041666619479656219482421875f )); + y1 = pmadd(y1, x2, pset1(-0.5f)); + y1 = pmadd(y1, x2, pset1(1.f)); + + // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) + // octave/matlab code to compute those coefficients: + // x = (0:0.0001:pi/4)'; + // A = [x.^3 x.^5 x.^7]; + // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy + // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 + // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) + // + Packet y2 = pset1(-0.0001959234114083702898469196984621021329076029360294342041015625f); + y2 = pmadd(y2, x2, pset1( 0.0083326873655616851693794799871284340042620897293090820312500000f)); + y2 = pmadd(y2, x2, pset1(-0.1666666203982298255503735617821803316473960876464843750000000000f)); + y2 = pmul(y2, x2); + y2 = pmadd(y2, x, x); + + // Select the correct result from the two polynomials. + y = ComputeSine ? pselect(poly_mask,y2,y1) + : pselect(poly_mask,y1,y2); + + // Update the sign and filter huge inputs + return pxor(y, sign_bit); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psin_float(const Packet& x) +{ + return psincos_float(x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pcos_float(const Packet& x) +{ + return psincos_float(x); +} + + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psqrt_complex(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename unpacket_traits::as_real RealPacket; + + // Computes the principal sqrt of the complex numbers in the input. + // + // For example, for packets containing 2 complex numbers stored in interleaved format + // a = [a0, a1] = [x0, y0, x1, y1], + // where x0 = real(a0), y0 = imag(a0) etc., this function returns + // b = [b0, b1] = [u0, v0, u1, v1], + // such that b0^2 = a0, b1^2 = a1. + // + // To derive the formula for the complex square roots, let's consider the equation for + // a single complex square root of the number x + i*y. We want to find real numbers + // u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*v. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = 0.5 * (y / u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = 0.5 * (y / v) + // + // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as + // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) , + + // In the following, without lack of generality, we have annotated the code, assuming + // that the input is a packet of 2 complex numbers. + // + // Step 1. Compute l = [l0, l0, l1, l1], where + // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2) + // To avoid over- and underflow, we use the stable formula for each hypotenuse + // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), + // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. + + RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|] + RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|] + RealPacket a_max = pmax(a_abs, a_abs_flip); + RealPacket a_min = pmin(a_abs, a_abs_flip); + RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); + RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max)); + RealPacket r = pdiv(a_min, a_max); + const RealPacket cst_one = pset1(RealScalar(1)); + RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1] + // Set l to a_max if a_min is zero. + l = pselect(a_min_zero_mask, a_max, l); + + // Step 2. Compute [rho0, *, rho1, *], where + // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|)) + // We don't care about the imaginary parts computed here. They will be overwritten later. + const RealPacket cst_half = pset1(RealScalar(0.5)); + Packet rho; + rho.v = psqrt(pmul(cst_half, padd(a_abs, l))); + + // Step 3. Compute [rho0, eta0, rho1, eta1], where + // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2. + // set eta = 0 of input is 0 + i0. + RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask); + RealPacket real_mask = peven_mask(a.v); + Packet positive_real_result; + // Compute result for inputs with positive real part. + positive_real_result.v = pselect(real_mask, rho.v, eta); + + // Step 4. Compute solution for inputs with negative real part: + // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1] + const RealScalar neg_zero = RealScalar(numext::bit_cast(0x80000000u)); + const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), neg_zero)).v; + RealPacket imag_signs = pand(a.v, cst_imag_sign_mask); + Packet negative_real_result; + // Notice that rho is positive, so taking it's absolute value is a noop. + negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs); + + // Step 5. Select solution branch based on the sign of the real parts. + Packet negative_real_mask; + negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v)); + negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v); + Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result); + + // Step 6. Handle special cases for infinities: + // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN + // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN + // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y + // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + Packet is_inf; + is_inf.v = pcmp_eq(a_abs, cst_pos_inf); + Packet is_real_inf; + is_real_inf.v = pand(is_inf.v, real_mask); + is_real_inf = por(is_real_inf, pcplxflip(is_real_inf)); + // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part. + Packet real_inf_result; + real_inf_result.v = pmul(a_abs, pset1(Scalar(RealScalar(1.0), RealScalar(0.0))).v); + real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v); + // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part. + Packet is_imag_inf; + is_imag_inf.v = pandnot(is_inf.v, real_mask); + is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf)); + Packet imag_inf_result; + imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask)); + + return pselect(is_imag_inf, imag_inf_result, + pselect(is_real_inf, real_inf_result,result)); +} + +// TODO(rmlarsen): The following set of utilities for double word arithmetic +// should perhaps be refactored as a separate file, since it would be generally +// useful for special function implementation etc. Writing the algorithms in +// terms if a double word type would also make the code more readable. + +// This function splits x into the nearest integer n and fractional part r, +// such that x = n + r holds exactly. +template +EIGEN_STRONG_INLINE +void absolute_split(const Packet& x, Packet& n, Packet& r) { + n = pround(x); + r = psub(x, n); +} + +// This function computes the sum {s, r}, such that x + y = s_hi + s_lo +// holds exactly, and s_hi = fl(x+y), if |x| >= |y|. +template +EIGEN_STRONG_INLINE +void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) { + s_hi = padd(x, y); + const Packet t = psub(s_hi, x); + s_lo = psub(y, t); +} + +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +// This function implements the extended precision product of +// a pair of floating point numbers. Given {x, y}, it computes the pair +// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and +// p_hi = fl(x * y). +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x, const Packet& y, + Packet& p_hi, Packet& p_lo) { + p_hi = pmul(x, y); + p_lo = pmadd(x, y, pnegate(p_hi)); +} + +#else + +// This function implements the Veltkamp splitting. Given a floating point +// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds +// exactly and that half of the significant of x fits in x_hi. +// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. +template +EIGEN_STRONG_INLINE +void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) { + typedef typename unpacket_traits::type Scalar; + EIGEN_CONSTEXPR int shift = (NumTraits::digits() + 1) / 2; + const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr. + const Packet gamma = pmul(pset1(shift_scale + Scalar(1)), x); + Packet rho = psub(x, gamma); + x_hi = padd(rho, gamma); + x_lo = psub(x, x_hi); +} + +// This function implements Dekker's algorithm for products x * y. +// Given floating point numbers {x, y} computes the pair +// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and +// p_hi = fl(x * y). +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x, const Packet& y, + Packet& p_hi, Packet& p_lo) { + Packet x_hi, x_lo, y_hi, y_lo; + veltkamp_splitting(x, x_hi, x_lo); + veltkamp_splitting(y, y_hi, y_lo); + + p_hi = pmul(x, y); + p_lo = pmadd(x_hi, y_hi, pnegate(p_hi)); + p_lo = pmadd(x_hi, y_lo, p_lo); + p_lo = pmadd(x_lo, y_hi, p_lo); + p_lo = pmadd(x_lo, y_lo, p_lo); +} + +#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD + + +// This function implements Dekker's algorithm for the addition +// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. +// It returns the result as a pair {s_hi, s_lo} such that +// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly. +// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. +template +EIGEN_STRONG_INLINE + void twosum(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi)); + Packet r_hi_1, r_lo_1; + fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1); + Packet r_hi_2, r_lo_2; + fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2); + const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2); + + const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo); + const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo); + const Packet s = pselect(x_greater_mask, s1, s2); + + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This is a version of twosum for double word numbers, +// which assumes that |x_hi| >= |y_hi|. +template +EIGEN_STRONG_INLINE + void fast_twosum(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + Packet r_hi, r_lo; + fast_twosum(x_hi, y_hi, r_hi, r_lo); + const Packet s = padd(padd(y_lo, r_lo), x_lo); + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This is a version of twosum for adding a floating point number x to +// double word number {y_hi, y_lo} number, with the assumption +// that |x| >= |y_hi|. +template +EIGEN_STRONG_INLINE +void fast_twosum(const Packet& x, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + Packet r_hi, r_lo; + fast_twosum(x, y_hi, r_hi, r_lo); + const Packet s = padd(y_lo, r_lo); + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This function implements the multiplication of a double word +// number represented by {x_hi, x_lo} by a floating point number y. +// It returns the result as a pair {p_hi, p_lo} such that +// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error +// of less than 2*2^{-2p}, where p is the number of significand bit +// in the floating point type. +// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, + Packet& p_hi, Packet& p_lo) { + Packet c_hi, c_lo1; + twoprod(x_hi, y, c_hi, c_lo1); + const Packet c_lo2 = pmul(x_lo, y); + Packet t_hi, t_lo1; + fast_twosum(c_hi, c_lo2, t_hi, t_lo1); + const Packet t_lo2 = padd(t_lo1, c_lo1); + fast_twosum(t_hi, t_lo2, p_hi, p_lo); +} + +// This function implements the multiplication of two double word +// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. +// It returns the result as a pair {p_hi, p_lo} such that +// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error +// of less than 2*2^{-2p}, where p is the number of significand bit +// in the floating point type. +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& p_hi, Packet& p_lo) { + Packet p_hi_hi, p_hi_lo; + twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo); + Packet p_lo_hi, p_lo_lo; + twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo); + fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo); +} + +// This function computes the reciprocal of a floating point number +// with extra precision and returns the result as a double word. +template +void doubleword_reciprocal(const Packet& x, Packet& recip_hi, Packet& recip_lo) { + typedef typename unpacket_traits::type Scalar; + // 1. Approximate the reciprocal as the reciprocal of the high order element. + Packet approx_recip = prsqrt(x); + approx_recip = pmul(approx_recip, approx_recip); + + // 2. Run one step of Newton-Raphson iteration in double word arithmetic + // to get the bottom half. The NR iteration for reciprocal of 'a' is + // x_{i+1} = x_i * (2 - a * x_i) + + // -a*x_i + Packet t1_hi, t1_lo; + twoprod(pnegate(x), approx_recip, t1_hi, t1_lo); + // 2 - a*x_i + Packet t2_hi, t2_lo; + fast_twosum(pset1(Scalar(2)), t1_hi, t2_hi, t2_lo); + Packet t3_hi, t3_lo; + fast_twosum(t2_hi, padd(t2_lo, t1_lo), t3_hi, t3_lo); + // x_i * (2 - a * x_i) + twoprod(t3_hi, t3_lo, approx_recip, recip_hi, recip_lo); +} + + +// This function computes log2(x) and returns the result as a double word. +template +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) { + log2_x_hi = plog2(x); + log2_x_lo = pzero(x); + } +}; + +// This specialization uses a more accurate algorithm to compute log2(x) for +// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10. +// This additional accuracy is needed to counter the error-magnification +// inherent in multiplying by a potentially large exponent in pow(x,y). +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. +template <> +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) { + // The function log(1+x)/x is approximated in the interval + // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form + // Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))), + // where the degree 6 polynomial P(x) is evaluated in single precision, + // while the remaining 4 terms of Q(x), as well as the final multiplication by x + // to reconstruct log(1+x) are evaluated in extra precision using + // double word arithmetic. C0 through C3 are extra precise constants + // stored as double words. + // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 10; + // > f = log2(1+x)/x; + // > interval = [sqrt(0.5)-1;sqrt(2)-1]; + // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating); + + const Packet p6 = pset1( 9.703654795885e-2f); + const Packet p5 = pset1(-0.1690667718648f); + const Packet p4 = pset1( 0.1720575392246f); + const Packet p3 = pset1(-0.1789081543684f); + const Packet p2 = pset1( 0.2050433009862f); + const Packet p1 = pset1(-0.2404672354459f); + const Packet p0 = pset1( 0.2885761857032f); + + const Packet C3_hi = pset1(-0.360674142838f); + const Packet C3_lo = pset1(-6.13283912543e-09f); + const Packet C2_hi = pset1(0.480897903442f); + const Packet C2_lo = pset1(-1.44861207474e-08f); + const Packet C1_hi = pset1(-0.721347510815f); + const Packet C1_lo = pset1(-4.84483164698e-09f); + const Packet C0_hi = pset1(1.44269502163f); + const Packet C0_lo = pset1(2.01711713999e-08f); + const Packet one = pset1(1.0f); + + const Packet x = psub(z, one); + // Evaluate P(x) in working precision. + // We evaluate it in multiple parts to improve instruction level + // parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p6, x2, p4); + p_even = pmadd(p_even, x2, p2); + p_even = pmadd(p_even, x2, p0); + Packet p_odd = pmadd(p5, x2, p3); + p_odd = pmadd(p_odd, x2, p1); + Packet p = pmadd(p_odd, x, p_even); + + // Now evaluate the low-order tems of Q(x) in double word precision. + // In the following, due to the alternating signs and the fact that + // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use + // fast_twosum instead of the slower twosum. + Packet q_hi, q_lo; + Packet t_hi, t_lo; + // C3 + x * p(x) + twoprod(p, x, t_hi, t_lo); + fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo); + // C2 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo); + // C1 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo); + // C0 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo); + + // log(z) ~= x * Q(x) + twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo); + } +}; + +// This specialization uses a more accurate algorithm to compute log2(x) for +// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18. +// This additional accuracy is needed to counter the error-magnification +// inherent in multiplying by a potentially large exponent in pow(x,y). +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. + +template <> +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) { + // We use a transformation of variables: + // r = c * (x-1) / (x+1), + // such that + // log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r). + // The function f(r) can be approximated well using an odd polynomial + // of the form + // P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r, + // For the implementation of log2 here, Q is of degree 6 with + // coefficient represented in working precision (double), while C is a + // constant represented in extra precision as a double word to achieve + // full accuracy. + // + // The polynomial coefficients were computed by the Sollya script: + // + // c = 2 / log(2); + // trans = c * (x-1)/(x+1); + // itrans = (1+x/c)/(1-x/c); + // interval=[trans(sqrt(0.5)); trans(sqrt(2))]; + // print(interval); + // f = log2(itrans(x)); + // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating); + const Packet q12 = pset1(2.87074255468000586e-9); + const Packet q10 = pset1(2.38957980901884082e-8); + const Packet q8 = pset1(2.31032094540014656e-7); + const Packet q6 = pset1(2.27279857398537278e-6); + const Packet q4 = pset1(2.31271023278625638e-5); + const Packet q2 = pset1(2.47556738444535513e-4); + const Packet q0 = pset1(2.88543873228900172e-3); + const Packet C_hi = pset1(0.0400377511598501157); + const Packet C_lo = pset1(-4.77726582251425391e-19); + const Packet one = pset1(1.0); + + const Packet cst_2_log2e_hi = pset1(2.88539008177792677); + const Packet cst_2_log2e_lo = pset1(4.07660016854549667e-17); + // c * (x - 1) + Packet num_hi, num_lo; + twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), num_hi, num_lo); + // TODO(rmlarsen): Investigate if using the division algorithm by + // Muller et al. is faster/more accurate. + // 1 / (x + 1) + Packet denom_hi, denom_lo; + doubleword_reciprocal(padd(x, one), denom_hi, denom_lo); + // r = c * (x-1) / (x+1), + Packet r_hi, r_lo; + twoprod(num_hi, num_lo, denom_hi, denom_lo, r_hi, r_lo); + // r2 = r * r + Packet r2_hi, r2_lo; + twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo); + // r4 = r2 * r2 + Packet r4_hi, r4_lo; + twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo); + + // Evaluate Q(r^2) in working precision. We evaluate it in two parts + // (even and odd in r^2) to improve instruction level parallelism. + Packet q_even = pmadd(q12, r4_hi, q8); + Packet q_odd = pmadd(q10, r4_hi, q6); + q_even = pmadd(q_even, r4_hi, q4); + q_odd = pmadd(q_odd, r4_hi, q2); + q_even = pmadd(q_even, r4_hi, q0); + Packet q = pmadd(q_odd, r2_hi, q_even); + + // Now evaluate the low order terms of P(x) in double word precision. + // In the following, due to the increasing magnitude of the coefficients + // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead + // of the slower twosum. + // Q(r^2) * r^2 + Packet p_hi, p_lo; + twoprod(r2_hi, r2_lo, q, p_hi, p_lo); + // Q(r^2) * r^2 + C + Packet p1_hi, p1_lo; + fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo); + // (Q(r^2) * r^2 + C) * r^2 + Packet p2_hi, p2_lo; + twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo); + // ((Q(r^2) * r^2 + C) * r^2 + 1) + Packet p3_hi, p3_lo; + fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo); + + // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r + twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo); + } +}; + +// This function computes exp2(x) (i.e. 2**x). +template +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // TODO(rmlarsen): Add a pexp2 packetop. + return pexp(pmul(pset1(Scalar(EIGEN_LN2)), x)); + } +}; + +// This specialization uses a faster algorithm to compute exp2(x) for floats +// in [-0.5;0.5] with a relative accuracy of 1 ulp. +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. +template <> +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // This function approximates exp2(x) by a degree 6 polynomial of the form + // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in + // single precision, and the remaining steps are evaluated with extra precision using + // double word arithmetic. C is an extra precise constant stored as a double word. + // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 6; + // > f = 2^x; + // > interval = [-0.5;0.5]; + // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating); + + const Packet p4 = pset1(1.539513905e-4f); + const Packet p3 = pset1(1.340007293e-3f); + const Packet p2 = pset1(9.618283249e-3f); + const Packet p1 = pset1(5.550328270e-2f); + const Packet p0 = pset1(0.2402264923f); + + const Packet C_hi = pset1(0.6931471825f); + const Packet C_lo = pset1(2.36836577e-08f); + const Packet one = pset1(1.0f); + + // Evaluate P(x) in working precision. + // We evaluate even and odd parts of the polynomial separately + // to gain some instruction level parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p4, x2, p2); + Packet p_odd = pmadd(p3, x2, p1); + p_even = pmadd(p_even, x2, p0); + Packet p = pmadd(p_odd, x, p_even); + + // Evaluate the remaining terms of Q(x) with extra precision using + // double word arithmetic. + Packet p_hi, p_lo; + // x * p(x) + twoprod(p, x, p_hi, p_lo); + // C + x * p(x) + Packet q1_hi, q1_lo; + twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo); + // x * (C + x * p(x)) + Packet q2_hi, q2_lo; + twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo); + // 1 + x * (C + x * p(x)) + Packet q3_hi, q3_lo; + // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum + // for adding it to unity here. + fast_twosum(one, q2_hi, q3_hi, q3_lo); + return padd(q3_hi, padd(q2_lo, q3_lo)); + } +}; + +// in [-0.5;0.5] with a relative accuracy of 1 ulp. +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. +template <> +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // This function approximates exp2(x) by a degree 10 polynomial of the form + // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in + // single precision, and the remaining steps are evaluated with extra precision using + // double word arithmetic. C is an extra precise constant stored as a double word. + // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 11; + // > f = 2^x; + // > interval = [-0.5;0.5]; + // > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating); + + const Packet p9 = pset1(4.431642109085495276e-10); + const Packet p8 = pset1(7.073829923303358410e-9); + const Packet p7 = pset1(1.017822306737031311e-7); + const Packet p6 = pset1(1.321543498017646657e-6); + const Packet p5 = pset1(1.525273342728892877e-5); + const Packet p4 = pset1(1.540353045780084423e-4); + const Packet p3 = pset1(1.333355814685869807e-3); + const Packet p2 = pset1(9.618129107593478832e-3); + const Packet p1 = pset1(5.550410866481961247e-2); + const Packet p0 = pset1(0.240226506959101332); + const Packet C_hi = pset1(0.693147180559945286); + const Packet C_lo = pset1(4.81927865669806721e-17); + const Packet one = pset1(1.0); + + // Evaluate P(x) in working precision. + // We evaluate even and odd parts of the polynomial separately + // to gain some instruction level parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p8, x2, p6); + Packet p_odd = pmadd(p9, x2, p7); + p_even = pmadd(p_even, x2, p4); + p_odd = pmadd(p_odd, x2, p5); + p_even = pmadd(p_even, x2, p2); + p_odd = pmadd(p_odd, x2, p3); + p_even = pmadd(p_even, x2, p0); + p_odd = pmadd(p_odd, x2, p1); + Packet p = pmadd(p_odd, x, p_even); + + // Evaluate the remaining terms of Q(x) with extra precision using + // double word arithmetic. + Packet p_hi, p_lo; + // x * p(x) + twoprod(p, x, p_hi, p_lo); + // C + x * p(x) + Packet q1_hi, q1_lo; + twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo); + // x * (C + x * p(x)) + Packet q2_hi, q2_lo; + twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo); + // 1 + x * (C + x * p(x)) + Packet q3_hi, q3_lo; + // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum + // for adding it to unity here. + fast_twosum(one, q2_hi, q3_hi, q3_lo); + return padd(q3_hi, padd(q2_lo, q3_lo)); + } +}; + +// This function implements the non-trivial case of pow(x,y) where x is +// positive and y is (possibly) non-integer. +// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x. +// TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it +// easier to specialize or turn off for specific types and/or backends.x +template +EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::type Scalar; + // Split x into exponent e_x and mantissa m_x. + Packet e_x; + Packet m_x = pfrexp(x, e_x); + + // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x). + EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440); + const Packet m_x_scale_mask = pcmp_lt(m_x, pset1(sqrt_half)); + m_x = pselect(m_x_scale_mask, pmul(pset1(Scalar(2)), m_x), m_x); + e_x = pselect(m_x_scale_mask, psub(e_x, pset1(Scalar(1))), e_x); + + // Compute log2(m_x) with 6 extra bits of accuracy. + Packet rx_hi, rx_lo; + accurate_log2()(m_x, rx_hi, rx_lo); + + // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled + // precision using double word arithmetic. + Packet f1_hi, f1_lo, f2_hi, f2_lo; + twoprod(e_x, y, f1_hi, f1_lo); + twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo); + // Sum the two terms in f using double word arithmetic. We know + // that |e_x| > |log2(m_x)|, except for the case where e_x==0. + // This means that we can use fast_twosum(f1,f2). + // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any + // accuracy by violating the assumption of fast_twosum, because + // it's a no-op. + Packet f_hi, f_lo; + fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo); + + // Split f into integer and fractional parts. + Packet n_z, r_z; + absolute_split(f_hi, n_z, r_z); + r_z = padd(r_z, f_lo); + Packet n_r; + absolute_split(r_z, n_r, r_z); + n_z = padd(n_z, n_r); + + // We now have an accurate split of f = n_z + r_z and can compute + // x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}. + // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy + // using a specialized algorithm. Multiplication by the second factor can + // be done exactly using pldexp(), since it is an integer power of 2. + const Packet e_r = fast_accurate_exp2()(r_z); + return pldexp(e_r, n_z); +} + +// Generic implementation of pow(x,y). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet generic_pow(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::type Scalar; + + const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_zero = pset1(Scalar(0)); + const Packet cst_one = pset1(Scalar(1)); + const Packet cst_nan = pset1(NumTraits::quiet_NaN()); + + const Packet abs_x = pabs(x); + // Predicates for sign and magnitude of x. + const Packet x_is_zero = pcmp_eq(x, cst_zero); + const Packet x_is_neg = pcmp_lt(x, cst_zero); + const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); + const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); + const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); + + // Predicates for sign and magnitude of y. + const Packet y_is_one = pcmp_eq(y, cst_one); + const Packet y_is_zero = pcmp_eq(y, cst_zero); + const Packet y_is_neg = pcmp_lt(y, cst_zero); + const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); + const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + EIGEN_CONSTEXPR Scalar huge_exponent = + (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / + NumTraits::epsilon(); + const Packet abs_y_is_huge = pcmp_le(pset1(huge_exponent), pabs(y)); + + // Predicates for whether y is integer and/or even. + const Packet y_is_int = pcmp_eq(pfloor(y), y); + const Packet y_div_2 = pmul(y, pset1(Scalar(0.5))); + const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); + + // Predicates encoding special cases for the value of pow(x,y) + const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), + y_is_int), + abs_y_is_inf); + const Packet pow_is_one = por(por(x_is_one, y_is_zero), + pand(x_is_neg_one, + por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); + const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), + pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), + y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), + y_is_neg)); + const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), + pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), + y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), + y_is_pos)); + + // General computation of pow(x,y) for positive x or negative x and integer y. + const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); + const Packet pow_abs = generic_pow_impl(abs_x, y); + return pselect(y_is_one, x, + pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, cst_pos_inf, + pselect(pow_is_zero, cst_zero, + pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); +} + + + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template +struct ppolevl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits::type coeff[]) { + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + return pmadd(ppolevl::run(x, coeff), x, pset1(coeff[N])); + } +}; + +template +struct ppolevl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits::type coeff[]) { + EIGEN_UNUSED_VARIABLE(x); + return pset1(coeff[0]); + } +}; + +/* chbevl (modified for Eigen) + * + * Evaluate Chebyshev series + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N], chebevl(); + * + * y = chbevl( x, coef, N ); + * + * + * + * DESCRIPTION: + * + * Evaluates the series + * + * N-1 + * - ' + * y = > coef[i] T (x/2) + * - i + * i=0 + * + * of Chebyshev polynomials Ti at argument x/2. + * + * Coefficients are stored in reverse order, i.e. the zero + * order term is last in the array. Note N is the number of + * coefficients, not the order. + * + * If coefficients are for the interval a to b, x must + * have been transformed to x -> 2(2x - b - a)/(b-a) before + * entering the routine. This maps x from (a, b) to (-1, 1), + * over which the Chebyshev polynomials are defined. + * + * If the coefficients are for the inverted interval, in + * which (a, b) is mapped to (1/b, 1/a), the transformation + * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, + * this becomes x -> 4a/x - 1. + * + * + * + * SPEED: + * + * Taking advantage of the recurrence properties of the + * Chebyshev polynomials, the routine requires one more + * addition per loop than evaluating a nested polynomial of + * the same degree. + * + */ + +template +struct pchebevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits::type coef[]) { + typedef typename unpacket_traits::type Scalar; + Packet b0 = pset1(coef[0]); + Packet b1 = pset1(static_cast(0.f)); + Packet b2; + + for (int i = 1; i < N; i++) { + b2 = b1; + b1 = b0; + b0 = psub(pmadd(x, b1, pset1(coef[i])), b2); + } + + return pmul(pset1(static_cast(0.5f)), psub(b0, b2)); + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h new file mode 100644 index 0000000000..177a04e93e --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H + +namespace Eigen { +namespace internal { + +// Forward declarations of the generic math functions +// implemented in GenericPacketMathFunctions.h +// This is needed to workaround a circular dependency. + +/*************************************************************************** + * Some generic implementations to be used by implementors +***************************************************************************/ + +/** Default implementation of pfrexp. + * It is expected to be called by implementers of template<> pfrexp. + */ +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic(const Packet& a, Packet& exponent); + +// Extracts the biased exponent value from Packet p, and casts the results to +// a floating-point Packet type. Used by pfrexp_generic. Override this if +// there is no unpacket_traits::integer_packet. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic_get_biased_exponent(const Packet& p); + +/** Default implementation of pldexp. + * It is expected to be called by implementers of template<> pldexp. + */ +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pldexp_generic(const Packet& a, const Packet& exponent); + +/** \internal \returns log(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_float(const Packet _x); + +/** \internal \returns log2(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_float(const Packet _x); + +/** \internal \returns log(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_double(const Packet _x); + +/** \internal \returns log2(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_double(const Packet _x); + +/** \internal \returns log(1 + x) */ +template +Packet generic_plog1p(const Packet& x); + +/** \internal \returns exp(x)-1 */ +template +Packet generic_expm1(const Packet& x); + +/** \internal \returns exp(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_float(const Packet _x); + +/** \internal \returns exp(x) for double precision real numbers */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_double(const Packet _x); + +/** \internal \returns sin(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psin_float(const Packet& x); + +/** \internal \returns cos(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pcos_float(const Packet& x); + +/** \internal \returns sqrt(x) for complex types */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psqrt_complex(const Packet& a); + +template struct ppolevl; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/Half.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/Half.h new file mode 100644 index 0000000000..9f8e8cc1e7 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/Half.h @@ -0,0 +1,942 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. + + +#ifndef EIGEN_HALF_H +#define EIGEN_HALF_H + +#include + +#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +// When compiling with GPU support, the "__half_raw" base class as well as +// some other routines are defined in the GPU compiler header files +// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr +// As a consequence, we get compile failures when compiling Eigen with +// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building +// Eigen with GPU support + #pragma push_macro("EIGEN_CONSTEXPR") + #undef EIGEN_CONSTEXPR + #define EIGEN_CONSTEXPR +#endif + +#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \ + template <> \ + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED \ + PACKET_F16 METHOD(const PACKET_F16& _x) { \ + return float2half(METHOD(half2float(_x))); \ + } + +namespace Eigen { + +struct half; + +namespace half_impl { + +// We want to use the __half_raw struct from the HIP header file only during the device compile phase. +// This is required because of a quirk in the way TensorFlow GPU builds are done. +// When compiling TensorFlow source code with GPU support, files that +// * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc +// * do not contain GPU kernels ( i.e. *.cc files) are compiled via gcc (typically) +// +// Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that +// * are defined in a file that gets compiled via hipcc AND +// * have Eigen::half as a pass-by-value argument AND +// * are called in a file that gets compiled via gcc +// +// In the scenario described above the caller and callee will see different versions +// of the Eigen::half base class __half_raw, and they will be compiled by different compilers +// +// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in +// the callee getting corrupted values for the Eigen::half argument. +// +// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves +// this error, and hence the following convoluted #if condition +#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { +#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)) + // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF) + // The element type for shared memory cannot have non-trivial constructors + // and hence the following special casing (which skips the zero-initilization). + // Note that this check gets done even in the host compilation phase, and + // hence the need for this + EIGEN_DEVICE_FUNC __half_raw() {} +#else + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {} +#endif +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) { + } + __fp16 x; +#else + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {} + numext::uint16_t x; +#endif +}; + +#elif defined(EIGEN_HAS_HIP_FP16) + // Nothing to do here + // HIP fp16 header file has a definition for __half_raw +#elif defined(EIGEN_HAS_CUDA_FP16) + #if EIGEN_CUDA_SDK_VER < 90000 + // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw + typedef __half __half_raw; + #endif // defined(EIGEN_HAS_CUDA_FP16) +#elif defined(SYCL_DEVICE_ONLY) + typedef cl::sycl::half __half_raw; +#endif + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); + +struct half_base : public __half_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {} + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); } + #elif defined(EIGEN_HAS_CUDA_FP16) + #if EIGEN_CUDA_SDK_VER >= 90000 + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + #endif + #endif +#endif +}; + +} // namespace half_impl + +// Class definition. +struct half : public half_impl::half_base { + + // Writing this out as separate #if-else blocks to make the code easier to follow + // The same applies to most #if-else blocks in this file +#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) + // Use the same base class for the following two scenarios + // * when compiling without GPU support enabled + // * during host compile phase when compiling with GPU support enabled + typedef half_impl::__half_raw __half_raw; +#elif defined(EIGEN_HAS_HIP_FP16) + // Nothing to do here + // HIP fp16 header file has a definition for __half_raw +#elif defined(EIGEN_HAS_CUDA_FP16) + // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so + // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within + // #if defined(EIGEN_HAS_CUDA_FP16) is needed + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + typedef half_impl::__half_raw __half_raw; + #endif +#endif + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {} + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {} + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} + #elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} + #endif + #endif +#endif + + + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b) + : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + template + explicit EIGEN_DEVICE_FUNC half(T val) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(val))) {} + explicit EIGEN_DEVICE_FUNC half(float f) + : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} + + // Following the convention of numpy, converting between complex and + // float will lead to loss of imag value. + template + explicit EIGEN_DEVICE_FUNC half(std::complex c) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(c.real()))) {} + + EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless. + return half_impl::half_to_float(*this); + } + +#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE) + EIGEN_DEVICE_FUNC operator __half() const { + ::__half_raw hr; + hr.x = x; + return __half(hr); + } +#endif +}; + +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); } + static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // end namespace std + +namespace Eigen { + +namespace half_impl { + +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) +// Note: We deliberatly do *not* define this to 1 even if we have Arm's native +// fp16 type since GPU halfs are rather different from native CPU halfs. +// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16 +#define EIGEN_HAS_NATIVE_FP16 +#endif + +// Intrinsics for native fp16 support. Note that on current hardware, +// these are no faster than fp32 arithmetic (you need to use the half2 +// versions to get the ALU speed increased), but you do save the +// conversion steps back and forth. + +#if defined(EIGEN_HAS_NATIVE_FP16) +EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { +#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + return __hadd(::__half(a), ::__half(b)); +#else + return __hadd(a, b); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { +#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + return __hdiv(a, b); +#else + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { + return __hneg(a); +} +EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { + a = a + b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { + a = a * b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { + a = a - b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { + a = a / b; + return a; +} +EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { + return __heq(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { + return __hne(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { + return __hle(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { + return __hgt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { + return __hge(a, b); +} +#endif + +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(vaddh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(vmulh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(vsubh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(vdivh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + return half(vnegh_f16(a.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(vaddh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(vmulh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(vsubh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(vdivh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return vceqh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return !vceqh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return vclth_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return vcleh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return vcgth_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return vcgeh_f16(a.x, b.x); +} +// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, +// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation +// of the functions, while the latter can only deal with one of them. +#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +// We need to provide emulated *host-side* FP16 operators for clang. +#pragma push_macro("EIGEN_DEVICE_FUNC") +#undef EIGEN_DEVICE_FUNC +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16) +#define EIGEN_DEVICE_FUNC __host__ +#else // both host and device need emulated ops. +#define EIGEN_DEVICE_FUNC __host__ __device__ +#endif +#endif + +// Definitions for CPUs and older HIP+CUDA, mostly working through conversion +// to/from fp32. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return numext::equal_strict(float(a),float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return numext::not_equal_strict(float(a), float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return float(a) >= float(b); +} + +#if defined(__clang__) && defined(__CUDA__) +#pragma pop_macro("EIGEN_DEVICE_FUNC") +#endif +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { + return half(static_cast(a) / static_cast(b)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) { + a += half(1); + return a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) { + a -= half(1); + return a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a, int) { + half original_value = a; + ++a; + return original_value; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) { + half original_value = a; + --a; + return original_value; +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) { + // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type + // in the hip_fp16 header file, and that will trigger a compile error + // On the other hand, having anything but a return statement also triggers a compile error + // because this is constexpr function. + // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out + // of this catch22 by having separate bodies for GPU / non GPU +#if defined(EIGEN_HAS_GPU_FP16) + __half_raw h; + h.x = x; + return h; +#else + return __half_raw(x); +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) { + // HIP/CUDA/Default have a member 'x' of type uint16_t. + // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast. + // For SYCL, cl::sycl::half is _Float16, so cast directly. +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return numext::bit_cast(h.x); +#elif defined(SYCL_DEVICE_ONLY) + return numext::bit_cast(h); +#else + return h.x; +#endif +} + +union float32_bits { + unsigned int u; + float f; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; + +#elif defined(EIGEN_HAS_FP16_C) + __half_raw h; + h.x = _cvtss_sh(ff, 0); + return h; + +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + __half_raw h; + h.x = static_cast<__fp16>(ff); + return h; + +#else + float32_bits f; f.f = ff; + + const float32_bits f32infty = { 255 << 23 }; + const float32_bits f16max = { (127 + 16) << 23 }; + const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + __half_raw o; + o.x = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast(f.u >> 13); + } + } + + o.x |= static_cast(sign >> 16); + return o; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __half2float(h); +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return static_cast(h.x); +#else + const float32_bits magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + float32_bits o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { +#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + return (numext::bit_cast(a.x) & 0x7fff) == 0x7c00; +#else + return (a.x & 0x7fff) == 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hisnan(a); +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return (numext::bit_cast(a.x) & 0x7fff) > 0x7c00; +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return half(vabsh_f16(a.x)); +#else + half result; + result.x = a.x & 0x7FFF; + return result; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { + return half(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { +#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return half(::hlog(a)); +#else + return half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { + return half(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { + return half(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) { + return half(static_cast(EIGEN_LOG2E) * ::logf(float(a))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { + return half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { + return half(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { + return half(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { + return half(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { + return half(::tanhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) { + return half(::asinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { + return half(::acosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hfloor(a)); +#else + return half(::floorf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hceil(a)); +#else + return half(::ceilf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) { + return half(::rintf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) { + return half(::roundf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) { + return half(::fmodf(float(a), float(b))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f2 < f1 ? b : a; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f1 < f2 ? b : a; +#endif +} + +#ifndef EIGEN_NO_IO +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { + os << static_cast(v); + return os; +} +#endif + +} // end namespace half_impl + +// import Eigen::half_impl::half into Eigen namespace +// using half_impl::half; + +namespace internal { + +template<> +struct random_default_impl +{ + static inline half run(const half& x, const half& y) + { + return x + (y-x) * half(float(std::rand()) / float(RAND_MAX)); + } + static inline half run() + { + return run(half(-1.f), half(1.f)); + } +}; + +template<> struct is_arithmetic { enum { value = true }; }; + +} // end namespace internal + +template<> struct NumTraits + : GenericNumTraits +{ + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() { + return half_impl::raw_uint16_to_half(0x0800); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { + return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() { + return half_impl::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() { + return half_impl::raw_uint16_to_half(0xfbff); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() { + return half_impl::raw_uint16_to_half(0x7c00); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + return half_impl::raw_uint16_to_half(0x7e00); + } +}; + +} // end namespace Eigen + +#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + #pragma pop_macro("EIGEN_CONSTEXPR") +#endif + +namespace Eigen { +namespace numext { + +#if defined(EIGEN_GPU_COMPILE_PHASE) + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) { + return (half_impl::isnan)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) { + return (half_impl::isinf)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) { + return (half_impl::isfinite)(h); +} + +#endif + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast(const uint16_t& src) { + return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src)); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast(const Eigen::half& src) { + return Eigen::half_impl::raw_half_as_uint16(src); +} + +} // namespace numext +} // namespace Eigen + +// Add the missing shfl* intrinsics. +// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300. +// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)) +// +// HIP and CUDA prior to SDK 9.0 define +// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float +// CUDA since 9.0 deprecates those and instead defines +// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync, +// with native support for __half and __nv_bfloat16 +// +// Note that the following are __device__ - only functions. +#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \ + || defined(EIGEN_HIPCC) + +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000 + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_sync(mask, h, srcLane, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_up_sync(mask, h, delta, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_down_sync(mask, h, delta, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_xor_sync(mask, h, laneMask, width)); +} + +#else // HIP or CUDA SDK < 9.0 + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl(ivar, srcLane, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_up(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_down(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_xor(ivar, laneMask, width))); +} + +#endif // HIP vs CUDA +#endif // __shfl* + +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. +#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \ + || defined(EIGEN_HIPCC) +EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast(ptr))); +} +#endif // __ldg + +#if EIGEN_HAS_STD_HASH +namespace std { +template <> +struct hash { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { + return static_cast(Eigen::numext::bit_cast(a)); + } +}; +} // end namespace std +#endif + +#endif // EIGEN_HALF_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/Settings.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/Settings.h index 097373c84d..a5c3ada4c7 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/Settings.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/Settings.h @@ -21,7 +21,7 @@ * it does not correspond to the number of iterations or the number of instructions */ #ifndef EIGEN_UNROLLING_LIMIT -#define EIGEN_UNROLLING_LIMIT 100 +#define EIGEN_UNROLLING_LIMIT 110 #endif /** Defines the threshold between a "small" and a "large" matrix. diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/TypeCasting.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/TypeCasting.h new file mode 100644 index 0000000000..fb8183b78e --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/Default/TypeCasting.h @@ -0,0 +1,120 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GENERIC_TYPE_CASTING_H +#define EIGEN_GENERIC_TYPE_CASTING_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(static_cast(a)); + #else + return Eigen::half(static_cast(a)); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __half2float(a); + #else + return static_cast(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::bfloat16 result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const float& a) const { + return Eigen::bfloat16(a); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::bfloat16 result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const int& a) const { + return Eigen::bfloat16(static_cast(a)); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::bfloat16& a) const { + return static_cast(a); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +} +} + +#endif // EIGEN_GENERIC_TYPE_CASTING_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/Complex.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/Complex.h index 306a309beb..f40af7f87f 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/Complex.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/Complex.h @@ -15,9 +15,10 @@ namespace Eigen { namespace internal { -inline uint32x4_t p4ui_CONJ_XOR() { +inline uint32x4_t p4ui_CONJ_XOR() +{ // See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; return ret; #else @@ -26,61 +27,136 @@ inline uint32x4_t p4ui_CONJ_XOR() { #endif } -inline uint32x2_t p2ui_CONJ_XOR() { +inline uint32x2_t p2ui_CONJ_XOR() +{ static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 }; return vld1_u32( conj_XOR_DATA ); } //---------- float ---------- + +struct Packet1cf +{ + EIGEN_STRONG_INLINE Packet1cf() {} + EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {} + Packet2f v; +}; struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} - Packet4f v; + Packet4f v; }; -template<> struct packet_traits > : default_packet_traits +template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; - typedef Packet2cf half; - enum { + typedef Packet1cf half; + enum + { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket = 0, + HasHalfPacket = 1, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, HasSetLinear = 0 }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; - -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +template<> struct unpacket_traits { - float32x2_t r64; - r64 = vld1_f32((const float *)&from); + typedef std::complex type; + typedef Packet1cf half; + typedef Packet2f as_real; + enum + { + size = 1, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef std::complex type; + typedef Packet1cf half; + typedef Packet4f as_real; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> EIGEN_STRONG_INLINE Packet1cf pcast(const float& a) +{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); } +template<> EIGEN_STRONG_INLINE Packet2cf pcast(const Packet2f& a) +{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); } + +template<> EIGEN_STRONG_INLINE Packet1cf pset1(const std::complex& from) +{ return Packet1cf(vld1_f32(reinterpret_cast(&from))); } +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + const float32x2_t r64 = vld1_f32(reinterpret_cast(&from)); return Packet2cf(vcombine_f32(r64, r64)); } -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cf padd(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(padd(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf psub(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(psub(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) +{ + const Packet2ui b = vreinterpret_u32_f32(a.v); + return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); +} template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - Packet4ui b = vreinterpretq_u32_f32(a.v); + const Packet4ui b = vreinterpretq_u32_f32(a.v); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } +template<> EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) +{ + Packet2f v1, v2; + + // Get the real values of a | a1_re | a1_re | + v1 = vdup_lane_f32(a.v, 0); + // Get the imag values of a | a1_im | a1_im | + v2 = vdup_lane_f32(a.v, 1); + // Multiply the real a with b + v1 = vmul_f32(v1, b.v); + // Multiply the imag a with b + v2 = vmul_f32(v2, b.v); + // Conjugate v2 + v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); + // Swap real/imag elements in v2. + v2 = vrev64_f32(v2); + // Add and return the result + return Packet1cf(vadd_f32(v1, v2)); +} template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { Packet4f v1, v2; @@ -93,7 +169,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con v1 = vmulq_f32(v1, b.v); // Multiply the imag a with b v2 = vmulq_f32(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR())); // Swap real/imag elements in v2. v2 = vrev64q_f32(v2); @@ -101,98 +177,144 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con return Packet2cf(vaddq_f32(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) +template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b) { - return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a[0])==re(b[0]), im(a[0])==im(b[0])] + Packet2f eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a[0])==im(b[0]), re(a[0])==re(b[0])] + Packet2f eq_swapped = vrev64_f32(eq); + // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cf(pand(eq, eq_swapped)); } -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { - return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])] + Packet4f eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])] + Packet4f eq_swapped = vrev64q_f32(eq); + // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet2cf(pand(eq, eq_swapped)); } + +template<> EIGEN_STRONG_INLINE Packet1cf pand(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cf por(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cf pxor(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cf pandnot(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cf pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cf ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cf ploaddup(const std::complex* from) +{ return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) +{ return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cf& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet2cf& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } + +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cf& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet2cf& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather, Packet1cf>( + const std::complex* from, Index stride) { - return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); + const Packet2f tmp = vdup_n_f32(std::real(from[0*stride])); + return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1)); } - -template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } - -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } - -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } - -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( + const std::complex* from, Index stride) { - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + Packet4f res = vdupq_n_f32(std::real(from[0*stride])); res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); return Packet2cf(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cf>( + std::complex* to, const Packet1cf& from, Index stride) +{ to[stride*0] = std::complex(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); } +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>( + std::complex* to, const Packet2cf& from, Index stride) { to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) +{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cf& a) { - std::complex EIGEN_ALIGN16 x[2]; - vst1q_f32((float *)x, a.v); + EIGEN_ALIGN16 std::complex x; + vst1_f32(reinterpret_cast(&x), a.v); + return x; +} +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + EIGEN_ALIGN16 std::complex x[2]; + vst1q_f32(reinterpret_cast(x), a.v); return x[0]; } +template<> EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - float32x2_t a_lo, a_hi; - Packet4f a_r128; - - a_lo = vget_low_f32(a.v); - a_hi = vget_high_f32(a.v); - a_r128 = vcombine_f32(a_hi, a_lo); - - return Packet2cf(a_r128); -} +{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip(const Packet1cf& a) +{ return Packet1cf(vrev64_f32(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) -{ - return Packet2cf(vrev64q_f32(a.v)); -} +{ return Packet2cf(vrev64q_f32(a.v)); } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cf& a) +{ + std::complex s; + vst1_f32((float *)&s, a.v); + return s; +} template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { - float32x2_t a1, a2; std::complex s; - - a1 = vget_low_f32(a.v); - a2 = vget_high_f32(a.v); - a2 = vadd_f32(a1, a2); - vst1_f32((float *)&s, a2); - + vst1_f32(reinterpret_cast(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v))); return s; } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cf& a) { - Packet4f sum1, sum2, sum; - - // Add the first two 64-bit float32x2_t of vecs[0] - sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v)); - sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v)); - sum = vaddq_f32(sum1, sum2); - - return Packet2cf(sum); + std::complex s; + vst1_f32((float *)&s, a.v); + return s; } - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { float32x2_t a1, a2, v1, v2, prod; @@ -208,90 +330,67 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P v1 = vmul_f32(v1, a2); // Multiply the imag a with b v2 = vmul_f32(v2, a2); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); // Swap real/imag elements in v2. v2 = vrev64_f32(v2); // Add v1, v2 prod = vadd_f32(v1, v2); - vst1_f32((float *)&s, prod); + vst1_f32(reinterpret_cast(&s), prod); return s; } -template -struct palign_impl -{ - EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = vextq_f32(first.v, second.v, 2); - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f) EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) +template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, const Packet1cf& b) +{ + // TODO optimize it for NEON + Packet1cf res = pmul(a, pconj(b)); + Packet2f s, rev_s; + + // this computes the norm + s = vmul_f32(b.v, b.v); + rev_s = vrev64_f32(s); + + return Packet1cf(pdiv(res.v, vadd_f32(s, rev_s))); +} template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a,pconj(b)); Packet4f s, rev_s; // this computes the norm s = vmulq_f32(b.v, b.v); rev_s = vrev64q_f32(s); - return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); + return Packet2cf(pdiv(res.v, vaddq_f32(s, rev_s))); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& /*kernel*/) {} +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) +{ Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet1cf psqrt(const Packet1cf& a) { + return psqrt_complex(a); +} + +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { + return psqrt_complex(a); +} + //---------- double ---------- #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG // See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; #else const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; @@ -309,7 +408,8 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 0, size = 1, @@ -328,24 +428,50 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits +{ + typedef std::complex type; + typedef Packet1cd half; + typedef Packet2d as_real; + enum + { + size=1, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; -template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload(reinterpret_cast(from))); } -template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) -{ /* here we really have to use unaligned loads :( */ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu(reinterpret_cast(from))); } -template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ + /* here we really have to use unaligned loads :( */ + return ploadu(&from); +} + +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(padd(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(psub(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) +{ return Packet1cd(pnegate(a.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d v1, v2; - // Get the real values of a + // Get the real values of a v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); // Get the imag values of a v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); @@ -353,7 +479,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con v1 = vmulq_f64(v1, b.v); // Multiply the imag a with b v2 = vmulq_f64(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); // Swap real/imag elements in v2. v2 = preverse(v2); @@ -361,31 +487,44 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(vaddq_f64(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { - return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a)==re(b), im(a)==im(b)] + Packet2d eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a)==im(b), re(a)==re(b)] + Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq))); + // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cd(pand(eq, eq_swapped)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} +{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) +{ return pset1(*from); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cd& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cd& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) +{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( + const std::complex* from, Index stride) { Packet2d res = pset1(0.0); res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); @@ -393,17 +532,14 @@ template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Pack return Packet1cd(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) -{ - to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); -} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>( + std::complex* to, const Packet1cd& from, Index stride) +{ to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); } - -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); - return res; } @@ -411,59 +547,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); Packet2d rev_s = preverse(s); @@ -471,9 +562,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} +{ return Packet1cd(preverse(Packet2d(x.v))); } EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -481,6 +570,11 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v)); kernel.packet[1].v = tmp; } + +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) { + return psqrt_complex(a); +} + #endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h new file mode 100644 index 0000000000..3481f337e3 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -0,0 +1,183 @@ +namespace Eigen { +namespace internal { + +#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG + +// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm. +// Here we specialize gebp_traits to eliminate these register spills. +// See #2138. +template<> +struct gebp_traits + : gebp_traits +{ + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + // This volatile inline ASM both acts as a barrier to prevent reordering, + // as well as enforces strict register use. + asm volatile( + "vmla.f32 %q[r], %q[c], %q[alpha]" + : [r] "+w" (r) + : [c] "w" (c), + [alpha] "w" (alpha) + : ); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, + Packet4f& c, Packet4f& tmp, + const LaneIdType&) const { + acc(a, b, c); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket& b, + Packet4f& c, Packet4f& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } +}; + +#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG + +#if EIGEN_ARCH_ARM64 + +template<> +struct gebp_traits + : gebp_traits +{ + typedef float RhsPacket; + typedef float32x4_t RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest = vld1q_f32(b); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f32(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif + } +}; + + +template<> +struct gebp_traits + : gebp_traits +{ + typedef double RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } +}; + +#endif // EIGEN_ARCH_ARM64 + +} // namespace internal +} // namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/MathFunctions.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/MathFunctions.h index 6bb05bb922..fa6615a851 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -5,10 +5,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - #ifndef EIGEN_MATH_FUNCTIONS_NEON_H #define EIGEN_MATH_FUNCTIONS_NEON_H @@ -16,74 +12,62 @@ namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) -{ - Packet4f x = _x; - Packet4f tmp, fx; +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp(const Packet2f& x) +{ return pexp_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& x) +{ return pexp_float(x); } - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog(const Packet2f& x) +{ return plog_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& x) +{ return plog_float(x); } - x = vminq_f32(x, p4f_exp_hi); - x = vmaxq_f32(x, p4f_exp_lo); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin(const Packet2f& x) +{ return psin_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& x) +{ return psin_float(x); } - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos(const Packet2f& x) +{ return pcos_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& x) +{ return pcos_float(x); } - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); +// Hyperbolic Tangent function. +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh(const Packet2f& x) +{ return internal::generic_fast_tanh_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh(const Packet4f& x) +{ return internal::generic_fast_tanh_float(x); } - /* if greater, substract 1 */ - Packet4ui mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh) - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, p4f_cephes_exp_C1); - Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); - z = vmulq_f32(x, x); - y = vaddq_f32(y, p4f_cephes_exp_p1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, p4f_1); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, p4i_0x7f); - mm = vshlq_n_s32(mm, 23); - Packet4f pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; +template <> +EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) { + Packet4f fexponent; + const Packet4bf out = F32ToBf16(pfrexp(Bf16ToF32(a), fexponent)); + exponent = F32ToBf16(fexponent); + return out; } +template <> +EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& exponent) { + return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent))); +} + +//---------- double ---------- + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& x) +{ return pexp_double(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d plog(const Packet2d& x) +{ return plog_double(x); } + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/PacketMath.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/PacketMath.h index 3d5ed0d240..d2aeef4308 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,54 +24,118 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #if EIGEN_ARCH_ARM64 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #else -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif #endif -#if EIGEN_COMP_MSVC +#if EIGEN_COMP_MSVC_STRICT // In MSVC's arm_neon.h header file, all NEON vector types // are aliases to the same underlying type __n128. // We thus have to wrap them to make them different C++ types. // (See also bug 1428) - -template -struct eigen_packet_wrapper -{ - operator T&() { return m_val; } - operator const T&() const { return m_val; } - eigen_packet_wrapper() {} - eigen_packet_wrapper(const T &v) : m_val(v) {} - eigen_packet_wrapper& operator=(const T &v) { - m_val = v; - return *this; - } - - T m_val; -}; -typedef eigen_packet_wrapper Packet2f; -typedef eigen_packet_wrapper Packet4f; -typedef eigen_packet_wrapper Packet4i; -typedef eigen_packet_wrapper Packet2i; -typedef eigen_packet_wrapper Packet4ui; +typedef eigen_packet_wrapper Packet2f; +typedef eigen_packet_wrapper Packet4f; +typedef eigen_packet_wrapper Packet4c; +typedef eigen_packet_wrapper Packet8c; +typedef eigen_packet_wrapper Packet16c; +typedef eigen_packet_wrapper Packet4uc; +typedef eigen_packet_wrapper Packet8uc; +typedef eigen_packet_wrapper Packet16uc; +typedef eigen_packet_wrapper Packet4s; +typedef eigen_packet_wrapper Packet8s; +typedef eigen_packet_wrapper Packet4us; +typedef eigen_packet_wrapper Packet8us; +typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet4i; +typedef eigen_packet_wrapper Packet2ui; +typedef eigen_packet_wrapper Packet4ui; +typedef eigen_packet_wrapper Packet2l; +typedef eigen_packet_wrapper Packet2ul; #else -typedef float32x2_t Packet2f; -typedef float32x4_t Packet4f; -typedef int32x4_t Packet4i; -typedef int32x2_t Packet2i; -typedef uint32x4_t Packet4ui; +typedef float32x2_t Packet2f; +typedef float32x4_t Packet4f; +typedef eigen_packet_wrapper Packet4c; +typedef int8x8_t Packet8c; +typedef int8x16_t Packet16c; +typedef eigen_packet_wrapper Packet4uc; +typedef uint8x8_t Packet8uc; +typedef uint8x16_t Packet16uc; +typedef int16x4_t Packet4s; +typedef int16x8_t Packet8s; +typedef uint16x4_t Packet4us; +typedef uint16x8_t Packet8us; +typedef int32x2_t Packet2i; +typedef int32x4_t Packet4i; +typedef uint32x2_t Packet2ui; +typedef uint32x4_t Packet4ui; +typedef int64x2_t Packet2l; +typedef uint64x2_t Packet2ul; -#endif // EIGEN_COMP_MSVC +#endif // EIGEN_COMP_MSVC_STRICT + +EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ + const float* a = reinterpret_cast(&m); + Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + return res; +} + +// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave +// == false (i.e. shuffle(m, n, mask) equals _mm_shuffle_ps(m, n, mask)), +// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h +// to enable a shared implementation for fast inversion of matrices of size 4. +template +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) +{ + const float* a = reinterpret_cast(&m); + const float* b = reinterpret_cast(&n); + Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + return res; +} + +template<> +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) +{ + const float* a = reinterpret_cast(&m); + const float* b = reinterpret_cast(&n); + Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + return res; +} + +EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));} + +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) +{ + return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(p, q, r, s)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); +} +#define vec4f_duplane(a, p) \ + vdupq_lane_f32(vget_low_f32(a), p) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -98,81 +162,816 @@ typedef uint32x4_t Packet4ui; #define EIGEN_ARM_PREFETCH(ADDR) #endif -template<> struct packet_traits : default_packet_traits +template <> +struct packet_traits : default_packet_traits { typedef Packet4f type; - typedef Packet4f half; // Packet2f intrinsics not implemented yet - enum { + typedef Packet2f half; + enum + { Vectorizable = 1, AlignedOnScalar = 1, size = 4, - HasHalfPacket=0, // Packet2f intrinsics not implemented yet - - HasDiv = 1, - // FIXME check the Has* - HasSin = 0, - HasCos = 0, - HasLog = 0, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, HasExp = 1, - HasSqrt = 0 + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. + HasNdtri = 0 }; }; -template<> struct packet_traits : default_packet_traits + +template <> +struct packet_traits : default_packet_traits { - typedef Packet4i type; - typedef Packet4i half; // Packet2i intrinsics not implemented yet - enum { + typedef Packet16c type; + typedef Packet8c half; + enum + { Vectorizable = 1, AlignedOnScalar = 1, - size=4, - HasHalfPacket=0 // Packet2i intrinsics not implemented yet - // FIXME check the Has* + size = 16, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 }; }; -#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM -// workaround gcc 4.2, 4.3 and 4.4 compilatin issue +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet16uc type; + typedef Packet8uc half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet8s type; + typedef Packet4s half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet8us type; + typedef Packet4us half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet4i type; + typedef Packet2i half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet4ui type; + typedef Packet2ui half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet2l type; + typedef Packet2l half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet2ul type; + typedef Packet2ul half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM +// workaround gcc 4.2, 4.3 and 4.4 compilation issue EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } -EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } +EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } +EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet2f half; + typedef Packet2i integer_packet; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet2f half; + typedef Packet4i integer_packet; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet4c half; + enum + { + size = 4, + alignment = Unaligned, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet4c half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet8c half; + enum + { + size = 16, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet4uc half; + enum + { + size = 4, + alignment = Unaligned, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet4uc half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet8uc half; + enum + { + size = 16, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false}; +}; +template<> struct unpacket_traits +{ + typedef int16_t type; + typedef Packet4s half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int16_t type; + typedef Packet4s half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint16_t type; + typedef Packet4us half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint16_t type; + typedef Packet4us half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int32_t type; + typedef Packet2i half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int32_t type; + typedef Packet2i half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint32_t type; + typedef Packet2ui half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint32_t type; + typedef Packet2ui half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int64_t type; + typedef Packet2l half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint64_t type; + typedef Packet2ul half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2f pset1(const float& from) { return vdup_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c pset1(const int8_t& from) +{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pset1(const int8_t& from) { return vdup_n_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c pset1(const int8_t& from) { return vdupq_n_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc pset1(const uint8_t& from) +{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc pset1(const uint8_t& from) { return vdup_n_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc pset1(const uint8_t& from) { return vdupq_n_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s pset1(const int16_t& from) { return vdup_n_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s pset1(const int16_t& from) { return vdupq_n_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us pset1(const uint16_t& from) { return vdup_n_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us pset1(const uint16_t& from) { return vdupq_n_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i pset1(const int32_t& from) { return vdup_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui pset1(const uint32_t& from) { return vdup_n_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui pset1(const uint32_t& from) { return vdupq_n_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l pset1(const int64_t& from) { return vdupq_n_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul pset1(const uint64_t& from) { return vdupq_n_u64(from); } +template<> EIGEN_STRONG_INLINE Packet2f pset1frombits(unsigned int from) +{ return vreinterpret_f32_u32(vdup_n_u32(from)); } +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) +{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); } + +template<> EIGEN_STRONG_INLINE Packet2f plset(const float& a) +{ + const float c[] = {0.0f,1.0f}; + return vadd_f32(pset1(a), vld1_f32(c)); +} template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { - const float f[] = {0, 1, 2, 3}; - Packet4f countdown = vld1q_f32(f); - return vaddq_f32(pset1(a), countdown); + const float c[] = {0.0f,1.0f,2.0f,3.0f}; + return vaddq_f32(pset1(a), vld1q_f32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4c plset(const int8_t& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c plset(const int8_t& a) +{ + const int8_t c[] = {0,1,2,3,4,5,6,7}; + return vadd_s8(pset1(a), vld1_s8(c)); +} +template<> EIGEN_STRONG_INLINE Packet16c plset(const int8_t& a) +{ + const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + return vaddq_s8(pset1(a), vld1q_s8(c)); +} +template<> EIGEN_STRONG_INLINE Packet4uc plset(const uint8_t& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc plset(const uint8_t& a) +{ + const uint8_t c[] = {0,1,2,3,4,5,6,7}; + return vadd_u8(pset1(a), vld1_u8(c)); +} +template<> EIGEN_STRONG_INLINE Packet16uc plset(const uint8_t& a) +{ + const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + return vaddq_u8(pset1(a), vld1q_u8(c)); +} +template<> EIGEN_STRONG_INLINE Packet4s plset(const int16_t& a) +{ + const int16_t c[] = {0,1,2,3}; + return vadd_s16(pset1(a), vld1_s16(c)); +} +template<> EIGEN_STRONG_INLINE Packet4us plset(const uint16_t& a) +{ + const uint16_t c[] = {0,1,2,3}; + return vadd_u16(pset1(a), vld1_u16(c)); +} +template<> EIGEN_STRONG_INLINE Packet8s plset(const int16_t& a) +{ + const int16_t c[] = {0,1,2,3,4,5,6,7}; + return vaddq_s16(pset1(a), vld1q_s16(c)); +} +template<> EIGEN_STRONG_INLINE Packet8us plset(const uint16_t& a) +{ + const uint16_t c[] = {0,1,2,3,4,5,6,7}; + return vaddq_u16(pset1(a), vld1q_u16(c)); +} +template<> EIGEN_STRONG_INLINE Packet2i plset(const int32_t& a) +{ + const int32_t c[] = {0,1}; + return vadd_s32(pset1(a), vld1_s32(c)); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { - const int32_t i[] = {0, 1, 2, 3}; - Packet4i countdown = vld1q_s32(i); - return vaddq_s32(pset1(a), countdown); + const int32_t c[] = {0,1,2,3}; + return vaddq_s32(pset1(a), vld1q_s32(c)); +} +template<> EIGEN_STRONG_INLINE Packet2ui plset(const uint32_t& a) +{ + const uint32_t c[] = {0,1}; + return vadd_u32(pset1(a), vld1_u32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4ui plset(const uint32_t& a) +{ + const uint32_t c[] = {0,1,2,3}; + return vaddq_u32(pset1(a), vld1q_u32(c)); +} +template<> EIGEN_STRONG_INLINE Packet2l plset(const int64_t& a) +{ + const int64_t c[] = {0,1}; + return vaddq_s64(pset1(a), vld1q_s64(c)); +} +template<> EIGEN_STRONG_INLINE Packet2ul plset(const uint64_t& a) +{ + const uint64_t c[] = {0,1}; + return vaddq_u64(pset1(a), vld1q_u64(c)); } +template<> EIGEN_STRONG_INLINE Packet2f padd(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c padd(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vadd_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c padd(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c padd(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc padd(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vadd_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc padd(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc padd(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s padd(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us padd(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us padd(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i padd(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui padd(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui padd(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l padd(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul padd(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2f psub(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c psub(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vsub_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c psub(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c psub(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc psub(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vsub_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc psub(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s psub(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us psub(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us psub(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i psub(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui psub(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui psub(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l psub(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b); +template<> EIGEN_STRONG_INLINE Packet2f paddsub(const Packet2f& a, const Packet2f & b) { + Packet2f mask = {numext::bit_cast(0x80000000u), 0.0f}; + return padd(a, pxor(mask, b)); +} +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { + Packet4f mask = {numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f}; + return padd(a, pxor(mask, b)); +} + +template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); } template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); } +template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); } +template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) { +#if EIGEN_ARCH_ARM64 + return vnegq_s64(a); +#else + return vcombine_s64( + vdup_n_s64(-vgetq_lane_s64(a, 0)), + vdup_n_s64(-vgetq_lane_s64(a, 1))); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2f pmul(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c pmul(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmul_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmul(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmul(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmul(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmul_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmul(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmul(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmul(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmul(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmul(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmul(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmul(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pmul(const Packet2l& a, const Packet2l& b) { + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)), + vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1))); +} +template<> EIGEN_STRONG_INLINE Packet2ul pmul(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)), + vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1))); +} +template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) +{ +#if EIGEN_ARCH_ARM64 + return vdiv_f32(a,b); +#else + Packet2f inv, restep, div; + + // NEON does not offer a divide instruction, we have to do a reciprocal approximation + // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers + // a reciprocal estimate AND a reciprocal step -which saves a few instructions + // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with + // Newton-Raphson and vrecpsq_f32() + inv = vrecpe_f32(b); + + // This returns a differential, by which we will have to multiply inv to get a better + // approximation of 1/b. + restep = vrecps_f32(b, inv); + inv = vmul_f32(restep, inv); + + // Finally, multiply a by 1/b and get the wanted result of the division. + div = vmul_f32(a, inv); + + return div; +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { #if EIGEN_ARCH_ARM64 @@ -199,357 +998,2629 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet4c pdiv(const Packet4c& /*a*/, const Packet4c& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet8c pdiv(const Packet8c& /*a*/, const Packet8c& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet16c pdiv(const Packet16c& /*a*/, const Packet16c& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4uc pdiv(const Packet4uc& /*a*/, const Packet4uc& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pdiv(const Packet8uc& /*a*/, const Packet8uc& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet16uc pdiv(const Packet16uc& /*a*/, const Packet16uc& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4s pdiv(const Packet4s& /*a*/, const Packet4s& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet8s pdiv(const Packet8s& /*a*/, const Packet8s& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4us pdiv(const Packet4us& /*a*/, const Packet4us& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet8us pdiv(const Packet8us& /*a*/, const Packet8us& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet2i pdiv(const Packet2i& /*a*/, const Packet2i& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by NEON"); +{ + eigen_assert(false && "packet integer division are not supported by NEON"); return pset1(0); } +template<> EIGEN_STRONG_INLINE Packet2ui pdiv(const Packet2ui& /*a*/, const Packet2ui& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4ui pdiv(const Packet4ui& /*a*/, const Packet4ui& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet2l pdiv(const Packet2l& /*a*/, const Packet2l& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0LL); +} +template<> EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& /*a*/, const Packet2ul& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0ULL); +} -// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, -// then implements a slow software scalar fallback calling fmaf()! -// Filed LLVM bug: -// https://llvm.org/bugs/show_bug.cgi?id=27216 -#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) -// See bug 936. -// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. -// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. -// MLA is not fused i.e. does 2 roundings. -// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): -// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } -#else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { -#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM - // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, - // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on - // -march=armv7-a, that is a very common case. - // See e.g. this thread: - // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html - // Filed LLVM bug: - // https://llvm.org/bugs/show_bug.cgi?id=27219 - Packet4f r = c; - asm volatile( - "vmla.f32 %q[r], %q[a], %q[b]" - : [r] "+w" (r) - : [a] "w" (a), - [b] "w" (b) - : ); - return r; + +#ifdef __ARM_FEATURE_FMA +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ return vfmaq_f32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) +{ return vfma_f32(c,a,b); } #else +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ return vmlaq_f32(c,a,b); -#endif +} +template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) +{ + return vmla_f32(c,a,b); } #endif // No FMA instruction for int, so use MLA unconditionally. -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmla_s8( + vreinterpret_s8_s32(vdup_n_s32(c)), + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) +{ return vmla_s8(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) +{ return vmlaq_s8(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmla_u8( + vreinterpret_u8_u32(vdup_n_u32(c)), + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) +{ return vmla_u8(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) +{ return vmlaq_u8(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) +{ return vmla_s16(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) +{ return vmlaq_s16(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) +{ return vmla_u16(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) +{ return vmlaq_u16(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) +{ return vmla_s32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) +{ return vmlaq_s32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) +{ return vmla_u32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) +{ return vmlaq_u32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2f pabsdiff(const Packet2f& a, const Packet2f& b) +{ return vabd_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pabsdiff(const Packet4f& a, const Packet4f& b) +{ return vabdq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c pabsdiff(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vabd_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pabsdiff(const Packet8c& a, const Packet8c& b) +{ return vabd_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pabsdiff(const Packet16c& a, const Packet16c& b) +{ return vabdq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vabd_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff(const Packet8uc& a, const Packet8uc& b) +{ return vabd_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff(const Packet16uc& a, const Packet16uc& b) +{ return vabdq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pabsdiff(const Packet4s& a, const Packet4s& b) +{ return vabd_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pabsdiff(const Packet8s& a, const Packet8s& b) +{ return vabdq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pabsdiff(const Packet4us& a, const Packet4us& b) +{ return vabd_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pabsdiff(const Packet8us& a, const Packet8us& b) +{ return vabdq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pabsdiff(const Packet2i& a, const Packet2i& b) +{ return vabd_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pabsdiff(const Packet4i& a, const Packet4i& b) +{ return vabdq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff(const Packet2ui& a, const Packet2ui& b) +{ return vabd_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff(const Packet4ui& a, const Packet4ui& b) +{ return vabdq_u32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pmin(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); } +template<> EIGEN_STRONG_INLINE Packet2f pmin(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return pmin(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2f pmin(const Packet2f& a, const Packet2f& b) { return pmin(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4c pmin(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmin_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmin(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmin(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmin(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmin_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmin(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmin(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pmin(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmin(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmin(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmin(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmin(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmin(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pmin(const Packet2l& a, const Packet2l& b) { + return vcombine_s64( + vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))), + vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1)))); +} +template<> EIGEN_STRONG_INLINE Packet2ul pmin(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u64( + vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))), + vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)))); +} + +template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } + +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); } +template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return pmax(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return pmax(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4c pmax(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmax_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmax(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmax(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmax_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmax(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmax(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pmax(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmax(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmax(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmax(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmax(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmax(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pmax(const Packet2l& a, const Packet2l& b) { + return vcombine_s64( + vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))), + vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1)))); +} +template<> EIGEN_STRONG_INLINE Packet2ul pmax(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u64( + vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))), + vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)))); +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_le(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vcle_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_le(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vcle_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_le(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vcle_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vcle_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le(const Packet8uc& a, const Packet8uc& b) +{ return vcle_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) +{ return vcleq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pcmp_le(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vcle_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_le(const Packet4us& a, const Packet4us& b) +{ return vcle_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) +{ return vcleq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_le(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vcle_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le(const Packet2ui& a, const Packet2ui& b) +{ return vcle_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) +{ return vcleq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vcleq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vcleq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vclt_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vclt_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vclt_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vclt_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt(const Packet8uc& a, const Packet8uc& b) +{ return vclt_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) +{ return vcltq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vclt_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt(const Packet4us& a, const Packet4us& b) +{ return vclt_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) +{ return vcltq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vclt_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt(const Packet2ui& a, const Packet2ui& b) +{ return vclt_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) +{ return vcltq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vcltq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vcltq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vceq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vceq_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vceq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vceq_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq(const Packet8uc& a, const Packet8uc& b) +{ return vceq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) +{ return vceqq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vceq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq(const Packet4us& a, const Packet4us& b) +{ return vceq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) +{ return vceqq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vceq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq(const Packet2ui& a, const Packet2ui& b) +{ return vceq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) +{ return vceqq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vceqq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vceqq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet2f pand(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} +{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pand(const Packet4c& a, const Packet4c& b) +{ return a & b; } +template<> EIGEN_STRONG_INLINE Packet8c pand(const Packet8c& a, const Packet8c& b) +{ return vand_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pand(const Packet16c& a, const Packet16c& b) +{ return vandq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pand(const Packet4uc& a, const Packet4uc& b) +{ return a & b; } +template<> EIGEN_STRONG_INLINE Packet8uc pand(const Packet8uc& a, const Packet8uc& b) +{ return vand_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pand(const Packet16uc& a, const Packet16uc& b) +{ return vandq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pand(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pand(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pand(const Packet4us& a, const Packet4us& b) +{ return vand_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pand(const Packet8us& a, const Packet8us& b) +{ return vandq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pand(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pand(const Packet2ui& a, const Packet2ui& b) +{ return vand_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pand(const Packet4ui& a, const Packet4ui& b) +{ return vandq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pand(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pand(const Packet2ul& a, const Packet2ul& b) +{ return vandq_u64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2f por(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} +{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c por(const Packet4c& a, const Packet4c& b) +{ return a | b; } +template<> EIGEN_STRONG_INLINE Packet8c por(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c por(const Packet16c& a, const Packet16c& b) +{ return vorrq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc por(const Packet4uc& a, const Packet4uc& b) +{ return a | b; } +template<> EIGEN_STRONG_INLINE Packet8uc por(const Packet8uc& a, const Packet8uc& b) +{ return vorr_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc por(const Packet16uc& a, const Packet16uc& b) +{ return vorrq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s por(const Packet4s& a, const Packet4s& b) +{ return vorr_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s por(const Packet8s& a, const Packet8s& b) +{ return vorrq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us por(const Packet4us& a, const Packet4us& b) +{ return vorr_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us por(const Packet8us& a, const Packet8us& b) +{ return vorrq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i por(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui por(const Packet2ui& a, const Packet2ui& b) +{ return vorr_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui por(const Packet4ui& a, const Packet4ui& b) +{ return vorrq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l por(const Packet2l& a, const Packet2l& b) +{ return vorrq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul por(const Packet2ul& a, const Packet2ul& b) +{ return vorrq_u64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} +{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pxor(const Packet4c& a, const Packet4c& b) +{ return a ^ b; } +template<> EIGEN_STRONG_INLINE Packet8c pxor(const Packet8c& a, const Packet8c& b) +{ return veor_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pxor(const Packet16c& a, const Packet16c& b) +{ return veorq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pxor(const Packet4uc& a, const Packet4uc& b) +{ return a ^ b; } +template<> EIGEN_STRONG_INLINE Packet8uc pxor(const Packet8uc& a, const Packet8uc& b) +{ return veor_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pxor(const Packet16uc& a, const Packet16uc& b) +{ return veorq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pxor(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pxor(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pxor(const Packet4us& a, const Packet4us& b) +{ return veor_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pxor(const Packet8us& a, const Packet8us& b) +{ return veorq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pxor(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pxor(const Packet2ui& a, const Packet2ui& b) +{ return veor_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pxor(const Packet4ui& a, const Packet4ui& b) +{ return veorq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pxor(const Packet2l& a, const Packet2l& b) +{ return veorq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pxor(const Packet2ul& a, const Packet2ul& b) +{ return veorq_u64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2f pandnot(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pandnot(const Packet4c& a, const Packet4c& b) +{ return a & ~b; } +template<> EIGEN_STRONG_INLINE Packet8c pandnot(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pandnot(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pandnot(const Packet4uc& a, const Packet4uc& b) +{ return a & ~b; } +template<> EIGEN_STRONG_INLINE Packet8uc pandnot(const Packet8uc& a, const Packet8uc& b) +{ return vbic_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pandnot(const Packet16uc& a, const Packet16uc& b) +{ return vbicq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pandnot(const Packet4s& a, const Packet4s& b) +{ return vbic_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pandnot(const Packet8s& a, const Packet8s& b) +{ return vbicq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pandnot(const Packet4us& a, const Packet4us& b) +{ return vbic_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pandnot(const Packet8us& a, const Packet8us& b) +{ return vbicq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pandnot(const Packet2i& a, const Packet2i& b) +{ return vbic_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) +{ return vbicq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pandnot(const Packet2ui& a, const Packet2ui& b) +{ return vbic_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pandnot(const Packet4ui& a, const Packet4ui& b) +{ return vbicq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pandnot(const Packet2l& a, const Packet2l& b) +{ return vbicq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pandnot(const Packet2ul& a, const Packet2ul& b) +{ return vbicq_u64(a,b); } + + +template EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); } +template EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); } +template EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); } +template EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); } + +template EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) +{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); } +template EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) +{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); } +template EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) +{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); } +template EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) +{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); } +template EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) +{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) +{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); } +template EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) +{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); } +template EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); } + +template EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); } +template EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); } +template EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); } +template EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); } + +template<> EIGEN_STRONG_INLINE Packet2f pload(const float* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c pload(const int8_t* from) { - return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); + Packet4c res; + memcpy(&res, from, sizeof(Packet4c)); + return res; } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet8c pload(const int8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc pload(const uint8_t* from) +{ + Packet4uc res; + memcpy(&res, from, sizeof(Packet4uc)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8uc pload(const uint8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s pload(const int16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s pload(const int16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us pload(const uint16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us pload(const uint16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i pload(const int32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui pload(const uint32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui pload(const uint32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul pload(const uint64_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } - -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2f ploadu(const float* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c ploadu(const int8_t* from) +{ + Packet4c res; + memcpy(&res, from, sizeof(Packet4c)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8c ploadu(const int8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c ploadu(const int8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc ploadu(const uint8_t* from) +{ + Packet4uc res; + memcpy(&res, from, sizeof(Packet4uc)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8uc ploadu(const uint8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const uint8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s ploadu(const int16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s ploadu(const int16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us ploadu(const uint16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us ploadu(const uint16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i ploadu(const int32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui ploadu(const uint32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploadu(const uint32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l ploadu(const int64_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul ploadu(const uint64_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); } +template<> EIGEN_STRONG_INLINE Packet2f ploaddup(const float* from) +{ return vld1_dup_f32(from); } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); } +template<> EIGEN_STRONG_INLINE Packet4c ploaddup(const int8_t* from) { - float32x2_t lo, hi; - lo = vld1_dup_f32(from); - hi = vld1_dup_f32(from+1); - return vcombine_f32(lo, hi); + const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload(from))); + return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0); } +template<> EIGEN_STRONG_INLINE Packet8c ploaddup(const int8_t* from) +{ + const int8x8_t a = vld1_s8(from); + return vzip_s8(a,a).val[0]; +} +template<> EIGEN_STRONG_INLINE Packet16c ploaddup(const int8_t* from) +{ + const int8x8_t a = vld1_s8(from); + const int8x8x2_t b = vzip_s8(a,a); + return vcombine_s8(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload(from))); + return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vld1_u8(from); + return vzip_u8(a,a).val[0]; +} +template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vld1_u8(from); + const uint8x8x2_t b = vzip_u8(a,a); + return vcombine_u8(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4s ploaddup(const int16_t* from) +{ + return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), + vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const int16_t* from) +{ + const int16x4_t a = vld1_s16(from); + const int16x4x2_t b = vzip_s16(a,a); + return vcombine_s16(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4us ploaddup(const uint16_t* from) +{ + return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), + vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const uint16_t* from) +{ + const uint16x4_t a = vld1_u16(from); + const uint16x4x2_t b = vzip_u16(a,a); + return vcombine_u16(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet2i ploaddup(const int32_t* from) +{ return vld1_dup_s32(from); } template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) +{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); } +template<> EIGEN_STRONG_INLINE Packet2ui ploaddup(const uint32_t* from) +{ return vld1_dup_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploaddup(const uint32_t* from) +{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); } +template<> EIGEN_STRONG_INLINE Packet2l ploaddup(const int64_t* from) +{ return vld1q_dup_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul ploaddup(const uint64_t* from) +{ return vld1q_dup_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet4f ploadquad(const float* from) { return vld1q_dup_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c ploadquad(const int8_t* from) +{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8c ploadquad(const int8_t* from) { - int32x2_t lo, hi; - lo = vld1_dup_s32(from); - hi = vld1_dup_s32(from+1); - return vcombine_s32(lo, hi); + return vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from)), + vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]); } - -template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } - -template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } - -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +template<> EIGEN_STRONG_INLINE Packet16c ploadquad(const int8_t* from) { - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(from[0*stride], res, 0); - res = vsetq_lane_f32(from[1*stride], res, 1); - res = vsetq_lane_f32(from[2*stride], res, 2); - res = vsetq_lane_f32(from[3*stride], res, 3); + const int8x8_t a = vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from)), + vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]); + const int8x8_t b = vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from+2)), + vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]); + return vcombine_s8(a,b); +} +template<> EIGEN_STRONG_INLINE Packet4uc ploadquad(const uint8_t* from) +{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc ploadquad(const uint8_t* from) +{ + return vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from)), + vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet16uc ploadquad(const uint8_t* from) +{ + const uint8x8_t a = vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from)), + vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]); + const uint8x8_t b = vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from+2)), + vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]); + return vcombine_u8(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8s ploadquad(const int16_t* from) +{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); } +template<> EIGEN_STRONG_INLINE Packet8us ploadquad(const uint16_t* from) +{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); } +template<> EIGEN_STRONG_INLINE Packet4i ploadquad(const int32_t* from) { return vld1q_dup_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploadquad(const uint32_t* from) { return vld1q_dup_u32(from); } + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet2f& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet4c& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet8c& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet4uc& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet8uc& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet4s& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet8s& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet4us& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet8us& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet2i& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet2ui& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet4ui& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint64_t* to, const Packet2ul& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet2f& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet4c& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet8c& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet16c& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet4uc& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet8uc& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet16uc& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet4s& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet8s& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const Packet4us& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const Packet8us& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet2i& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet2ui& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet4ui& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int64_t* to, const Packet2l& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint64_t* to, const Packet2ul& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather(const float* from, Index stride) +{ + Packet2f res = vld1_dup_f32(from); + res = vld1_lane_f32(from + 1*stride, res, 1); return res; } -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather(const float* from, Index stride) { - Packet4i res = pset1(0); - res = vsetq_lane_s32(from[0*stride], res, 0); - res = vsetq_lane_s32(from[1*stride], res, 1); - res = vsetq_lane_s32(from[2*stride], res, 2); - res = vsetq_lane_s32(from[3*stride], res, 3); + Packet4f res = vld1q_dup_f32(from); + res = vld1q_lane_f32(from + 1*stride, res, 1); + res = vld1q_lane_f32(from + 2*stride, res, 2); + res = vld1q_lane_f32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather(const int8_t* from, Index stride) +{ + Packet4c res; + for (int i = 0; i != 4; i++) + reinterpret_cast(&res)[i] = *(from + i * stride); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather(const int8_t* from, Index stride) +{ + Packet8c res = vld1_dup_s8(from); + res = vld1_lane_s8(from + 1*stride, res, 1); + res = vld1_lane_s8(from + 2*stride, res, 2); + res = vld1_lane_s8(from + 3*stride, res, 3); + res = vld1_lane_s8(from + 4*stride, res, 4); + res = vld1_lane_s8(from + 5*stride, res, 5); + res = vld1_lane_s8(from + 6*stride, res, 6); + res = vld1_lane_s8(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather(const int8_t* from, Index stride) +{ + Packet16c res = vld1q_dup_s8(from); + res = vld1q_lane_s8(from + 1*stride, res, 1); + res = vld1q_lane_s8(from + 2*stride, res, 2); + res = vld1q_lane_s8(from + 3*stride, res, 3); + res = vld1q_lane_s8(from + 4*stride, res, 4); + res = vld1q_lane_s8(from + 5*stride, res, 5); + res = vld1q_lane_s8(from + 6*stride, res, 6); + res = vld1q_lane_s8(from + 7*stride, res, 7); + res = vld1q_lane_s8(from + 8*stride, res, 8); + res = vld1q_lane_s8(from + 9*stride, res, 9); + res = vld1q_lane_s8(from + 10*stride, res, 10); + res = vld1q_lane_s8(from + 11*stride, res, 11); + res = vld1q_lane_s8(from + 12*stride, res, 12); + res = vld1q_lane_s8(from + 13*stride, res, 13); + res = vld1q_lane_s8(from + 14*stride, res, 14); + res = vld1q_lane_s8(from + 15*stride, res, 15); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather(const uint8_t* from, Index stride) +{ + Packet4uc res; + for (int i = 0; i != 4; i++) + reinterpret_cast(&res)[i] = *(from + i * stride); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather(const uint8_t* from, Index stride) +{ + Packet8uc res = vld1_dup_u8(from); + res = vld1_lane_u8(from + 1*stride, res, 1); + res = vld1_lane_u8(from + 2*stride, res, 2); + res = vld1_lane_u8(from + 3*stride, res, 3); + res = vld1_lane_u8(from + 4*stride, res, 4); + res = vld1_lane_u8(from + 5*stride, res, 5); + res = vld1_lane_u8(from + 6*stride, res, 6); + res = vld1_lane_u8(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather(const uint8_t* from, Index stride) +{ + Packet16uc res = vld1q_dup_u8(from); + res = vld1q_lane_u8(from + 1*stride, res, 1); + res = vld1q_lane_u8(from + 2*stride, res, 2); + res = vld1q_lane_u8(from + 3*stride, res, 3); + res = vld1q_lane_u8(from + 4*stride, res, 4); + res = vld1q_lane_u8(from + 5*stride, res, 5); + res = vld1q_lane_u8(from + 6*stride, res, 6); + res = vld1q_lane_u8(from + 7*stride, res, 7); + res = vld1q_lane_u8(from + 8*stride, res, 8); + res = vld1q_lane_u8(from + 9*stride, res, 9); + res = vld1q_lane_u8(from + 10*stride, res, 10); + res = vld1q_lane_u8(from + 11*stride, res, 11); + res = vld1q_lane_u8(from + 12*stride, res, 12); + res = vld1q_lane_u8(from + 13*stride, res, 13); + res = vld1q_lane_u8(from + 14*stride, res, 14); + res = vld1q_lane_u8(from + 15*stride, res, 15); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather(const int16_t* from, Index stride) +{ + Packet4s res = vld1_dup_s16(from); + res = vld1_lane_s16(from + 1*stride, res, 1); + res = vld1_lane_s16(from + 2*stride, res, 2); + res = vld1_lane_s16(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather(const int16_t* from, Index stride) +{ + Packet8s res = vld1q_dup_s16(from); + res = vld1q_lane_s16(from + 1*stride, res, 1); + res = vld1q_lane_s16(from + 2*stride, res, 2); + res = vld1q_lane_s16(from + 3*stride, res, 3); + res = vld1q_lane_s16(from + 4*stride, res, 4); + res = vld1q_lane_s16(from + 5*stride, res, 5); + res = vld1q_lane_s16(from + 6*stride, res, 6); + res = vld1q_lane_s16(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather(const uint16_t* from, Index stride) +{ + Packet4us res = vld1_dup_u16(from); + res = vld1_lane_u16(from + 1*stride, res, 1); + res = vld1_lane_u16(from + 2*stride, res, 2); + res = vld1_lane_u16(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather(const uint16_t* from, Index stride) +{ + Packet8us res = vld1q_dup_u16(from); + res = vld1q_lane_u16(from + 1*stride, res, 1); + res = vld1q_lane_u16(from + 2*stride, res, 2); + res = vld1q_lane_u16(from + 3*stride, res, 3); + res = vld1q_lane_u16(from + 4*stride, res, 4); + res = vld1q_lane_u16(from + 5*stride, res, 5); + res = vld1q_lane_u16(from + 6*stride, res, 6); + res = vld1q_lane_u16(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather(const int32_t* from, Index stride) +{ + Packet2i res = vld1_dup_s32(from); + res = vld1_lane_s32(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather(const int32_t* from, Index stride) +{ + Packet4i res = vld1q_dup_s32(from); + res = vld1q_lane_s32(from + 1*stride, res, 1); + res = vld1q_lane_s32(from + 2*stride, res, 2); + res = vld1q_lane_s32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather(const uint32_t* from, Index stride) +{ + Packet2ui res = vld1_dup_u32(from); + res = vld1_lane_u32(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather(const uint32_t* from, Index stride) +{ + Packet4ui res = vld1q_dup_u32(from); + res = vld1q_lane_u32(from + 1*stride, res, 1); + res = vld1q_lane_u32(from + 2*stride, res, 2); + res = vld1q_lane_u32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather(const int64_t* from, Index stride) +{ + Packet2l res = vld1q_dup_s64(from); + res = vld1q_lane_s64(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather(const uint64_t* from, Index stride) +{ + Packet2ul res = vld1q_dup_u64(from); + res = vld1q_lane_u64(from + 1*stride, res, 1); return res; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(float* to, const Packet2f& from, Index stride) { - to[stride*0] = vgetq_lane_f32(from, 0); - to[stride*1] = vgetq_lane_f32(from, 1); - to[stride*2] = vgetq_lane_f32(from, 2); - to[stride*3] = vgetq_lane_f32(from, 3); + vst1_lane_f32(to + stride*0, from, 0); + vst1_lane_f32(to + stride*1, from, 1); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(float* to, const Packet4f& from, Index stride) { - to[stride*0] = vgetq_lane_s32(from, 0); - to[stride*1] = vgetq_lane_s32(from, 1); - to[stride*2] = vgetq_lane_s32(from, 2); - to[stride*3] = vgetq_lane_s32(from, 3); + vst1q_lane_f32(to + stride*0, from, 0); + vst1q_lane_f32(to + stride*1, from, 1); + vst1q_lane_f32(to + stride*2, from, 2); + vst1q_lane_f32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet4c& from, Index stride) +{ + for (int i = 0; i != 4; i++) + *(to + i * stride) = reinterpret_cast(&from)[i]; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet8c& from, Index stride) +{ + vst1_lane_s8(to + stride*0, from, 0); + vst1_lane_s8(to + stride*1, from, 1); + vst1_lane_s8(to + stride*2, from, 2); + vst1_lane_s8(to + stride*3, from, 3); + vst1_lane_s8(to + stride*4, from, 4); + vst1_lane_s8(to + stride*5, from, 5); + vst1_lane_s8(to + stride*6, from, 6); + vst1_lane_s8(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet16c& from, Index stride) +{ + vst1q_lane_s8(to + stride*0, from, 0); + vst1q_lane_s8(to + stride*1, from, 1); + vst1q_lane_s8(to + stride*2, from, 2); + vst1q_lane_s8(to + stride*3, from, 3); + vst1q_lane_s8(to + stride*4, from, 4); + vst1q_lane_s8(to + stride*5, from, 5); + vst1q_lane_s8(to + stride*6, from, 6); + vst1q_lane_s8(to + stride*7, from, 7); + vst1q_lane_s8(to + stride*8, from, 8); + vst1q_lane_s8(to + stride*9, from, 9); + vst1q_lane_s8(to + stride*10, from, 10); + vst1q_lane_s8(to + stride*11, from, 11); + vst1q_lane_s8(to + stride*12, from, 12); + vst1q_lane_s8(to + stride*13, from, 13); + vst1q_lane_s8(to + stride*14, from, 14); + vst1q_lane_s8(to + stride*15, from, 15); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint8_t* to, const Packet4uc& from, Index stride) +{ + for (int i = 0; i != 4; i++) + *(to + i * stride) = reinterpret_cast(&from)[i]; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint8_t* to, const Packet8uc& from, Index stride) +{ + vst1_lane_u8(to + stride*0, from, 0); + vst1_lane_u8(to + stride*1, from, 1); + vst1_lane_u8(to + stride*2, from, 2); + vst1_lane_u8(to + stride*3, from, 3); + vst1_lane_u8(to + stride*4, from, 4); + vst1_lane_u8(to + stride*5, from, 5); + vst1_lane_u8(to + stride*6, from, 6); + vst1_lane_u8(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint8_t* to, const Packet16uc& from, Index stride) +{ + vst1q_lane_u8(to + stride*0, from, 0); + vst1q_lane_u8(to + stride*1, from, 1); + vst1q_lane_u8(to + stride*2, from, 2); + vst1q_lane_u8(to + stride*3, from, 3); + vst1q_lane_u8(to + stride*4, from, 4); + vst1q_lane_u8(to + stride*5, from, 5); + vst1q_lane_u8(to + stride*6, from, 6); + vst1q_lane_u8(to + stride*7, from, 7); + vst1q_lane_u8(to + stride*8, from, 8); + vst1q_lane_u8(to + stride*9, from, 9); + vst1q_lane_u8(to + stride*10, from, 10); + vst1q_lane_u8(to + stride*11, from, 11); + vst1q_lane_u8(to + stride*12, from, 12); + vst1q_lane_u8(to + stride*13, from, 13); + vst1q_lane_u8(to + stride*14, from, 14); + vst1q_lane_u8(to + stride*15, from, 15); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int16_t* to, const Packet4s& from, Index stride) +{ + vst1_lane_s16(to + stride*0, from, 0); + vst1_lane_s16(to + stride*1, from, 1); + vst1_lane_s16(to + stride*2, from, 2); + vst1_lane_s16(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int16_t* to, const Packet8s& from, Index stride) +{ + vst1q_lane_s16(to + stride*0, from, 0); + vst1q_lane_s16(to + stride*1, from, 1); + vst1q_lane_s16(to + stride*2, from, 2); + vst1q_lane_s16(to + stride*3, from, 3); + vst1q_lane_s16(to + stride*4, from, 4); + vst1q_lane_s16(to + stride*5, from, 5); + vst1q_lane_s16(to + stride*6, from, 6); + vst1q_lane_s16(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint16_t* to, const Packet4us& from, Index stride) +{ + vst1_lane_u16(to + stride*0, from, 0); + vst1_lane_u16(to + stride*1, from, 1); + vst1_lane_u16(to + stride*2, from, 2); + vst1_lane_u16(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint16_t* to, const Packet8us& from, Index stride) +{ + vst1q_lane_u16(to + stride*0, from, 0); + vst1q_lane_u16(to + stride*1, from, 1); + vst1q_lane_u16(to + stride*2, from, 2); + vst1q_lane_u16(to + stride*3, from, 3); + vst1q_lane_u16(to + stride*4, from, 4); + vst1q_lane_u16(to + stride*5, from, 5); + vst1q_lane_u16(to + stride*6, from, 6); + vst1q_lane_u16(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int32_t* to, const Packet2i& from, Index stride) +{ + vst1_lane_s32(to + stride*0, from, 0); + vst1_lane_s32(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int32_t* to, const Packet4i& from, Index stride) +{ + vst1q_lane_s32(to + stride*0, from, 0); + vst1q_lane_s32(to + stride*1, from, 1); + vst1q_lane_s32(to + stride*2, from, 2); + vst1q_lane_s32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint32_t* to, const Packet2ui& from, Index stride) +{ + vst1_lane_u32(to + stride*0, from, 0); + vst1_lane_u32(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint32_t* to, const Packet4ui& from, Index stride) +{ + vst1q_lane_u32(to + stride*0, from, 0); + vst1q_lane_u32(to + stride*1, from, 1); + vst1q_lane_u32(to + stride*2, from, 2); + vst1q_lane_u32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int64_t* to, const Packet2l& from, Index stride) +{ + vst1q_lane_s64(to + stride*0, from, 0); + vst1q_lane_s64(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint64_t* to, const Packet2ul& from, Index stride) +{ + vst1q_lane_u64(to + stride*0, from, 0); + vst1q_lane_u64(to + stride*1, from, 1); } -template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); } -// FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet2f& a) { return vget_lane_f32(a,0); } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return vgetq_lane_f32(a,0); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet4c& a) { return static_cast(a & 0xff); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet8c& a) { return vget_lane_s8(a,0); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet16c& a) { return vgetq_lane_s8(a,0); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet4uc& a) { return static_cast(a & 0xff); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet8uc& a) { return vget_lane_u8(a,0); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet16uc& a) { return vgetq_lane_u8(a,0); } +template<> EIGEN_STRONG_INLINE int16_t pfirst(const Packet4s& a) { return vget_lane_s16(a,0); } +template<> EIGEN_STRONG_INLINE int16_t pfirst(const Packet8s& a) { return vgetq_lane_s16(a,0); } +template<> EIGEN_STRONG_INLINE uint16_t pfirst(const Packet4us& a) { return vget_lane_u16(a,0); } +template<> EIGEN_STRONG_INLINE uint16_t pfirst(const Packet8us& a) { return vgetq_lane_u16(a,0); } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet2i& a) { return vget_lane_s32(a,0); } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { return vgetq_lane_s32(a,0); } +template<> EIGEN_STRONG_INLINE uint32_t pfirst(const Packet2ui& a) { return vget_lane_u32(a,0); } +template<> EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { return vgetq_lane_u32(a,0); } +template<> EIGEN_STRONG_INLINE int64_t pfirst(const Packet2l& a) { return vgetq_lane_s64(a,0); } +template<> EIGEN_STRONG_INLINE uint64_t pfirst(const Packet2ul& a) { return vgetq_lane_u64(a,0); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { - float32x2_t a_lo, a_hi; - Packet4f a_r64; - - a_r64 = vrev64q_f32(a); - a_lo = vget_low_f32(a_r64); - a_hi = vget_high_f32(a_r64); - return vcombine_f32(a_hi, a_lo); +template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + const float32x4_t a_r64 = vrev64q_f32(a); + return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64)); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - int32x2_t a_lo, a_hi; - Packet4i a_r64; - - a_r64 = vrev64q_s32(a); - a_lo = vget_low_s32(a_r64); - a_hi = vget_high_s32(a_r64); - return vcombine_s32(a_hi, a_lo); +template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) +{ + const int8x16_t a_r64 = vrev64q_s8(a); + return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64)); } +template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); } +template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) +{ + const uint8x16_t a_r64 = vrev64q_u8(a); + return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) +{ + const int16x8_t a_r64 = vrev64q_s16(a); + return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); } +template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) +{ + const uint16x8_t a_r64 = vrev64q_u16(a); + return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + const int32x4_t a_r64 = vrev64q_s32(a); + return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); } +template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) +{ + const uint32x4_t a_r64 = vrev64q_u32(a); + return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) +{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); } +template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) +{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); } +template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4c pabs(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); } +template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); } +template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) { +#if EIGEN_ARCH_ARM64 + return vabsq_s64(a); +#else + return vcombine_s64( + vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), + vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1)))); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2f pfrexp(const Packet2f& a, Packet2f& exponent) +{ return pfrexp_generic(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) +{ return pfrexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE Packet2f pldexp(const Packet2f& a, const Packet2f& exponent) +{ return pldexp_generic(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) +{ return pldexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE float predux(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { - float32x2_t a_lo, a_hi, sum; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - sum = vpadd_f32(a_lo, a_hi); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); + const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpadd_f32(sum, sum), 0); } - -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet4c& a) { - float32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4f sum1, sum2, sum; - - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_f32(vecs[0], vecs[2]); - vtrn2 = vzipq_f32(vecs[1], vecs[3]); - res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); - - // Do the addition of the resulting vectors - sum1 = vaddq_f32(res1.val[0], res1.val[1]); - sum2 = vaddq_f32(res2.val[0], res2.val[1]); - sum = vaddq_f32(sum1, sum2); - - return sum; + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + int8x8_t sum = vpadd_s8(a_dup, a_dup); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); } - +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet8c& a) +{ + int8x8_t sum = vpadd_s8(a,a); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) +{ + int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a)); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t sum = vpadd_u8(a_dup, a_dup); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet8uc& a) +{ + uint8x8_t sum = vpadd_u8(a,a); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) +{ + uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a)); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux(const Packet4s& a) +{ + const int16x4_t sum = vpadd_s16(a,a); + return vget_lane_s16(vpadd_s16(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux(const Packet8s& a) +{ + int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a)); + sum = vpadd_s16(sum, sum); + sum = vpadd_s16(sum, sum); + return vget_lane_s16(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet4us& a) +{ + const uint16x4_t sum = vpadd_u16(a,a); + return vget_lane_u16(vpadd_u16(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet8us& a) +{ + uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a)); + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + return vget_lane_u16(sum, 0); +} +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); } template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { - int32x2_t a_lo, a_hi, sum; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - sum = vpadd_s32(a_lo, a_hi); - sum = vpadd_s32(sum, sum); - return vget_lane_s32(sum, 0); + const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpadd_s32(sum, sum), 0); } - -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) { - int32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4i sum1, sum2, sum; - - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_s32(vecs[0], vecs[2]); - vtrn2 = vzipq_s32(vecs[1], vecs[3]); - res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); - - // Do the addition of the resulting vectors - sum1 = vaddq_s32(res1.val[0], res1.val[1]); - sum2 = vaddq_s32(res2.val[0], res2.val[1]); - sum = vaddq_s32(sum1, sum2); - - return sum; + const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpadd_u32(sum, sum), 0); } +template<> EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) +{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); } +template<> EIGEN_STRONG_INLINE uint64_t predux(const Packet2ul& a) +{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) +{ + return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, + vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) +{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) +{ + return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, + vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) +{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) +{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) +{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); } // Other reduction functions: // mul +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet2f& a) +{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); } template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet4c& a) { - float32x2_t a_lo, a_hi, prod; - - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_f32(a_lo, a_hi); - // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_f32(prod, vrev64_f32(prod)); - - return vget_lane_f32(prod, 0); + int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a)); + prod = vmul_s8(prod, vrev16_s8(prod)); + return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2); } +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet8c& a) +{ + int8x8_t prod = vmul_s8(a, vrev16_s8(a)); + prod = vmul_s8(prod, vrev32_s8(prod)); + return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4); +} +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) +{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet4uc& a) +{ + uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a)); + prod = vmul_u8(prod, vrev16_u8(prod)); + return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet8uc& a) +{ + uint8x8_t prod = vmul_u8(a, vrev16_u8(a)); + prod = vmul_u8(prod, vrev32_u8(prod)); + return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) +{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } +template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet4s& a) +{ + const int16x4_t prod = vmul_s16(a, vrev32_s16(a)); + return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2); +} +template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet8s& a) +{ + int16x4_t prod; + + // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8| + prod = vmul_s16(vget_low_s16(a), vget_high_s16(a)); + // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8| + prod = vmul_s16(prod, vrev32_s16(prod)); + // Multiply |a1*a5*a2*a6*a3*a7*a4*a8| + return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet4us& a) +{ + const uint16x4_t prod = vmul_u16(a, vrev32_u16(a)); + return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet8us& a) +{ + uint16x4_t prod; + + // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8| + prod = vmul_u16(vget_low_u16(a), vget_high_u16(a)); + // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8| + prod = vmul_u16(prod, vrev32_u16(prod)); + // Multiply |a1*a5*a2*a6*a3*a7*a4*a8| + return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2); +} +template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet2i& a) +{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); } template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, prod; - - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_s32(a_lo, a_hi); - // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_s32(prod, vrev64_s32(prod)); - - return vget_lane_s32(prod, 0); -} +{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } +template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet2ui& a) +{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); } +template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) +{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } +template<> EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) +{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); } +template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) +{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); } // min +template<> EIGEN_STRONG_INLINE float predux_min(const Packet2f& a) +{ return vget_lane_f32(vpmin_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { - float32x2_t a_lo, a_hi, min; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - min = vpmin_f32(a_lo, a_hi); - min = vpmin_f32(min, min); - - return vget_lane_f32(min, 0); + const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpmin_f32(min, min), 0); } - +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet4c& a) +{ + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + int8x8_t min = vpmin_s8(a_dup, a_dup); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet8c& a) +{ + int8x8_t min = vpmin_s8(a,a); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet16c& a) +{ + int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a)); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t min = vpmin_u8(a_dup, a_dup); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet8uc& a) +{ + uint8x8_t min = vpmin_u8(a,a); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet16uc& a) +{ + uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a)); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet4s& a) +{ + const int16x4_t min = vpmin_s16(a,a); + return vget_lane_s16(vpmin_s16(min, min), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet8s& a) +{ + int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a)); + min = vpmin_s16(min, min); + min = vpmin_s16(min, min); + return vget_lane_s16(min, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet4us& a) +{ + const uint16x4_t min = vpmin_u16(a,a); + return vget_lane_u16(vpmin_u16(min, min), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet8us& a) +{ + uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a)); + min = vpmin_u16(min, min); + min = vpmin_u16(min, min); + return vget_lane_u16(min, 0); +} +template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet2i& a) +{ return vget_lane_s32(vpmin_s32(a,a), 0); } template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { - int32x2_t a_lo, a_hi, min; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - min = vpmin_s32(a_lo, a_hi); - min = vpmin_s32(min, min); - - return vget_lane_s32(min, 0); + const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpmin_s32(min, min), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet2ui& a) +{ return vget_lane_u32(vpmin_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) +{ + const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpmin_u32(min, min), 0); +} +template<> EIGEN_STRONG_INLINE int64_t predux_min(const Packet2l& a) +{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); } +template<> EIGEN_STRONG_INLINE uint64_t predux_min(const Packet2ul& a) +{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } // max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet2f& a) +{ return vget_lane_f32(vpmax_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { - float32x2_t a_lo, a_hi, max; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - max = vpmax_f32(a_lo, a_hi); - max = vpmax_f32(max, max); - - return vget_lane_f32(max, 0); + const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpmax_f32(max, max), 0); } - +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet4c& a) +{ + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + int8x8_t max = vpmax_s8(a_dup, a_dup); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet8c& a) +{ + int8x8_t max = vpmax_s8(a,a); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet16c& a) +{ + int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a)); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t max = vpmax_u8(a_dup, a_dup); + max = vpmax_u8(max, max); + return vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet8uc& a) +{ + uint8x8_t max = vpmax_u8(a,a); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + return vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet16uc& a) +{ + uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a)); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + return vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet4s& a) +{ + const int16x4_t max = vpmax_s16(a,a); + return vget_lane_s16(vpmax_s16(max, max), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet8s& a) +{ + int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a)); + max = vpmax_s16(max, max); + max = vpmax_s16(max, max); + return vget_lane_s16(max, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet4us& a) +{ + const uint16x4_t max = vpmax_u16(a,a); + return vget_lane_u16(vpmax_u16(max, max), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet8us& a) +{ + uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a)); + max = vpmax_u16(max, max); + max = vpmax_u16(max, max); + return vget_lane_u16(max, 0); +} +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet2i& a) +{ return vget_lane_s32(vpmax_s32(a,a), 0); } template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { - int32x2_t a_lo, a_hi, max; + const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpmax_s32(max, max), 0); +} +template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet2ui& a) +{ return vget_lane_u32(vpmax_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) +{ + const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpmax_u32(max, max), 0); +} +template<> EIGEN_STRONG_INLINE int64_t predux_max(const Packet2l& a) +{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); } +template<> EIGEN_STRONG_INLINE uint64_t predux_max(const Packet2ul& a) +{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - max = vpmax_s32(a_lo, a_hi); - max = vpmax_s32(max, max); - - return vget_lane_s32(max, 0); +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)), + vget_high_u32(vreinterpretq_u32_f32(x))); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0); } -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ +// Helpers for ptranspose. +namespace detail { + +template +void zip_in_place(Packet& p1, Packet& p2); -PALIGN_NEON(0,Packet4f,vextq_f32) -PALIGN_NEON(1,Packet4f,vextq_f32) -PALIGN_NEON(2,Packet4f,vextq_f32) -PALIGN_NEON(3,Packet4f,vextq_f32) -PALIGN_NEON(0,Packet4i,vextq_s32) -PALIGN_NEON(1,Packet4i,vextq_s32) -PALIGN_NEON(2,Packet4i,vextq_s32) -PALIGN_NEON(3,Packet4i,vextq_s32) - -#undef PALIGN_NEON - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); - - kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); - kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); - kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); - kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2f& p1, Packet2f& p2) { + const float32x2x2_t tmp = vzip_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); - int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); - kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); - kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); - kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4f& p1, Packet4f& p2) { + const float32x4x2_t tmp = vzipq_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8c& p1, Packet8c& p2) { + const int8x8x2_t tmp = vzip_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16c& p1, Packet16c& p2) { + const int8x16x2_t tmp = vzipq_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8uc& p1, Packet8uc& p2) { + const uint8x8x2_t tmp = vzip_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16uc& p1, Packet16uc& p2) { + const uint8x16x2_t tmp = vzipq_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2i& p1, Packet2i& p2) { + const int32x2x2_t tmp = vzip_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4i& p1, Packet4i& p2) { + const int32x4x2_t tmp = vzipq_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2ui& p1, Packet2ui& p2) { + const uint32x2x2_t tmp = vzip_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4ui& p1, Packet4ui& p2) { + const uint32x4x2_t tmp = vzipq_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4s& p1, Packet4s& p2) { + const int16x4x2_t tmp = vzip_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8s& p1, Packet8s& p2) { + const int16x8x2_t tmp = vzipq_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4us& p1, Packet4us& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8us& p1, Packet8us& p2) { + const uint16x8x2_t tmp = vzipq_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[1]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[4]); + zip_in_place(kernel.packet[1], kernel.packet[5]); + zip_in_place(kernel.packet[2], kernel.packet[6]); + zip_in_place(kernel.packet[3], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[6]); + zip_in_place(kernel.packet[5], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[5]); + zip_in_place(kernel.packet[6], kernel.packet[7]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + EIGEN_UNROLL_LOOP + for (int i=0; i<4; ++i) { + const int m = (1 << i); + EIGEN_UNROLL_LOOP + for (int j=0; j& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1)); + const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); + + const int8x8x2_t zip8 = vzip_s8(a,b); + const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); + + kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); + const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1)); + + const uint8x8x2_t zip8 = vzip_u8(a,b); + const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); + + kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::zip_in_place(kernel.packet[0], kernel.packet[1]); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ +#if EIGEN_ARCH_ARM64 + const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = tmp1; +#else + const int64x1_t tmp[2][2] = { + { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) }, + { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) } + }; + + kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]); + kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]); +#endif +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ +#if EIGEN_ARCH_ARM64 + const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = tmp1; +#else + const uint64x1_t tmp[2][2] = { + { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) }, + { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) } + }; + + kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]); + kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]); +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b) +{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) +{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) +{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) +{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) +{ return vbsl_u8(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b) +{ return vbslq_u8(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) +{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) +{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) +{ return vbsl_u16(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) +{ return vbslq_u16(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) +{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) +{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) +{ return vbsl_u32(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) +{ return vbslq_u32(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) +{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) +{ return vbslq_u64(mask, a, b); } + +// Use armv8 rounding intinsics if available. +#if EIGEN_ARCH_ARMV8 +template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) +{ return vrndn_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) +{ return vrndnq_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) +{ return vrndm_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ return vrndmq_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +{ return vrndp_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ return vrndpq_f32(a); } + +#else + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet2f limit = pset1(static_cast(1<<23)); + const Packet2f abs_a = pabs(a); + Packet2f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If greater, subtract one. + Packet4f mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) +{ + const Packet2f cst_1 = pset1(1.0f); + Packet2f tmp = print(a); + // If greater, subtract one. + Packet2f mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If smaller, add one. + Packet4f mask = pcmp_lt(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +{ + const Packet2f cst_1 = pset1(1.0); + Packet2f tmp = print(a); + // If smaller, add one. + Packet2f mask = pcmp_lt(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +#endif + +/** + * Computes the integer square root + * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result + * and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument + * value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf . + */ +template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) { + uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t res = vdup_n_u8(0); + uint8x8_t add = vdup_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x8_t temp = vorr_u8(res, add); + res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res); + add = vshr_n_u8(add, 1); + } + return vget_lane_u32(vreinterpret_u32_u8(res), 0); +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) { + uint8x8_t res = vdup_n_u8(0); + uint8x8_t add = vdup_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x8_t temp = vorr_u8(res, add); + res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res); + add = vshr_n_u8(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) { + uint8x16_t res = vdupq_n_u8(0); + uint8x16_t add = vdupq_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x16_t temp = vorrq_u8(res, add); + res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res); + add = vshrq_n_u8(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) { + uint16x4_t res = vdup_n_u16(0); + uint16x4_t add = vdup_n_u16(0x80); + for (int i = 0; i < 8; i++) + { + const uint16x4_t temp = vorr_u16(res, add); + res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res); + add = vshr_n_u16(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) { + uint16x8_t res = vdupq_n_u16(0); + uint16x8_t add = vdupq_n_u16(0x80); + for (int i = 0; i < 8; i++) + { + const uint16x8_t temp = vorrq_u16(res, add); + res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res); + add = vshrq_n_u16(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) { + uint32x2_t res = vdup_n_u32(0); + uint32x2_t add = vdup_n_u32(0x8000); + for (int i = 0; i < 16; i++) + { + const uint32x2_t temp = vorr_u32(res, add); + res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res); + add = vshr_n_u32(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { + uint32x4_t res = vdupq_n_u32(0); + uint32x4_t add = vdupq_n_u32(0x8000); + for (int i = 0; i < 16; i++) + { + const uint32x4_t temp = vorrq_u32(res, add); + res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res); + add = vshrq_n_u32(add, 1); + } + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + // Compute approximate reciprocal sqrt. + Packet4f x = vrsqrteq_f32(a); + // Do Newton iterations for 1/sqrt(x). + x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); + x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); + const Packet4f infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) { + // Compute approximate reciprocal sqrt. + Packet2f x = vrsqrte_f32(a); + // Do Newton iterations for 1/sqrt(x). + x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); + x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); + const Packet2f infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +// Unfortunately vsqrt_f32 is only available for A64. +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);} +template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); } +#else +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + const Packet4f infinity = pset1(NumTraits::infinity()); + const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); + return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); +} +template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { + const Packet2f infinity = pset1(NumTraits::infinity()); + const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); + return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); +} +#endif + +//---------- bfloat16 ---------- +// TODO: Add support for native armv8.6-a bfloat16_t + +// TODO: Guard if we have native bfloat16 support +typedef eigen_packet_wrapper Packet4bf; + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4bf type; + typedef Packet4bf half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. + HasNdtri = 0 + }; +}; + +template<> struct unpacket_traits +{ + typedef bfloat16 type; + typedef Packet4bf half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +namespace detail { +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4bf& p1, Packet4bf& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} +} // namespace detail + +EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) +{ + // See the scalar implemention in BFloat16.h for a comprehensible explanation + // of this fast rounding algorithm + Packet4ui input = reinterpret_cast(p); + + // lsb = (input >> 16) & 1 + Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1)); + + // rounding_bias = 0x7fff + lsb + Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff)); + + // input += rounding_bias + input = vaddq_u32(input, rounding_bias); + + // input = input >> 16 + input = vshrq_n_u32(input, 16); + + // Replace float-nans by bfloat16-nans, that is 0x7fc0 + const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0); + const Packet4ui mask = vceqq_f32(p, p); + input = vbslq_u32(mask, input, bf16_nan); + + // output = static_cast(input) + return vmovn_u32(input); +} + +EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) +{ + return reinterpret_cast(vshlq_n_u32(vmovl_u16(p), 16)); +} + +EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { + return vmovn_u32(vreinterpretq_u32_f32(p)); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pset1(const bfloat16& from) { + return pset1(from.value); +} + +template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { + return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(from))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) +{ + return pload(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) +{ + return ploadu(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) +{ + EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet4bf& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet4bf ploaddup(const bfloat16* from) +{ + return ploaddup(reinterpret_cast(from)); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) { + return F32ToBf16(pabs(Bf16ToF32(a))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf plset(const bfloat16& a) +{ + return F32ToBf16(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) { + return por(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) { + return pxor(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) { + return pand(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) { + return pandnot(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, + const Packet4bf& b) +{ + return pselect(mask, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) +{ + return F32ToBf16(print(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pfloor(const Packet4bf& a) +{ + return F32ToBf16(pfloor(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pceil(const Packet4bf& a) +{ + return F32ToBf16(pceil(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4bf padd(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf psub(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(psub(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pmul(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pdiv(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(pdiv(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> +EIGEN_STRONG_INLINE Packet4bf pgather(const bfloat16* from, Index stride) +{ + return pgather(reinterpret_cast(from), stride); +} + +template<> +EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet4bf& from, Index stride) +{ + pscatter(reinterpret_cast(to), from, stride); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet4bf& a) +{ + return static_cast(predux(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet4bf& a) +{ + return static_cast(predux_max(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet4bf& a) +{ + return static_cast(predux_min(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet4bf& a) +{ + return static_cast(predux_mul(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) +{ + return preverse(a); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + detail::ptranspose_impl(kernel); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff(const Packet4bf& a, const Packet4bf& b) +{ + return F32ToBf16(pabsdiff(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) +{ + return pxor(a, pset1(static_cast(0x8000))); } //---------- double ---------- @@ -571,55 +3642,115 @@ ptranspose(PacketBlock& kernel) { // Defining these functions as templates ensures that if these intrinsics are // already defined in arm_neon.h, then our workaround doesn't cause a conflict // and has lower priority in overload resolution. -template -uint64x2_t vreinterpretq_u64_f64(T a) -{ - return (uint64x2_t) a; -} +template uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; } -template -float64x2_t vreinterpretq_f64_u64(T a) -{ - return (float64x2_t) a; -} +template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; } typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; +// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask)) +// Currently used in LU/arch/InverseSize4.h to enable a shared implementation +// for fast inversion of matrices of size 4. +EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) +{ + const double* a = reinterpret_cast(&m); + const double* b = reinterpret_cast(&n); + Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))}; + return res; +} + +EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) +{ + return shuffle(a, b, mask); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b) +{ + return shuffle(a, b, 0); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b) +{ + return shuffle(a, b, 3); +} +#define vec2d_duplane(a, p) \ + vdupq_laneq_f64(a, p) + template<> struct packet_traits : default_packet_traits { typedef Packet2d type; typedef Packet2d half; - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket=0, - - HasDiv = 1, - // FIXME check the Has* + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasSin = 0, HasCos = 0, - HasLog = 0, - HasExp = 0, - HasSqrt = 0 + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = 0, + HasErf = 0 }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits +{ + typedef double type; + typedef Packet2d half; + typedef Packet2l integer_packet; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { - const double countdown_raw[] = {0.0,1.0}; - const Packet2d countdown = vld1q_f64(countdown_raw); - return vaddq_f64(pset1(a), countdown); + const double c[] = {0.0,1.0}; + return vaddq_f64(pset1(a), vld1q_f64(c)); } + template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b){ + const Packet2d mask = {numext::bit_cast(0x8000000000000000ull),0.0}; + return padd(a, pxor(mask, b)); +} + template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } @@ -630,128 +3761,824 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const #ifdef __ARM_FEATURE_FMA // See bug 936. See above comment about FMA for float. -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ return vfmaq_f64(c,a,b); } #else -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ return vmlaq_f64(c,a,b); } #endif template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); } + +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return pmin(a, b); } + template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return pmax(a, b); } + // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); -} +{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); -} +{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); -} +{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); -} +{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } -template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) -{ - return vld1q_dup_f64(from); -} -template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); } -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); } -template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { return vld1q_dup_f64(from); } +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather(const double* from, Index stride) { Packet2d res = pset1(0.0); - res = vsetq_lane_f64(from[0*stride], res, 0); - res = vsetq_lane_f64(from[1*stride], res, 1); + res = vld1q_lane_f64(from + 0*stride, res, 0); + res = vld1q_lane_f64(from + 1*stride, res, 1); return res; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(double* to, const Packet2d& from, Index stride) { - to[stride*0] = vgetq_lane_f64(from, 0); - to[stride*1] = vgetq_lane_f64(from, 1); + vst1q_lane_f64(to + stride*0, from, 0); + vst1q_lane_f64(to + stride*1, from, 1); } + template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a, 0); } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a,0); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } #if EIGEN_COMP_CLANG && defined(__apple_build_version__) // workaround ICE, see bug 907 -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ return (vget_low_f64(a) + vget_high_f64(a))[0]; } #else -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } #endif -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - float64x2_t trn1, trn2; - - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - trn1 = vzip1q_f64(vecs[0], vecs[1]); - trn2 = vzip2q_f64(vecs[0], vecs[1]); - - // Do the addition of the resulting vectors - return vaddq_f64(trn1, trn2); -} // Other reduction functions: // mul #if EIGEN_COMP_CLANG && defined(__apple_build_version__) -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ return (vget_low_f64(a) * vget_high_f64(a))[0]; } #else -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } #endif // min -template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) +{ return vgetq_lane_f64(vpminq_f64(a,a), 0); } // max -template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); } +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); } -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ -PALIGN_NEON(0,Packet2d,vextq_f64) -PALIGN_NEON(1,Packet2d,vextq_f64) -#undef PALIGN_NEON +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ + const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); + const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); - float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); - - kernel.packet[0] = trn1; - kernel.packet[1] = trn2; + kernel.packet[0] = tmp1; + kernel.packet[1] = tmp2; } -#endif // EIGEN_ARCH_ARM64 + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b) +{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) +{ return vrndnq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) +{ return vrndmq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) +{ return vrndpq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) +{ return pldexp_generic(a, exponent); } + +template<> EIGEN_STRONG_INLINE Packet2d pfrexp(const Packet2d& a, Packet2d& exponent) +{ return pfrexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) +{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); } + +template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { + // Compute approximate reciprocal sqrt. + Packet2d x = vrsqrteq_f64(a); + // Do Newton iterations for 1/sqrt(x). + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + const Packet2d infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); } + +#endif // EIGEN_ARCH_ARM64 + +// Do we have an fp16 types and supporting Neon intrinsics? +#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC +typedef float16x4_t Packet4hf; +typedef float16x8_t Packet8hf; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8hf type; + typedef Packet4hf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasCmp = 1, + HasCast = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasInsert = 1, + HasReduxp = 1, + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasRsqrt = 1, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. + HasNdtri = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet4hf half; + enum { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet4hf half; + enum { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4(const Packet8hf& a) { + return vadd_f16(vget_low_f16(a), vget_high_f16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pset1(const Eigen::half& from) { + return vdupq_n_f16(from.x); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pset1(const Eigen::half& from) { + return vdup_n_f16(from.x); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf plset(const Eigen::half& a) { + const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7}; + Packet8hf countdown = vld1q_f16(f); + return vaddq_f16(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf plset(const Eigen::half& a) { + const float16_t f[] = {0, 1, 2, 3}; + Packet4hf countdown = vld1_f16(f); + return vadd_f16(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf padd(const Packet8hf& a, const Packet8hf& b) { + return vaddq_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf padd(const Packet4hf& a, const Packet4hf& b) { + return vadd_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf psub(const Packet8hf& a, const Packet8hf& b) { + return vsubq_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf psub(const Packet4hf& a, const Packet4hf& b) { + return vsub_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) { + return vnegq_f16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) { + return vneg_f16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pmul(const Packet8hf& a, const Packet8hf& b) { + return vmulq_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pmul(const Packet4hf& a, const Packet4hf& b) { + return vmul_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pdiv(const Packet8hf& a, const Packet8hf& b) { + return vdivq_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pdiv(const Packet4hf& a, const Packet4hf& b) { + return vdiv_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) { + return vfmaq_f16(c, a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) { + return vfma_f16(c, a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { + return vminq_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { + return vmin_f16(a, b); +} + +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). +template<> EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); } +template<> EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { return pmin(a, b); } + +template<> EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { return pmin(a, b); } + +template <> +EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { + return vmaxq_f16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { + return vmax_f16(a, b); +} + +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). +template<> EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); } +template<> EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { return pmax(a, b); } + +template<> EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { return pmax(a, b); } + +#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \ + template <> \ + EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \ + return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \ + } + +#define EIGEN_MAKE_ARM_FP16_CMP_4(name) \ + template <> \ + EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \ + return vreinterpret_f16_u16(vc##name##_f16(a, b)); \ + } + +EIGEN_MAKE_ARM_FP16_CMP_8(eq) +EIGEN_MAKE_ARM_FP16_CMP_8(lt) +EIGEN_MAKE_ARM_FP16_CMP_8(le) + +EIGEN_MAKE_ARM_FP16_CMP_4(eq) +EIGEN_MAKE_ARM_FP16_CMP_4(lt) +EIGEN_MAKE_ARM_FP16_CMP_4(le) + +#undef EIGEN_MAKE_ARM_FP16_CMP_8 +#undef EIGEN_MAKE_ARM_FP16_CMP_4 + +template <> +EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b))); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf print(const Packet8hf& a) +{ return vrndnq_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet4hf print(const Packet4hf& a) +{ return vrndn_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet8hf pfloor(const Packet8hf& a) +{ return vrndmq_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet4hf pfloor(const Packet4hf& a) +{ return vrndm_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet8hf pceil(const Packet8hf& a) +{ return vrndpq_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet4hf pceil(const Packet4hf& a) +{ return vrndp_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet8hf psqrt(const Packet8hf& a) { + return vsqrtq_f16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf psqrt(const Packet4hf& a) { + return vsqrt_f16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pand(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pand(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf por(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf por(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pxor(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pxor(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pandnot(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pandnot(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf ploaddup(const Eigen::half* from) { + Packet8hf packet; + packet[0] = from[0].x; + packet[1] = from[0].x; + packet[2] = from[1].x; + packet[3] = from[1].x; + packet[4] = from[2].x; + packet[5] = from[2].x; + packet[6] = from[3].x; + packet[7] = from[3].x; + return packet; +} + +template <> +EIGEN_STRONG_INLINE Packet4hf ploaddup(const Eigen::half* from) { + float16x4_t packet; + float16_t* tmp; + tmp = (float16_t*)&packet; + tmp[0] = from[0].x; + tmp[1] = from[0].x; + tmp[2] = from[1].x; + tmp[3] = from[1].x; + return packet; +} + +template <> +EIGEN_STRONG_INLINE Packet8hf ploadquad(const Eigen::half* from) { + Packet4hf lo, hi; + lo = vld1_dup_f16(reinterpret_cast(from)); + hi = vld1_dup_f16(reinterpret_cast(from+1)); + return vcombine_f16(lo, hi); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); } + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); } + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) { + return vbslq_f16(vreinterpretq_u16_f16(mask), a, b); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) { + return vbsl_f16(vreinterpret_u16_f16(mask), a, b); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); } + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); } + +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8hf& from) { + EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); +} + +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4hf& from) { + EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast(to), from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8hf& from) { + EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4hf& from) { + EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast(to), from); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather(const Eigen::half* from, Index stride) { + Packet8hf res = pset1(Eigen::half(0.f)); + res = vsetq_lane_f16(from[0 * stride].x, res, 0); + res = vsetq_lane_f16(from[1 * stride].x, res, 1); + res = vsetq_lane_f16(from[2 * stride].x, res, 2); + res = vsetq_lane_f16(from[3 * stride].x, res, 3); + res = vsetq_lane_f16(from[4 * stride].x, res, 4); + res = vsetq_lane_f16(from[5 * stride].x, res, 5); + res = vsetq_lane_f16(from[6 * stride].x, res, 6); + res = vsetq_lane_f16(from[7 * stride].x, res, 7); + return res; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather(const Eigen::half* from, Index stride) { + Packet4hf res = pset1(Eigen::half(0.f)); + res = vset_lane_f16(from[0 * stride].x, res, 0); + res = vset_lane_f16(from[1 * stride].x, res, 1); + res = vset_lane_f16(from[2 * stride].x, res, 2); + res = vset_lane_f16(from[3 * stride].x, res, 3); + return res; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8hf& from, Index stride) { + to[stride * 0].x = vgetq_lane_f16(from, 0); + to[stride * 1].x = vgetq_lane_f16(from, 1); + to[stride * 2].x = vgetq_lane_f16(from, 2); + to[stride * 3].x = vgetq_lane_f16(from, 3); + to[stride * 4].x = vgetq_lane_f16(from, 4); + to[stride * 5].x = vgetq_lane_f16(from, 5); + to[stride * 6].x = vgetq_lane_f16(from, 6); + to[stride * 7].x = vgetq_lane_f16(from, 7); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4hf& from, Index stride) { + to[stride * 0].x = vget_lane_f16(from, 0); + to[stride * 1].x = vget_lane_f16(from, 1); + to[stride * 2].x = vget_lane_f16(from, 2); + to[stride * 3].x = vget_lane_f16(from, 3); +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const Eigen::half* addr) { + EIGEN_ARM_PREFETCH(addr); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8hf& a) { + float16_t x[8]; + vst1q_f16(x, a); + Eigen::half h; + h.x = x[0]; + return h; +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4hf& a) { + float16_t x[4]; + vst1_f16(x, a); + Eigen::half h; + h.x = x[0]; + return h; +} + +template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) { + float16x4_t a_lo, a_hi; + Packet8hf a_r64; + + a_r64 = vrev64q_f16(a); + a_lo = vget_low_f16(a_r64); + a_hi = vget_high_f16(a_r64); + return vcombine_f16(a_hi, a_lo); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf preverse(const Packet4hf& a) { + return vrev64_f16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet8hf pabs(const Packet8hf& a) { + return vabsq_f16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pabs(const Packet4hf& a) { + return vabs_f16(a); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const Packet8hf& a) { + float16x4_t a_lo, a_hi, sum; + + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + sum = vpadd_f16(a_lo, a_hi); + sum = vpadd_f16(sum, sum); + sum = vpadd_f16(sum, sum); + + Eigen::half h; + h.x = vget_lane_f16(sum, 0); + return h; +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const Packet4hf& a) { + float16x4_t sum; + + sum = vpadd_f16(a, a); + sum = vpadd_f16(sum, sum); + Eigen::half h; + h.x = vget_lane_f16(sum, 0); + return h; +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8hf& a) { + float16x4_t a_lo, a_hi, prod; + + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + prod = vmul_f16(a_lo, a_hi); + prod = vmul_f16(prod, vrev64_f16(prod)); + + Eigen::half h; + h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1)); + return h; +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet4hf& a) { + float16x4_t prod; + prod = vmul_f16(a, vrev64_f16(a)); + Eigen::half h; + h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1)); + return h; +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8hf& a) { + float16x4_t a_lo, a_hi, min; + + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + min = vpmin_f16(a_lo, a_hi); + min = vpmin_f16(min, min); + min = vpmin_f16(min, min); + + Eigen::half h; + h.x = vget_lane_f16(min, 0); + return h; +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet4hf& a) { + Packet4hf tmp; + tmp = vpmin_f16(a, a); + tmp = vpmin_f16(tmp, tmp); + Eigen::half h; + h.x = vget_lane_f16(tmp, 0); + return h; +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8hf& a) { + float16x4_t a_lo, a_hi, max; + + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + max = vpmax_f16(a_lo, a_hi); + max = vpmax_f16(max, max); + max = vpmax_f16(max, max); + + Eigen::half h; + h.x = vget_lane_f16(max, 0); + return h; +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet4hf& a) { + Packet4hf tmp; + tmp = vpmax_f16(a, a); + tmp = vpmax_f16(tmp, tmp); + Eigen::half h; + h.x = vget_lane_f16(tmp, 0); + return h; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]); + const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]); + + const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0])); + const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1])); + + kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]); + kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]); + kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]); + kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN16 float16x4x4_t tmp_x4; + float16_t* tmp = (float16_t*)&kernel; + tmp_x4 = vld4_f16(tmp); + + kernel.packet[0] = tmp_x4.val[0]; + kernel.packet[1] = tmp_x4.val[1]; + kernel.packet[2] = tmp_x4.val[2]; + kernel.packet[3] = tmp_x4.val[3]; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + float16x8x2_t T_1[4]; + + T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]); + T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]); + T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]); + T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]); + + float16x8x2_t T_2[4]; + T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]); + T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]); + T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]); + T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]); + + float16x8x2_t T_3[4]; + T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]); + T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]); + T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]); + T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]); + + kernel.packet[0] = T_3[0].val[0]; + kernel.packet[1] = T_3[2].val[0]; + kernel.packet[2] = T_3[1].val[0]; + kernel.packet[3] = T_3[3].val[0]; + kernel.packet[4] = T_3[0].val[1]; + kernel.packet[5] = T_3[2].val[1]; + kernel.packet[6] = T_3[1].val[1]; + kernel.packet[7] = T_3[3].val[1]; +} +#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC } // end namespace internal diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/TypeCasting.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/TypeCasting.h new file mode 100644 index 0000000000..54f97336e0 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -0,0 +1,1419 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Rasmus Munk Larsen +// Copyright (C) 2020 Antonio Sanchez +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_NEON_H +#define EIGEN_TYPE_CASTING_NEON_H + +namespace Eigen { + +namespace internal { + +//============================================================================== +// pcast, SrcType = float +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4f& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2f& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +// If float64 exists, first convert to that to keep as much precision as possible. +#if EIGEN_ARCH_ARM64 +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4f& a) { + // Discard second half of input. + return vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4f& a) { + // Discard second half of input. + return vcvtq_u64_f64(vcvt_f64_f32(vget_low_f32(a))); +} +#else +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4f& a) { + // Discard second half of input. + return vmovl_s32(vget_low_s32(vcvtq_s32_f32(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4f& a) { + // Discard second half of input. + return vmovl_u32(vget_low_u32(vcvtq_u32_f32(a))); +} +#endif // EIGEN_ARCH_ARM64 + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vcvtq_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2f& a) { + return vcvt_s32_f32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { + return vcvtq_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2f& a) { + return vcvt_u32_f32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4f& a, const Packet4f& b) { + return vcombine_s16(vmovn_s32(vcvtq_s32_f32(a)), vmovn_s32(vcvtq_s32_f32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2f& a, const Packet2f& b) { + return vmovn_s32(vcombine_s32(vcvt_s32_f32(a), vcvt_s32_f32(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4f& a, const Packet4f& b) { + return vcombine_u16(vmovn_u32(vcvtq_u32_f32(a)), vmovn_u32(vcvtq_u32_f32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2f& a, const Packet2f& b) { + return vmovn_u32(vcombine_u32(vcvt_u32_f32(a), vcvt_u32_f32(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4f& a, const Packet4f& b, const Packet4f& c, + const Packet4f& d) { + const int16x8_t ab_s16 = pcast(a, b); + const int16x8_t cd_s16 = pcast(c, d); + return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2f& a, const Packet2f& b, const Packet2f& c, + const Packet2f& d) { + const int16x4_t ab_s16 = pcast(a, b); + const int16x4_t cd_s16 = pcast(c, d); + return vmovn_s16(vcombine_s16(ab_s16, cd_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4f& a, const Packet4f& b, const Packet4f& c, + const Packet4f& d) { + const uint16x8_t ab_u16 = pcast(a, b); + const uint16x8_t cd_u16 = pcast(c, d); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2f& a, const Packet2f& b, const Packet2f& c, + const Packet2f& d) { + const uint16x4_t ab_u16 = pcast(a, b); + const uint16x4_t cd_u16 = pcast(c, d); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +//============================================================================== +// pcast, SrcType = int8_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet16c& a) { + // Discard all but first 4 bytes. + return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet8c& a) { + // Discard all but first 2 bytes. + return vcvt_f32_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet16c& a) { + // Discard all but first two bytes. + return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet16c& a) { + // Discard all but first 4 bytes. + return vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet8c& a) { + // Discard all but first 2 bytes. + return vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16c& a) { + return vreinterpretq_u32_s32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8c& a) { + return vreinterpret_u32_s32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet16c& a) { + // Discard second half of input. + return vmovl_s8(vget_low_s8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet8c& a) { + // Discard second half of input. + return vget_low_s16(vmovl_s8(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { + return vreinterpretq_u16_s16(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet8c& a) { + return vreinterpret_u16_s16(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet16c& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet8c& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4c pcast(const Packet4c& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) { + return vreinterpretq_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { + return vreinterpret_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) { + return static_cast(a); +} + +//============================================================================== +// pcast, SrcType = uint8_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet16uc& a) { + // Discard all but first 4 bytes. + return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet8uc& a) { + // Discard all but first 2 bytes. + return vcvt_f32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16uc& a) { + // Discard all but first two bytes. + return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16uc& a) { + // Discard all but first 4 bytes. + return vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8uc& a) { + // Discard all but first 2 bytes. + return vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { + return vreinterpretq_s32_u32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet8uc& a) { + return vreinterpret_s32_u32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet16uc& a) { + // Discard second half of input. + return vmovl_u8(vget_low_u8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet8uc& a) { + // Discard second half of input. + return vget_low_u16(vmovl_u8(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { + return vreinterpretq_s16_u16(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet8uc& a) { + return vreinterpret_s16_u16(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16uc& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8uc& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4uc& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) { + return vreinterpretq_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { + return vreinterpret_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) { + return static_cast(a); +} + +//============================================================================== +// pcast, SrcType = int16_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8s& a) { + // Discard second half of input. + return vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet4s& a) { + // Discard second half of input. + return vcvt_f32_s32(vget_low_s32(vmovl_s16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet8s& a) { + // Discard all but first two values. + return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet8s& a) { + // Discard second half of input. + return vmovl_s16(vget_low_s16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet4s& a) { + // Discard second half of input. + return vget_low_s32(vmovl_s16(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { + return vreinterpretq_u32_s32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4s& a) { + return vreinterpret_u32_s32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet8s& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet4s& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) { + return vreinterpretq_u16_s16(a); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) { + return vreinterpret_u16_s16(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet8s& a, const Packet8s& b) { + return vcombine_s8(vmovn_s16(a), vmovn_s16(b)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet4s& a, const Packet4s& b) { + return vmovn_s16(vcombine_s16(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet8s& a, const Packet8s& b) { + return vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(a)), vmovn_u16(vreinterpretq_u16_s16(b))); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet4s& a, const Packet4s& b) { + return vmovn_u16(vcombine_u16(vreinterpret_u16_s16(a), vreinterpret_u16_s16(b))); +} + +//============================================================================== +// pcast, SrcType = uint16_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8us& a) { + // Discard second half of input. + return vcvtq_f32_u32(vmovl_u16(vget_low_u16(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet4us& a) { + // Discard second half of input. + return vcvt_f32_u32(vget_low_u32(vmovl_u16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8us& a) { + // Discard all but first two values. + return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8us& a) { + // Discard second half of input. + return vmovl_u16(vget_low_u16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4us& a) { + // Discard second half of input. + return vget_low_u32(vmovl_u16(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { + return vreinterpretq_s32_u32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet4us& a) { + return vreinterpret_s32_u32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet8us& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet4us& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) { + return vreinterpretq_s16_u16(a); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) { + return vreinterpret_s16_u16(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet8us& a, const Packet8us& b) { + return vcombine_u8(vmovn_u16(a), vmovn_u16(b)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet4us& a, const Packet4us& b) { + return vmovn_u16(vcombine_u16(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { + return vreinterpretq_s8_u8(pcast(a, b)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet4us& a, const Packet4us& b) { + return vreinterpret_s8_u8(pcast(a, b)); +} + +//============================================================================== +// pcast, SrcType = int32_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vcvtq_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2i& a) { + return vcvt_f32_s32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4i& a) { + // Discard second half of input. + return vmovl_s32(vget_low_s32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4i& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2i& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) { + return vreinterpretq_u32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) { + return vreinterpret_u32_s32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4i& a, const Packet4i& b) { + return vcombine_s16(vmovn_s32(a), vmovn_s32(b)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2i& a, const Packet2i& b) { + return vmovn_s32(vcombine_s32(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4i& a, const Packet4i& b) { + return vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(a)), vmovn_u32(vreinterpretq_u32_s32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2i& a, const Packet2i& b) { + return vmovn_u32(vreinterpretq_u32_s32(vcombine_s32(a, b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4i& a, const Packet4i& b, const Packet4i& c, + const Packet4i& d) { + const int16x8_t ab_s16 = pcast(a, b); + const int16x8_t cd_s16 = pcast(c, d); + return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2i& a, const Packet2i& b, const Packet2i& c, + const Packet2i& d) { + const int16x4_t ab_s16 = vmovn_s32(vcombine_s32(a, b)); + const int16x4_t cd_s16 = vmovn_s32(vcombine_s32(c, d)); + return vmovn_s16(vcombine_s16(ab_s16, cd_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4i& a, const Packet4i& b, const Packet4i& c, + const Packet4i& d) { + const uint16x8_t ab_u16 = pcast(a, b); + const uint16x8_t cd_u16 = pcast(c, d); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2i& a, const Packet2i& b, const Packet2i& c, + const Packet2i& d) { + const uint16x4_t ab_u16 = pcast(a, b); + const uint16x4_t cd_u16 = pcast(c, d); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +//============================================================================== +// pcast, SrcType = uint32_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { + return vcvtq_f32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2ui& a) { + return vcvt_f32_u32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4ui& a) { + // Discard second half of input. + return vmovl_u32(vget_low_u32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4ui& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2ui& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) { + return vreinterpretq_s32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) { + return vreinterpret_s32_u32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4ui& a, const Packet4ui& b) { + return vcombine_u16(vmovn_u32(a), vmovn_u32(b)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2ui& a, const Packet2ui& b) { + return vmovn_u32(vcombine_u32(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { + return vreinterpretq_s16_u16(pcast(a, b)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2ui& a, const Packet2ui& b) { + return vreinterpret_s16_u16(pcast(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, + const Packet4ui& d) { + const uint16x8_t ab_u16 = vcombine_u16(vmovn_u32(a), vmovn_u32(b)); + const uint16x8_t cd_u16 = vcombine_u16(vmovn_u32(c), vmovn_u32(d)); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, + const Packet2ui& d) { + const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(a, b)); + const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(c, d)); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, + const Packet4ui& d) { + return vreinterpretq_s8_u8(pcast(a, b, c, d)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, + const Packet2ui& d) { + return vreinterpret_s8_u8(pcast(a, b, c, d)); +} + +//============================================================================== +// pcast, SrcType = int64_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2l& a, const Packet2l& b) { + return vcvtq_f32_s32(vcombine_s32(vmovn_s64(a), vmovn_s64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2l& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) { + return vreinterpretq_u64_s64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2l& a, const Packet2l& b) { + return vcombine_s32(vmovn_s64(a), vmovn_s64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2l& a, const Packet2l& b) { + return vcombine_u32(vmovn_u64(vreinterpretq_u64_s64(a)), vmovn_u64(vreinterpretq_u64_s64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d) { + const int32x4_t ab_s32 = pcast(a, b); + const int32x4_t cd_s32 = pcast(c, d); + return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d) { + const uint32x4_t ab_u32 = pcast(a, b); + const uint32x4_t cd_u32 = pcast(c, d); + return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d, const Packet2l& e, const Packet2l& f, + const Packet2l& g, const Packet2l& h) { + const int16x8_t abcd_s16 = pcast(a, b, c, d); + const int16x8_t efgh_s16 = pcast(e, f, g, h); + return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d, const Packet2l& e, const Packet2l& f, + const Packet2l& g, const Packet2l& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +//============================================================================== +// pcast, SrcType = uint64_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2ul& a, const Packet2ul& b) { + return vcvtq_f32_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2ul& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) { + return vreinterpretq_s64_u64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u32(vmovn_u64(a), vmovn_u64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { + return vreinterpretq_s32_u32(pcast(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d) { + const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b))); + const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(vmovn_u64(c), vmovn_u64(d))); + return vcombine_u16(ab_u16, cd_u16); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d) { + return vreinterpretq_s16_u16(pcast(a, b, c, d)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, + const Packet2ul& g, const Packet2ul& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, + const Packet2ul& g, const Packet2ul& h) { + return vreinterpretq_s8_u8(pcast(a, b, c, d, e, f, g, h)); +} + +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { + return vreinterpret_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { + return vreinterpret_f32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return vreinterpretq_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return vreinterpretq_f32_u32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { + return vreinterpret_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return vreinterpretq_s8_u8(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { + return vreinterpret_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return vreinterpretq_u8_s8(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { + return vreinterpret_s16_u16(a); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return vreinterpretq_s16_u16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { + return vreinterpret_u16_s16(a); +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return vreinterpretq_u16_s16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { + return vreinterpret_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { + return vreinterpret_s32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return vreinterpretq_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return vreinterpretq_s32_u32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { + return vreinterpret_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { + return vreinterpret_u32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return vreinterpretq_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return vreinterpretq_u32_s32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { + return vreinterpretq_s64_u64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return vreinterpretq_u64_s64(a); +} + +#if EIGEN_ARCH_ARM64 + +//============================================================================== +// pcast/preinterpret, Double +//============================================================================== + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2d& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_f32(vcvt_f32_f64(a), vcvt_f32_f64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2d& a) { + return vcvtq_s64_f64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2d& a) { + return vcvtq_u64_f64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_s32(vmovn_s64(vcvtq_s64_f64(a)), vmovn_s64(vcvtq_s64_f64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_u32(vmovn_u64(vcvtq_u64_f64(a)), vmovn_u64(vcvtq_u64_f64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d) { + const int32x4_t ab_s32 = pcast(a, b); + const int32x4_t cd_s32 = pcast(c, d); + return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d) { + const uint32x4_t ab_u32 = pcast(a, b); + const uint32x4_t cd_u32 = pcast(c, d); + return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d, const Packet2d& e, const Packet2d& f, + const Packet2d& g, const Packet2d& h) { + const int16x8_t abcd_s16 = pcast(a, b, c, d); + const int16x8_t efgh_s16 = pcast(e, f, g, h); + return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d, const Packet2d& e, const Packet2d& f, + const Packet2d& g, const Packet2d& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { + // Discard second-half of input. + return vcvt_f64_f32(vget_low_f32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_s8(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_u8(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_s16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_u16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4i& a) { + // Discard second half of input. + return vcvtq_f64_s64(vmovl_s32(vget_low_s32(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4ui& a) { + // Discard second half of input. + return vcvtq_f64_u64(vmovl_u32(vget_low_u32(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2l& a) { + return vcvtq_f64_s64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { + return vcvtq_f64_u64(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return vreinterpretq_f64_s64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return vreinterpretq_f64_u64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return vreinterpretq_s64_f64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { + return vreinterpretq_u64_f64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return vreinterpretq_f64_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return vreinterpretq_s32_f64(a); +} + +#endif // EIGEN_ARCH_ARM64 + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_NEON_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/Complex.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/Complex.h index d075043ce1..8fe22da46c 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/Complex.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/Complex.h @@ -19,7 +19,7 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} - __m128 v; + Packet4f v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -40,20 +40,33 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, HasMax = 0, HasSetLinear = 0, - HasBlend = 1 + HasBlend = 1 }; }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet4f as_real; + enum { + size=2, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); @@ -82,10 +95,11 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(&numext::real_ref(*from))); } @@ -93,19 +107,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { Packet2cf res; -#if EIGEN_GNUC_AT_MOST(4,2) - // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2 - res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast(&from)); -#elif EIGEN_GNUC_AT_LEAST(4,6) - // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6 - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wuninitialized" - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); - #pragma GCC diagnostic pop +#ifdef EIGEN_VECTORIZE_SSE3 + res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast(&from))); #else - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); + res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&from))); + res.v = _mm_movelh_ps(res.v, res.v); #endif - return Packet2cf(_mm_movelh_ps(res.v,res.v)); + return res; } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } @@ -152,105 +160,34 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = _mm_movehl_ps(first.v, first.v); - first.v = _mm_movelh_ps(first.v, second.v); - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) - -template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) -{ - // TODO optimize it for SSE3 and 4 - Packet2cf res = conj_helper().pmul(a,b); - __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); -} - EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) { return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); } +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for SSE3 and 4 + Packet2cf res = pmul(a, pconj(b)); + __m128 s = _mm_mul_ps(b.v,b.v); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2)))); +} + + //---------- double ---------- struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} - __m128d v; + Packet2d v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -271,6 +208,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -280,7 +218,18 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet2d as_real; + enum { + size=1, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } @@ -305,10 +254,11 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); } // FIXME force unaligned load, this is a temporary fix template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) @@ -340,86 +290,17 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack return pfirst(a); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) -{ - return vecs[0]; -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)), mask))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); __m128d s = _mm_mul_pd(b.v,b.v); return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } @@ -439,33 +320,32 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + __m128 eq = _mm_cmpeq_ps(a.v, b.v); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) +{ + __m128d eq = _mm_cmpeq_pd(a.v, b.v); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); +} + template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); return Packet2cf(_mm_castpd_ps(result)); } -template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex b) -{ - return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast(&b))); +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) { + return psqrt_complex(a); } -template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex b) -{ - return pset1(b); -} - -template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex b) -{ - return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast(&b))); -} - -template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex b) -{ - return pset1(b); +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { + return psqrt_complex(a); } } // end namespace internal - } // end namespace Eigen #endif // EIGEN_COMPLEX_SSE_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/MathFunctions.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/MathFunctions.h index 7b5f948e11..8736d0d6b5 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -8,7 +8,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from +/* The sin and cos and functions of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -20,426 +20,57 @@ namespace Eigen { namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +Packet4f plog(const Packet4f& _x) { + return plog_float(_x); +} - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d plog(const Packet2d& _x) { + return plog_double(_x); +} - /* the smallest non denormalized float number */ - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog2(const Packet4f& _x) { + return plog2_float(_x); +} - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d plog2(const Packet2d& _x) { + return plog2_double(_x); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog1p(const Packet4f& _x) { + return generic_plog1p(_x); +} - Packet4i emm0; - - Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN - Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); - - /* keep only the fractional part */ - x = _mm_and_ps(x, p4f_inv_mant_mask); - x = _mm_or_ps(x, p4f_half); - - emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), - _mm_and_ps(iszero_mask, p4f_minus_inf)); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexpm1(const Packet4f& _x) { + return generic_expm1(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - Packet4f tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); - -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_ps(fx); -#else - emm0 = _mm_cvttps_epi32(fx); - tmp = _mm_cvtepi32_ps(emm0); - /* if greater, substract 1 */ - Packet4f mask = _mm_cmpgt_ps(tmp, fx); - mask = _mm_and_ps(mask, p4f_1); - fx = psub(tmp, mask); -#endif - - tmp = pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); - - // build 2^n - emm0 = _mm_cvttps_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_0x7f); - emm0 = _mm_slli_epi32(emm0, 23); - return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); + return pexp_float(_x); } + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d pexp(const Packet2d& _x) +Packet2d pexp(const Packet2d& x) { - Packet2d x = _x; - - _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); - _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); - _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); - - Packet2d tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); - -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_pd(fx); -#else - emm0 = _mm_cvttpd_epi32(fx); - tmp = _mm_cvtepi32_pd(emm0); - /* if greater, substract 1 */ - Packet2d mask = _mm_cmpgt_pd(tmp, fx); - mask = _mm_and_pd(mask, p2d_1); - fx = psub(tmp, mask); -#endif - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // build 2^n - emm0 = _mm_cvttpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023_0); - emm0 = _mm_slli_epi32(emm0, 20); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); + return pexp_double(x); } -/* evaluation of 4 sines at onces, using SSE2 intrinsics. - - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. -*/ - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, sign_bit, y; - - Packet4i emm0, emm2; - sign_bit = x; - /* take the absolute value */ - x = pabs(x); - - /* take the modulo */ - - /* extract the sign bit (upper one) */ - sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* store the integer part of y in mm0 */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - /* get the swap sign flag */ - emm0 = _mm_and_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, y; - Packet4i emm0, emm2; - - x = pabs(x); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* get the integer part of y */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - - emm2 = _mm_sub_epi32(emm2, p4i_2); - - /* get the swap sign flag */ - emm0 = _mm_andnot_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask */ - emm2 = _mm_and_si128(emm2, p4i_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - - Packet4f sign_bit = _mm_castsi128_ps(emm0); - Packet4f poly_mask = _mm_castsi128_ps(emm2); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = pmul(y, p4f_minus_cephes_DP1); - xmm2 = pmul(y, p4f_minus_cephes_DP2); - xmm3 = pmul(y, p4f_minus_cephes_DP3); - x = padd(x, xmm1); - x = padd(x, xmm2); - x = padd(x, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = p4f_coscof_p0; - Packet4f z = pmul(x,x); - - y = pmadd(y,z,p4f_coscof_p1); - y = pmadd(y,z,p4f_coscof_p2); - y = pmul(y, z); - y = pmul(y, z); - Packet4f tmp = _mm_mul_ps(z, p4f_half); - y = psub(y, tmp); - y = padd(y, p4f_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - Packet4f y2 = p4f_sincof_p0; - y2 = pmadd(y2, z, p4f_sincof_p1); - y2 = pmadd(y2, z, p4f_sincof_p2); - y2 = pmul(y2, z); - y2 = pmadd(y2, x, x); - - /* select the correct result from the two polynoms */ - y2 = _mm_and_ps(poly_mask, y2); - y = _mm_andnot_ps(poly_mask, y); - y = _mm_or_ps(y,y2); - - /* update the sign */ - return _mm_xor_ps(y, sign_bit); + return pcos_float(_x); } #if EIGEN_FAST_MATH @@ -455,17 +86,17 @@ Packet4f pcos(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& _x) { - Packet4f half = pmul(_x, pset1(.5f)); - Packet4f denormal_mask = _mm_and_ps( - _mm_cmpge_ps(_x, _mm_setzero_ps()), - _mm_cmplt_ps(_x, pset1((std::numeric_limits::min)()))); + Packet4f minus_half_x = pmul(_x, pset1(-0.5f)); + Packet4f denormal_mask = pandnot( + pcmp_lt(_x, pset1((std::numeric_limits::min)())), + pcmp_lt(_x, pzero(_x))); // Compute approximate reciprocal sqrt. Packet4f x = _mm_rsqrt_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1(1.5f))); // Flush results for denormals to zero. - return _mm_andnot_ps(denormal_mask, pmul(_x,x)); + return pandnot(pmul(_x,x), denormal_mask); } #else @@ -478,41 +109,48 @@ Packet4f psqrt(const Packet4f& x) { return _mm_sqrt_ps(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16b psqrt(const Packet16b& x) { return x; } + #if EIGEN_FAST_MATH template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); Packet4f neg_half = pmul(_x, p4f_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); - Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); + // Identity infinite, zero, negative and denormal arguments. + Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min); + Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf); + Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask); - // Fill in NaNs and Infs for the negative/zero entries. - Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); - Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); - Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), - _mm_and_ps(zero_mask, p4f_inf)); + // Compute an approximate result using the rsqrt intrinsic. + Packet4f y_approx = _mm_rsqrt_ps(_x); - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet4f y_newton = pmul( + y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five)); - // Insert NaNs and Infs in all the right places. - return _mm_or_ps(x, infs_and_nans); + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). + return pselect(not_normal_finite_mask, y_approx, y_newton); } #else template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { - // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation. + // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation. return _mm_div_ps(pset1(1.0f), _mm_sqrt_ps(x)); } @@ -520,7 +158,6 @@ Packet4f prsqrt(const Packet4f& x) { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return _mm_div_pd(pset1(1.0), _mm_sqrt_pd(x)); } @@ -548,7 +185,7 @@ double sqrt(const double &x) { #if EIGEN_COMP_GNUC_STRICT // This works around a GCC bug generating poor code for _mm_sqrt_pd - // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b + // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970 return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/PacketMath.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/PacketMath.h index 60e2517e4b..db102c73a2 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/PacketMath.h @@ -18,13 +18,15 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) +// 32 bits => 8 registers +// 64 bits => 16 registers #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif @@ -34,47 +36,75 @@ namespace internal { // One solution is to increase ABI version using -fabi-version=4 (or greater). // Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper // structure: -template -struct eigen_packet_wrapper -{ - EIGEN_ALWAYS_INLINE operator T&() { return m_val; } - EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } - EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { - m_val = v; - return *this; - } - - T m_val; -}; typedef eigen_packet_wrapper<__m128> Packet4f; -typedef eigen_packet_wrapper<__m128i> Packet4i; typedef eigen_packet_wrapper<__m128d> Packet2d; #else typedef __m128 Packet4f; -typedef __m128i Packet4i; typedef __m128d Packet2d; #endif +typedef eigen_packet_wrapper<__m128i, 0> Packet4i; +typedef eigen_packet_wrapper<__m128i, 1> Packet16b; + template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; template<> struct is_arithmetic<__m128d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template +struct shuffle_mask{ + enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) }; +}; + +// TODO: change the implementation of all swizzle* ops from macro to template, #define vec4f_swizzle1(v,p,q,r,s) \ - (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) + Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask::mask)))) #define vec4i_swizzle1(v,p,q,r,s) \ - (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) + Packet4i(_mm_shuffle_epi32( v, (shuffle_mask::mask))) #define vec2d_swizzle1(v,p,q) \ - (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) - + Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask)))) + #define vec4f_swizzle2(a,b,p,q,r,s) \ - (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) + Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask::mask))) #define vec4i_swizzle2(a,b,p,q,r,s) \ - (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) + Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask::mask))))) + +EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_movelh_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_movehl_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_unpacklo_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_unpackhi_ps(a,b)); +} +#define vec4f_duplane(a,p) \ + vec4f_swizzle2(a,a,p,p,p,p) + +#define vec2d_swizzle2(a,b,mask) \ + Packet2d(_mm_shuffle_pd(a,b,mask)) + +EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) +{ + return Packet2d(_mm_unpacklo_pd(a,b)); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) +{ + return Packet2d(_mm_unpackhi_pd(a,b)); +} +#define vec2d_duplane(a,p) \ + vec2d_swizzle2(a,a,(p<<1)|p) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -83,7 +113,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet2d p2d_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1(X)) + const Packet4f p4f_##NAME = pset1frombits(X) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -92,36 +122,41 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going // to leverage AVX instructions. #ifndef EIGEN_VECTORIZE_AVX -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4, + size = 4, HasHalfPacket = 0, - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, + HasCmp = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasNdtri = 1, + HasExp = 1, + HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasBlend = 1 - -#ifdef EIGEN_VECTORIZE_SSE4_1 - , - HasRound = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 1, + HasCeil = 1, HasFloor = 1, - HasCeil = 1 +#ifdef EIGEN_VECTORIZE_SSE4_1 + HasRound = 1, #endif + HasRint = 1 }; }; -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet2d type; typedef Packet2d half; enum { @@ -130,18 +165,19 @@ template<> struct packet_traits : default_packet_traits size=2, HasHalfPacket = 0, + HasCmp = 1, HasDiv = 1, + HasLog = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1 - -#ifdef EIGEN_VECTORIZE_SSE4_1 - , - HasRound = 1, + HasBlend = 1, HasFloor = 1, - HasCeil = 1 + HasCeil = 1, +#ifdef EIGEN_VECTORIZE_SSE4_1 + HasRound = 1, #endif + HasRint = 1 }; }; #endif @@ -154,13 +190,56 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size=4, + HasShift = 1, HasBlend = 1 }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet16b type; + typedef Packet16b half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + HasHalfPacket = 0, + size=16, + + HasAdd = 1, + HasSub = 1, + HasShift = 0, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSqrt = 1 + }; +}; + +template<> struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef double type; + typedef Packet2d half; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef int type; + typedef Packet4i half; + enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef bool type; + typedef Packet16b half; + enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; #ifndef EIGEN_VECTORIZE_AVX template<> struct scalar_div_cost { enum { value = 7 }; }; @@ -179,6 +258,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +template<> EIGEN_STRONG_INLINE Packet16b pset1(const bool& from) { return _mm_set1_epi8(static_cast(from)); } + +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return _mm_castsi128_ps(pset1(from)); } +template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); } + +template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); } +template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); } +template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); } + +template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); } // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) @@ -190,7 +281,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); } #endif - + template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } @@ -199,9 +290,34 @@ template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b padd(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b psub(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_ps(a,b); +#else + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0)); + return padd(a, pxor(mask, b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_pd(a,b); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); + return padd(a, pxor(mask, b)); +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { @@ -218,6 +334,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); } +template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a) +{ + return psub(pset1(false), a); +} + template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -240,18 +361,126 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet16b pmul(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } +#ifdef EIGEN_VECTORIZE_SSE4_1 +template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { + return _mm_blendv_ps(b,a,mask); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) { + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask))); +} + +template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b,a,mask); } + +template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) { + return _mm_blendv_epi8(b,a,mask); +} +#else +template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) { + Packet16b a_part = _mm_and_si128(mask, a); + Packet16b b_part = _mm_andnot_si128(mask, b); + return _mm_or_si128(a_part, b_part); +} +#endif + +template<> EIGEN_STRONG_INLINE Packet4i ptrue(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet16b ptrue(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +ptrue(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +ptrue(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} + + +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pand(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b por(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pxor(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } + +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_min_ps, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet4f res; + asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet4f res = b; + asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm_min_ps(b, a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_min_pd, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet2d res; + asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet2d res = b; + asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm_min_pd(b, a); +#endif +} template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -263,8 +492,45 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const #endif } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_max_ps, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet4f res; + asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet4f res = b; + asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm_max_ps(b, a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_max_pd, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet2d res; + asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet2d res = b; + asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm_max_pd(b, a); +#endif +} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -276,36 +542,180 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const #endif } +template +EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) { + // In this implementation, we take advantage of the fact that pmin/pmax for SSE + // always return a if either a or b is NaN. + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet m = op(a, b); + return pselect(not_nan_mask_a, m, b); +} + +template +EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) { + // In this implementation, we take advantage of the fact that pmin/pmax for SSE + // always return a if either a or b is NaN. + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet m = op(b, a); + return pselect(not_nan_mask_a, m, a); +} + +// Add specializations for min/max with prescribed NaN progation. +template<> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_nan(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_nan(a, b, pmax); +} + +template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_right (const Packet4i& a) { return _mm_srli_epi32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_left (const Packet4i& a) { return _mm_slli_epi32(a,N); } + +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm_and_pd(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) +{ + #ifdef EIGEN_VECTORIZE_SSSE3 + return _mm_abs_epi32(a); + #else + Packet4i aux = _mm_srai_epi32(a,31); + return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); + #endif +} + #ifdef EIGEN_VECTORIZE_SSE4_1 -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round. + const Packet4f mask = pset1frombits(0x80000000u); + const Packet4f prev0dot5 = pset1frombits(0x3EFFFFFFu); + return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull)); + const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull)); + return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } +#else +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { + // Adds and subtracts signum(a) * 2^52 to force rounding. + const Packet2d limit = pset1(static_cast(1ull<<52)); + const Packet2d abs_a = pabs(a); + Packet2d r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If greater, subtract one. + Packet4f mask = _mm_cmpgt_ps(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet2d tmp = print(a); + // If greater, subtract one. + Packet2d mask = _mm_cmpgt_pd(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If smaller, add one. + Packet4f mask = _mm_cmplt_ps(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet2d tmp = print(a); + // If smaller, add one. + Packet2d mask = _mm_cmplt_pd(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} #endif -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } - template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet16b pload(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } #if EIGEN_COMP_MSVC template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { @@ -340,6 +750,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet16b ploadu(const bool* from) { + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_si128(reinterpret_cast(from)); +} template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) @@ -355,13 +769,32 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec4i_swizzle1(tmp, 0, 0, 1, 1); } +// Loads 8 bools from memory and returns the packet +// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7} +template<> EIGEN_STRONG_INLINE Packet16b ploaddup(const bool* from) +{ + __m128i tmp = _mm_castpd_si128(pload1(reinterpret_cast(from))); + return _mm_unpacklo_epi8(tmp, tmp); +} + +// Loads 4 bools from memory and returns the packet +// {b0, b0 b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3} +template<> EIGEN_STRONG_INLINE Packet16b +ploadquad(const bool* from) { + __m128i tmp = _mm_castps_si128(pload1(reinterpret_cast(from))); + tmp = _mm_unpacklo_epi8(tmp, tmp); + return _mm_unpacklo_epi16(tmp, tmp); +} + template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstore(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { @@ -374,7 +807,15 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); - } +} + +template<> EIGEN_DEVICE_FUNC inline Packet16b pgather(const bool* from, Index stride) +{ + return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride], + from[11*stride], from[10*stride], from[9*stride], from[8*stride], + from[7*stride], from[6*stride], from[5*stride], from[4*stride], + from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { @@ -395,6 +836,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); } +template<> EIGEN_DEVICE_FUNC inline void pscatter(bool* to, const Packet16b& from, Index stride) +{ + to[4*stride*0] = _mm_cvtsi128_si32(from); + to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); +} + // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) @@ -409,7 +858,7 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } -#if EIGEN_COMP_PGI +#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900 typedef const void * SsePrefetchPtrType; #else typedef const char * SsePrefetchPtrType; @@ -437,32 +886,62 @@ template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { retu template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return _mm_cvtsd_f64(a); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { return _mm_cvtsi128_si32(a); } #endif +template<> EIGEN_STRONG_INLINE bool pfirst(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast(x & 1); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ return _mm_shuffle_ps(a,a,0x1B); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) -{ return _mm_shuffle_pd(a,a,0x1); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) -{ return _mm_shuffle_epi32(a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) { +#ifdef EIGEN_VECTORIZE_SSSE3 + __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm_shuffle_epi8(a, mask); +#else + Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); + tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8)); +#endif +} -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); - return _mm_and_ps(a,mask); +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_generic(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) -{ - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); - return _mm_and_pd(a,mask); + +// Extract exponent without existence of Packet2l. +template<> +EIGEN_STRONG_INLINE +Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) { + const Packet2d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); + __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52); + return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3)); } -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) -{ - #ifdef EIGEN_VECTORIZE_SSSE3 - return _mm_abs_epi32(a); - #else - Packet4i aux = _mm_srai_epi32(a,31); - return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); - #endif + +template<> EIGEN_STRONG_INLINE Packet2d pfrexp(const Packet2d& a, Packet2d& exponent) { + return pfrexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_generic(a,exponent); +} + +// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well +// supported by SSE, and has more range than is needed for exponents. +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet2d max_exponent = pset1(2099.0); + const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent); + + // Convert e to integer and swizzle to low-order bits. + const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3); + + // Split 2^e into four factors and multiply: + const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023); + Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4) + Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b + Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(ei, b), b), b); // e - 3b + c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b) + out = pmul(out, c); // a * 2^e + return out; } // with AVX, the default implementations based on pload1 are faster @@ -505,38 +984,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); } -#ifdef EIGEN_VECTORIZE_SSE3 -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - return _mm_hadd_pd(vecs[0], vecs[1]); -} - -#else -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - Packet4f tmp0, tmp1, tmp2; - tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); - tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]); - tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]); - tmp0 = _mm_add_ps(tmp0, tmp1); - tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]); - tmp1 = _mm_add_ps(tmp1, tmp2); - tmp2 = _mm_movehl_ps(tmp1, tmp0); - tmp0 = _mm_movelh_ps(tmp0, tmp1); - return _mm_add_ps(tmp0, tmp2); -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); -} -#endif // SSE3 - template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures @@ -562,38 +1009,28 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) } #ifdef EIGEN_VECTORIZE_SSSE3 -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); -} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp0 = _mm_hadd_epi32(a,a); return pfirst(_mm_hadd_epi32(tmp0,tmp0)); } + #else template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); } - -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - Packet4i tmp0, tmp1, tmp2; - tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - tmp0 = _mm_add_epi32(tmp0, tmp1); - tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - tmp1 = _mm_add_epi32(tmp1, tmp2); - tmp2 = _mm_unpacklo_epi64(tmp0, tmp1); - tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); - return _mm_add_epi32(tmp0, tmp2); -} #endif + +template<> EIGEN_STRONG_INLINE bool predux(const Packet16b& a) { + Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a)); + return (pfirst(tmp) != 0) || (pfirst(_mm_shuffle_epi32(tmp, 1)) != 0); +} + // Other reduction functions: + // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { @@ -611,7 +1048,13 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) // TODO try to call _mm_mul_epu32 directly EIGEN_ALIGN16 int aux[4]; pstore(aux, a); - return (aux[0] * aux[1]) * (aux[2] * aux[3]);; + return (aux[0] * aux[1]) * (aux[2] * aux[3]); +} + +template<> EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) { + Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a)); + return ((pfirst(tmp) == 0x01010101) && + (pfirst(_mm_shuffle_epi32(tmp, 1)) == 0x01010101)); } // min @@ -666,113 +1109,16 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) #endif // EIGEN_VECTORIZE_SSE4_1 } -#if EIGEN_COMP_GNUC -// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x) // { -// Packet4f res = b; -// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); -// return res; +// return _mm_movemask_ps(x) == 0xF; // } -// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i) -// { -// Packet4i res = a; -// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i)); -// return res; -// } -#endif -#ifdef EIGEN_VECTORIZE_SSSE3 -// SSSE3 versions -template -struct palign_impl +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset!=0) - first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset!=0) - first = _mm_alignr_epi8(second,first, Offset*4); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); - } -}; -#else -// SSE2 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset==1) - { - first = _mm_move_ss(first,second); - first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); - } - else if (Offset==2) - { - first = _mm_movehl_ps(first,first); - first = _mm_movelh_ps(first,second); - } - else if (Offset==3) - { - first = _mm_move_ss(first,second); - first = _mm_shuffle_ps(first,second,0x93); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset==1) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_shuffle_epi32(first,0x39); - } - else if (Offset==2) - { - first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); - first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - } - else if (Offset==3) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - { - first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); - first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); - } - } -}; -#endif + return _mm_movemask_ps(x) != 0x0; +} EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -799,6 +1145,100 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); + __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); + __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = _mm_unpacklo_epi16(T0, T2); + kernel.packet[1] = _mm_unpackhi_epi16(T0, T2); + kernel.packet[2] = _mm_unpacklo_epi16(T1, T3); + kernel.packet[3] = _mm_unpackhi_epi16(T1, T3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + // If we number the elements in the input thus: + // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f} + // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f} + // ... + // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff}, + // + // the desired output is: + // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0} + // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1} + // ... + // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff}, + __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f + __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37 + __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f + __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57 + __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a + __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]); + __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]); + __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]); + __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]); + __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]); + __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]); + __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]); + __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]); + __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]); + __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]); + + __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34 + __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ... + __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ... + __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ... + __m128i s6 = _mm_unpacklo_epi16(t5, t7); + __m128i s7 = _mm_unpackhi_epi16(t5, t7); + __m128i s8 = _mm_unpacklo_epi16(t8, ta); + __m128i s9 = _mm_unpackhi_epi16(t8, ta); + __m128i sa = _mm_unpacklo_epi16(t9, tb); + __m128i sb = _mm_unpackhi_epi16(t9, tb); + __m128i sc = _mm_unpacklo_epi16(tc, te); + __m128i sd = _mm_unpackhi_epi16(tc, te); + __m128i se = _mm_unpacklo_epi16(td, tf); + __m128i sf = _mm_unpackhi_epi16(td, tf); + + __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + __m128i u2 = _mm_unpacklo_epi32(s1, s5); + __m128i u3 = _mm_unpackhi_epi32(s1, s5); + __m128i u4 = _mm_unpacklo_epi32(s2, s6); + __m128i u5 = _mm_unpackhi_epi32(s2, s6); + __m128i u6 = _mm_unpacklo_epi32(s3, s7); + __m128i u7 = _mm_unpackhi_epi32(s3, s7); + __m128i u8 = _mm_unpacklo_epi32(s8, sc); + __m128i u9 = _mm_unpackhi_epi32(s8, sc); + __m128i ua = _mm_unpacklo_epi32(s9, sd); + __m128i ub = _mm_unpackhi_epi32(s9, sd); + __m128i uc = _mm_unpacklo_epi32(sa, se); + __m128i ud = _mm_unpackhi_epi32(sa, se); + __m128i ue = _mm_unpacklo_epi32(sb, sf); + __m128i uf = _mm_unpackhi_epi32(sb, sf); + + kernel.packet[0] = _mm_unpacklo_epi64(u0, u8); + kernel.packet[1] = _mm_unpackhi_epi64(u0, u8); + kernel.packet[2] = _mm_unpacklo_epi64(u1, u9); + kernel.packet[3] = _mm_unpackhi_epi64(u1, u9); + kernel.packet[4] = _mm_unpacklo_epi64(u2, ua); + kernel.packet[5] = _mm_unpackhi_epi64(u2, ua); + kernel.packet[6] = _mm_unpacklo_epi64(u3, ub); + kernel.packet[7] = _mm_unpackhi_epi64(u3, ub); + kernel.packet[8] = _mm_unpacklo_epi64(u4, uc); + kernel.packet[9] = _mm_unpackhi_epi64(u4, uc); + kernel.packet[10] = _mm_unpacklo_epi64(u5, ud); + kernel.packet[11] = _mm_unpackhi_epi64(u5, ud); + kernel.packet[12] = _mm_unpacklo_epi64(u6, ue); + kernel.packet[13] = _mm_unpackhi_epi64(u6, ue); + kernel.packet[14] = _mm_unpacklo_epi64(u7, uf); + kernel.packet[15] = _mm_unpackhi_epi64(u7, uf); +} + template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { const __m128i zero = _mm_setzero_si128(); const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); @@ -830,46 +1270,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons #endif } -template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_ps(a,pset1(b),1); -#else - return _mm_move_ss(a, _mm_load_ss(&b)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_pd(a,pset1(b),1); -#else - return _mm_move_sd(a, _mm_load_sd(&b)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_ps(a,pset1(b),(1<<3)); -#else - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); - return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1(b))); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_pd(a,pset1(b),(1<<1)); -#else - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); - return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1(b))); -#endif -} - // Scalar path for pmadd with FMA to ensure consistency with vectorized path. -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { return ::fmaf(a,b,c); } @@ -878,11 +1280,219 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co } #endif + +// Packet math for Eigen::half +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. + typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); +} + +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha - hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha / hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) +{ + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast(a); + to[stride*1].x = static_cast(a >> 16); + to[stride*2].x = static_cast(a >> 32); + to[stride*3].x = static_cast(a >> 48); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); +} + +#endif + + } // end namespace internal } // end namespace Eigen -#if EIGEN_COMP_PGI +#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900 // PGI++ does not define the following intrinsics in C++ mode. static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/TypeCasting.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/TypeCasting.h index c6ca8c716c..d2a0037e01 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -69,6 +69,71 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f return _mm_cvtps_pd(a); } +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return _mm_castps_si128(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return _mm_castsi128_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return _mm_castsi128_pd(a); +} + +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return _mm_castpd_si128(a); +} + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); + float f1 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 16)); + float f2 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 32)); + float f3 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 48)); + float f4 = static_cast(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif } // end namespace internal diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/AssignmentFunctors.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/AssignmentFunctors.h index 4153b877cf..bf64ef4ede 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef __CUDACC__ +#ifdef EIGEN_GPUCC // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast(b)=a; a=t; #else @@ -157,7 +157,16 @@ template struct functor_traits > { enum { Cost = 3 * NumTraits::ReadCost, - PacketAccess = packet_traits::Vectorizable + PacketAccess = + #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__)) + // This is a partial workaround for a bug in clang generating bad code + // when mixing 256/512 bits loads and 128 bits moves. + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684 + // https://bugs.llvm.org/show_bug.cgi?id=40815 + 0 + #else + packet_traits::Vectorizable + #endif }; }; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/BinaryFunctors.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/BinaryFunctors.h index 3eae6b8cad..63f09ab931 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/BinaryFunctors.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/BinaryFunctors.h @@ -39,32 +39,26 @@ struct scalar_sum_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::padd(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux(a); } }; template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, // rough estimate! + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasAdd && packet_traits::HasAdd // TODO vectorize mixed sum }; }; -/** \internal - * \brief Template specialization to deprecate the summation of boolean expressions. - * This is required to solve Bug 426. - * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast() - */ -template<> struct scalar_sum_op : scalar_sum_op { - EIGEN_DEPRECATED - scalar_sum_op() {} -}; + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op::operator() (const bool& a, const bool& b) const { return a || b; } /** \internal @@ -83,23 +77,27 @@ struct scalar_product_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux_mul(a); } }; template struct functor_traits > { enum { - Cost = (NumTraits::MulCost + NumTraits::MulCost)/2, // rough estimate! + Cost = (int(NumTraits::MulCost) + int(NumTraits::MulCost))/2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasMul && packet_traits::HasMul // TODO vectorize mixed product }; }; +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op::operator() (const bool& a, const bool& b) const { return a && b; } + + /** \internal * \brief Template functor to compute the conjugate product of two scalars * @@ -116,11 +114,11 @@ struct scalar_conj_product_op : binary_op_base typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return conj_helper().pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return conj_helper().pmul(a,b); } }; template @@ -136,21 +134,28 @@ struct functor_traits > { * * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff() */ -template +template struct scalar_min_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { + return internal::pmin(a, b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmin(a,b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { + return internal::pmin(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const - { return internal::predux_min(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + { + return internal::predux_min(a); + } }; -template -struct functor_traits > { + +template +struct functor_traits > { enum { Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, PacketAccess = internal::is_same::value && packet_traits::HasMin @@ -162,21 +167,28 @@ struct functor_traits > { * * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff() */ -template -struct scalar_max_op : binary_op_base +template +struct scalar_max_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { + return internal::pmax(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmax(a,b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { + return internal::pmax(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const - { return internal::predux_max(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + { + return internal::predux_max(a); + } }; -template -struct functor_traits > { + +template +struct functor_traits > { enum { Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, PacketAccess = internal::is_same::value && packet_traits::HasMax @@ -253,7 +265,6 @@ struct scalar_cmp_op : binary_op_base > { /** \internal * \brief Template functor to compute the pow of two scalars + * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow */ template struct scalar_pow_op : binary_op_base @@ -301,16 +313,31 @@ struct scalar_pow_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif + EIGEN_DEVICE_FUNC inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { + return generic_pow(a,b); + } }; + template struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = (!NumTraits::IsComplex && !NumTraits::IsInteger && + packet_traits::HasExp && packet_traits::HasLog && + packet_traits::HasRound && packet_traits::HasCmp && + // Temporarly disable packet access for half/bfloat16 until + // accuracy is improved. + !is_same::value && !is_same::value + ) + }; }; - - //---------- non associative binary functors ---------- /** \internal @@ -337,7 +364,7 @@ struct scalar_difference_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, PacketAccess = is_same::value && packet_traits::HasSub && packet_traits::HasSub }; }; @@ -382,11 +409,14 @@ struct functor_traits > { struct scalar_boolean_and_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pand(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true }; }; @@ -398,11 +428,14 @@ template<> struct functor_traits { struct scalar_boolean_or_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::por(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true }; }; @@ -414,11 +447,44 @@ template<> struct functor_traits { struct scalar_boolean_xor_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pxor(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true + }; +}; + +/** \internal + * \brief Template functor to compute the absolute difference of two scalars + * + * \sa class CwiseBinaryOp, MatrixBase::absolute_difference + */ +template +struct scalar_absolute_difference_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op) +#else + scalar_absolute_difference_op() { + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const + { return numext::absdiff(a,b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pabsdiff(a,b); } +}; +template +struct functor_traits > { + enum { + Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + PacketAccess = is_same::value && packet_traits::HasAbsDiff }; }; @@ -436,7 +502,7 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind1st_op(const first_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -455,7 +521,7 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind2nd_op(const second_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/NullaryFunctors.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/NullaryFunctors.h index b03be0269c..192f225dda 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/NullaryFunctors.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/NullaryFunctors.h @@ -37,26 +37,27 @@ template struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; -template struct linspaced_op_impl; +template struct linspaced_op_impl; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : - m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : Scalar((high-low)/RealScalar(num_steps-1))), m_flip(numext::abs(high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { - typedef typename NumTraits::Real RealScalar; if(m_flip) - return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step); + return (i==0)? m_low : Scalar(m_high - RealScalar(m_size1-i)*m_step); else - return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step); + return (i==m_size1)? m_high : Scalar(m_low + RealScalar(i)*m_step); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { // Principle: @@ -65,17 +66,17 @@ struct linspaced_op_impl { Packet pi = plset(Scalar(i-m_size1)); Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); - if(i==0) - res = pinsertfirst(res, m_low); - return res; + if (EIGEN_PREDICT_TRUE(i != 0)) return res; + Packet mask = pcmp_lt(pset1(0), plset(0)); + return pselect(mask, res, pset1(m_low)); } else { Packet pi = plset(Scalar(i)); Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); - if(i==m_size1-unpacket_traits::size+1) - res = pinsertlast(res, m_high); - return res; + if(EIGEN_PREDICT_TRUE(i != m_size1-unpacket_traits::size+1)) return res; + Packet mask = pcmp_lt(plset(0), pset1(unpacket_traits::size-1)); + return pselect(mask, res, pset1(m_high)); } } @@ -86,10 +87,10 @@ struct linspaced_op_impl const bool m_flip; }; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), m_multiplier((high-low)/convert_index(num_steps<=1 ? 1 : num_steps-1)), m_divisor(convert_index((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))), @@ -115,8 +116,8 @@ struct linspaced_op_impl // Forward declaration (we default to random access which does not really give // us a speed gain when using packet access but it allows to use the functor in // nested expressions). -template struct linspaced_op; -template struct functor_traits< linspaced_op > +template struct linspaced_op; +template struct functor_traits< linspaced_op > { enum { @@ -126,9 +127,9 @@ template struct functor_traits< linspaced IsRepeatable = true }; }; -template struct linspaced_op +template struct linspaced_op { - linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) + EIGEN_DEVICE_FUNC linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low),high,num_steps) {} @@ -136,11 +137,11 @@ template struct linspaced_op EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp(i); } // This proxy object handles the actual required temporaries and the different // implementations (integer vs. floating point). - const linspaced_op_impl::IsInteger> impl; + const linspaced_op_impl::IsInteger> impl; }; // Linear access is automatically determined from the operator() prototypes available for the given functor. @@ -166,12 +167,12 @@ struct has_unary_operator,IndexType> { enum { value = template struct has_binary_operator,IndexType> { enum { value = 1}; }; -template -struct has_nullary_operator,IndexType> { enum { value = 0}; }; -template -struct has_unary_operator,IndexType> { enum { value = 1}; }; -template -struct has_binary_operator,IndexType> { enum { value = 0}; }; +template +struct has_nullary_operator,IndexType> { enum { value = 0}; }; +template +struct has_unary_operator,IndexType> { enum { value = 1}; }; +template +struct has_binary_operator,IndexType> { enum { value = 0}; }; template struct has_nullary_operator,IndexType> { enum { value = 1}; }; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/StlFunctors.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/StlFunctors.h index 9c1d75850b..4570c9b634 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/StlFunctors.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/StlFunctors.h @@ -12,6 +12,28 @@ namespace Eigen { +// Portable replacements for certain functors. +namespace numext { + +template +struct equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs == rhs; + } +}; + +template +struct not_equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs != rhs; + } +}; + +} + + namespace internal { // default functor traits for STL functors: @@ -68,11 +90,19 @@ template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; -#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900) +template +struct functor_traits > + : functor_traits > {}; + +#if (EIGEN_COMP_CXXVER < 11) // std::binder* are deprecated since c++11 and will be removed in c++17 template struct functor_traits > @@ -83,7 +113,7 @@ struct functor_traits > { enum { Cost = functor_traits::Cost, PacketAccess = false }; }; #endif -#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910) +#if (EIGEN_COMP_CXXVER < 17) // std::unary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/UnaryFunctors.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/UnaryFunctors.h index b56e7afd2c..16136d185a 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/UnaryFunctors.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/functors/UnaryFunctors.h @@ -109,7 +109,7 @@ struct functor_traits > template struct scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op) EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); } + EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } }; @@ -117,7 +117,15 @@ template struct functor_traits > { enum { - Cost = NumTraits::IsComplex ? NumTraits::AddCost : 0, + Cost = 0, + // Yes the cost is zero even for complexes because in most cases for which + // the cost is used, conjugation turns to be a no-op. Some examples: + // cost(a*conj(b)) == cost(a*b) + // cost(a+conj(b)) == cost(a+b) + // ::HasConj }; }; @@ -130,7 +138,7 @@ struct functor_traits > template struct scalar_arg_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op) typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::parg(a); } @@ -158,6 +166,44 @@ template struct functor_traits > { enum { Cost = is_same::value ? 0 : NumTraits::AddCost, PacketAccess = false }; }; +/** \internal + * \brief Template functor to arithmetically shift a scalar right by a number of bits + * + * \sa class CwiseUnaryOp, MatrixBase::shift_right() + */ +template +struct scalar_shift_right_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const + { return a >> N; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::parithmetic_shift_right(a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = packet_traits::HasShift }; }; + +/** \internal + * \brief Template functor to logically shift a scalar left by a number of bits + * + * \sa class CwiseUnaryOp, MatrixBase::shift_left() + */ +template +struct scalar_shift_left_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const + { return a << N; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::plogical_shift_left(a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = packet_traits::HasShift }; }; + /** \internal * \brief Template functor to extract the real part of a complex * @@ -262,6 +308,26 @@ struct functor_traits > { }; }; +/** \internal + * + * \brief Template functor to compute the exponential of a scalar - 1. + * + * \sa class CwiseUnaryOp, ArrayBase::expm1() + */ +template struct scalar_expm1_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); } +}; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasExpm1, + Cost = functor_traits >::Cost // TODO measure cost of expm1 + }; +}; + /** \internal * * \brief Template functor to compute the logarithm of a scalar @@ -321,7 +387,7 @@ struct functor_traits > { */ template struct scalar_log10_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD_MATH(log10) return log10(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); } }; @@ -329,6 +395,22 @@ template struct functor_traits > { enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasLog10 }; }; +/** \internal + * + * \brief Template functor to compute the base-2 logarithm of a scalar + * + * \sa class CwiseUnaryOp, Cwise::log2() + */ +template struct scalar_log2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * numext::log(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); } +}; +template +struct functor_traits > +{ enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasLog }; }; + /** \internal * \brief Template functor to compute the square root of a scalar * \sa class CwiseUnaryOp, Cwise::sqrt() @@ -356,13 +438,25 @@ struct functor_traits > { }; }; +// Boolean specialization to eliminate -Wimplicit-conversion-floating-point-to-bool warnings. +template<> struct scalar_sqrt_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return a; } +}; +template <> +struct functor_traits > { + enum { Cost = 1, PacketAccess = packet_traits::Vectorizable }; +}; + /** \internal * \brief Template functor to compute the reciprocal square root of a scalar * \sa class CwiseUnaryOp, Cwise::rsqrt() */ template struct scalar_rsqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::rsqrt(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } }; @@ -528,6 +622,23 @@ struct functor_traits > { }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the atanh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::atanh() + */ +template +struct scalar_atanh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the sinh of a scalar * \sa class CwiseUnaryOp, ArrayBase::sinh() @@ -547,6 +658,23 @@ struct functor_traits > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the asinh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::asinh() + */ +template +struct scalar_asinh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the cosh of a scalar * \sa class CwiseUnaryOp, ArrayBase::cosh() @@ -566,6 +694,23 @@ struct functor_traits > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the acosh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::acosh() + */ +template +struct scalar_acosh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() @@ -578,9 +723,13 @@ struct scalar_inverse_op { EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pdiv(pset1(Scalar(1)),a); } }; -template -struct functor_traits > -{ enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasDiv }; }; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasDiv, + Cost = scalar_div_cost::value + }; +}; /** \internal * \brief Template functor to compute the square of a scalar @@ -598,6 +747,19 @@ template struct functor_traits > { enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasMul }; }; +// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. +template<> +struct scalar_square_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return a; } +}; +template<> +struct functor_traits > +{ enum { Cost = 0, PacketAccess = packet_traits::Vectorizable }; }; + /** \internal * \brief Template functor to compute the cube of a scalar * \sa class CwiseUnaryOp, Cwise::cube() @@ -614,6 +776,19 @@ template struct functor_traits > { enum { Cost = 2*NumTraits::MulCost, PacketAccess = packet_traits::HasMul }; }; +// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. +template<> +struct scalar_cube_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return a; } +}; +template<> +struct functor_traits > +{ enum { Cost = 0, PacketAccess = packet_traits::Vectorizable }; }; + /** \internal * \brief Template functor to compute the rounded value of a scalar * \sa class CwiseUnaryOp, ArrayBase::round() @@ -652,6 +827,25 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the rounded (with current rounding mode) value of a scalar + * \sa class CwiseUnaryOp, ArrayBase::rint() + */ +template struct scalar_rint_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = packet_traits::HasRint + }; +}; + /** \internal * \brief Template functor to compute the ceil of a scalar * \sa class CwiseUnaryOp, ArrayBase::ceil() @@ -678,7 +872,13 @@ struct functor_traits > template struct scalar_isnan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isnan(a); +#else + return (numext::isnan)(a); +#endif + } }; template struct functor_traits > @@ -696,7 +896,13 @@ struct functor_traits > template struct scalar_isinf_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isinf(a); +#else + return (numext::isinf)(a); +#endif + } }; template struct functor_traits > @@ -714,7 +920,13 @@ struct functor_traits > template struct scalar_isfinite_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isfinite(a); +#else + return (numext::isfinite)(a); +#endif + } }; template struct functor_traits > @@ -746,9 +958,9 @@ struct functor_traits > { * \brief Template functor to compute the signum of a scalar * \sa class CwiseUnaryOp, Cwise::sign() */ -template::IsComplex!=0) > struct scalar_sign_op; +template::IsComplex!=0), bool is_integer=(NumTraits::IsInteger!=0) > struct scalar_sign_op; template -struct scalar_sign_op { +struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { @@ -758,8 +970,21 @@ struct scalar_sign_op { //template //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } }; + template -struct scalar_sign_op { +struct scalar_sign_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + { + return (numext::isnan)(a) ? a : Scalar( (a>Scalar(0)) - (a + //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } +}; + +template +struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { @@ -777,7 +1002,7 @@ struct scalar_sign_op { template struct functor_traits > { enum { - Cost = + Cost = NumTraits::IsComplex ? ( 8*NumTraits::MulCost ) // roughly : ( 3*NumTraits::AddCost), @@ -785,6 +1010,120 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the logistic function of a scalar + * \sa class CwiseUnaryOp, ArrayBase::logistic() + */ +template +struct scalar_logistic_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + return packetOp(x); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { + const Packet one = pset1(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + +#ifndef EIGEN_GPU_COMPILE_PHASE +/** \internal + * \brief Template specialization of the logistic function for float. + * + * Uses just a 9/10-degree rational interpolant which + * interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range + * [-9, 18]. Below -9 we use the more accurate approximation + * 1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing + * one ulp. The shifted logistic is interpolated because it was easier to + * make the fit converge. + * + */ +template <> +struct scalar_logistic_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { + return packetOp(x); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& _x) const { + const Packet cutoff_lower = pset1(-9.f); + const Packet lt_mask = pcmp_lt(_x, cutoff_lower); + const bool any_small = predux_any(lt_mask); + + // The upper cut-off is the smallest x for which the rational approximation evaluates to 1. + // Choosing this value saves us a few instructions clamping the results at the end. +#ifdef EIGEN_VECTORIZE_FMA + const Packet cutoff_upper = pset1(15.7243833541870117f); +#else + const Packet cutoff_upper = pset1(15.6437711715698242f); +#endif + const Packet x = pmin(_x, cutoff_upper); + + // The monomial coefficients of the numerator polynomial (odd). + const Packet alpha_1 = pset1(2.48287947061529e-01f); + const Packet alpha_3 = pset1(8.51377133304701e-03f); + const Packet alpha_5 = pset1(6.08574864600143e-05f); + const Packet alpha_7 = pset1(1.15627324459942e-07f); + const Packet alpha_9 = pset1(4.37031012579801e-11f); + + // The monomial coefficients of the denominator polynomial (even). + const Packet beta_0 = pset1(9.93151921023180e-01f); + const Packet beta_2 = pset1(1.16817656904453e-01f); + const Packet beta_4 = pset1(1.70198817374094e-03f); + const Packet beta_6 = pset1(6.29106785017040e-06f); + const Packet beta_8 = pset1(5.76102136993427e-09f); + const Packet beta_10 = pset1(6.10247389755681e-13f); + + // Since the polynomials are odd/even, we need x^2. + const Packet x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. + Packet p = pmadd(x2, alpha_9, alpha_7); + p = pmadd(x2, p, alpha_5); + p = pmadd(x2, p, alpha_3); + p = pmadd(x2, p, alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. + Packet q = pmadd(x2, beta_10, beta_8); + q = pmadd(x2, q, beta_6); + q = pmadd(x2, q, beta_4); + q = pmadd(x2, q, beta_2); + q = pmadd(x2, q, beta_0); + // Divide the numerator by the denominator and shift it up. + const Packet logistic = padd(pdiv(p, q), pset1(0.5f)); + if (EIGEN_PREDICT_FALSE(any_small)) { + const Packet exponential = pexp(_x); + return pselect(lt_mask, exponential, logistic); + } else { + return logistic; + } + } +}; +#endif // #ifndef EIGEN_GPU_COMPILE_PHASE + +template +struct functor_traits > { + enum { + // The cost estimate for float here here is for the common(?) case where + // all arguments are greater than -9. + Cost = scalar_div_cost::HasDiv>::value + + (internal::is_same::value + ? NumTraits::AddCost * 15 + NumTraits::MulCost * 11 + : NumTraits::AddCost * 2 + + functor_traits >::Cost), + PacketAccess = + packet_traits::HasAdd && packet_traits::HasDiv && + (internal::is_same::value + ? packet_traits::HasMul && packet_traits::HasMax && + packet_traits::HasMin + : packet_traits::HasNegate && packet_traits::HasExp) + }; +}; + } // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 681451cc30..f35b760c1d 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,13 @@ namespace Eigen { namespace internal { -template +enum GEBPPacketSizeType { + GEBPPacketFull = 0, + GEBPPacketHalf, + GEBPPacketQuarter +}; + +template class gebp_traits; @@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff return a<=0 ? b : a; } -#if EIGEN_ARCH_i386_OR_x86_64 -const std::ptrdiff_t defaultL1CacheSize = 32*1024; -const std::ptrdiff_t defaultL2CacheSize = 256*1024; -const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024; +#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE #else -const std::ptrdiff_t defaultL1CacheSize = 16*1024; -const std::ptrdiff_t defaultL2CacheSize = 512*1024; -const std::ptrdiff_t defaultL3CacheSize = 512*1024; +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) + +#if EIGEN_ARCH_i386_OR_x86_64 +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024); +#elif EIGEN_ARCH_PPC +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024); +#else +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024); #endif +#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE + /** \internal */ struct CacheSizes { CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { @@ -50,7 +82,6 @@ struct CacheSizes { std::ptrdiff_t m_l3; }; - /** \internal */ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { @@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); + #ifdef EIGEN_VECTORIZE_AVX512 + // We need to find a rationale for that, but without this adjustment, + // performance with AVX512 is pretty bad, like -20% slower. + // One reason is that with increasing packet-size, the blocking size k + // has to become pretty small if we want that 1 lhs panel fit within L1. + // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: + // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. + // This is quite small for a good reuse of the accumulation registers. + l1 *= 4; + #endif if (num_threads > 1) { typedef typename Traits::ResScalar ResScalar; @@ -308,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ computeProductBlockingSizes(k, m, n, num_threads); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD - #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else +template +struct RhsPanelHelper { + private: + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + public: + typedef typename conditional=4, RhsPacketx4, RhsPacket>::type type; +}; - // FIXME (a bit overkill maybe ?) +template +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) - { - c = cj.pmadd(a,b,c); - } - }; +template +struct packet_conditional { typedef T3 type; }; - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) - { - t = b; t = cj.pmul(a,t); c = padd(c,t); - } - }; +template +struct packet_conditional { typedef T1 type; }; - template - EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) - { - gebp_madd_selector::run(cj,a,b,c,t); - } +template +struct packet_conditional { typedef T2 type; }; - #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); -#endif +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + ScalarPacket /* Vectorization logic * real*real: unpack rhs to constant packets, ... @@ -348,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: @@ -356,13 +422,17 @@ public: typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -371,10 +441,12 @@ public: // register block size along the M direction (currently, this one cannot be modified) default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) - // we assume 16 registers +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ + && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) + // we assume 16 registers or more // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + // Bug 1515: MSVC prior to v19.14 yields to register spilling. mr = Vectorizable ? 3*LhsPacketSize : default_mr, #else mr = default_mr, @@ -384,37 +456,41 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); @@ -432,8 +508,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -448,6 +524,12 @@ public: #endif } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -461,21 +543,25 @@ public: }; -template -class gebp_traits, RealScalar, _ConjLhs, false> +template +class gebp_traits, RealScalar, _ConjLhs, false, Arch, _PacketSize> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -490,13 +576,12 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; @@ -505,13 +590,42 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { + loadRhsQuad_impl(b,dest, typename conditional::type()); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const + { + // FIXME we can do better! + // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad(tmp); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const + { + eigen_internal_assert(RhsPacketSize<=8); dest = pset1(*b); } @@ -520,27 +634,20 @@ public: dest = pload(a); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu(a); + dest = ploadu(a); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -555,13 +662,20 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const + { + conj_helper cj; r = cj.pmadd(c,alpha,r); } protected: - conj_helper cj; }; template @@ -580,13 +694,57 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// it terms of real coefficients. + template -const DoublePacket& predux_downto4(const DoublePacket &a) +const DoublePacket& +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size<=8>::type* = 0) { return a; } -template struct unpacket_traits > { typedef DoublePacket half; }; +template +DoublePacket::half> +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket::half> res; + typedef std::complex::type> Cplx; + typedef typename packet_traits::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size<=8>::type* = 0) +{ + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); +} + +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish too :( + typedef typename NumTraits::Real RealScalar; + RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])}; + RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])}; + dest.first = ploadquad(r); + dest.second = ploadquad(i); +} + + +template struct unpacket_traits > { + typedef DoublePacket::half> half; +}; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) // { @@ -596,8 +754,8 @@ template struct unpacket_traits > { typede // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs > +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -605,15 +763,21 @@ public: typedef std::complex RhsScalar; typedef std::complex ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - RealPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits::vectorizable + && unpacket_traits::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -623,14 +787,16 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type RealPacket; - typedef typename packet_traits::type ScalarPacket; - typedef DoublePacket DoublePacketType; + typedef DoublePacket DoublePacketType; + typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; + + // this actualy holds 8 packets! + typedef QuadPacket RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -641,17 +807,41 @@ public: } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pset1(numext::real(*b)); - dest.second = pset1(numext::imag(*b)); + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); + } + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const + { + loadRhs(b, dest); + } + + // Vectorized path + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { @@ -659,33 +849,7 @@ public: } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); - } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); - } - - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) - { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - } - - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) - { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadQuadToDoublePacket(b,dest); } // nothing special here @@ -694,47 +858,59 @@ public: dest = pload((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu((const typename unpacket_traits::type*)(a)); + dest = ploadu((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE + typename enable_if::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacketType& alpha, ResPacketType& r) const { // assemble c - ResPacket tmp; + ResPacketType tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = padd(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = psub(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -744,8 +920,8 @@ protected: conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs > +template +class gebp_traits, false, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -753,14 +929,25 @@ public: typedef Scalar RhsScalar; typedef Scalar ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -771,14 +958,11 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -786,22 +970,25 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! -// b0 = pload1(b+0); -// b1 = pload1(b+1); -// } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -810,21 +997,23 @@ public: EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + dest = ploadquad(b); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploaddup(a); + dest = ploaddup(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -840,16 +1029,24 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const + { + conj_helper cj; r = cj.pmadd(alpha,c,r); } protected: - conj_helper cj; + }; -/* optimized GEneral packed Block * packed Panel product kernel +/* optimized General packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments @@ -859,26 +1056,47 @@ protected: template struct gebp_kernel { - typedef gebp_traits Traits; + typedef gebp_traits Traits; + typedef gebp_traits HalfTraits; + typedef gebp_traits QuarterTraits; + typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper::type RhsPanel15; + + typedef gebp_traits SwappedTraits; - typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter = QuarterTraits::LhsProgress, RhsProgress = Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -888,6 +1106,299 @@ struct gebp_kernel Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; +template::LhsProgress> +struct last_row_process_16_packets +{ + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + EIGEN_UNUSED_VARIABLE(res); + EIGEN_UNUSED_VARIABLE(straits); + EIGEN_UNUSED_VARIABLE(blA); + EIGEN_UNUSED_VARIABLE(blB); + EIGEN_UNUSED_VARIABLE(depth); + EIGEN_UNUSED_VARIABLE(endk); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j2); + EIGEN_UNUSED_VARIABLE(alpha); + EIGEN_UNUSED_VARIABLE(C0); + } +}; + + +template +struct last_row_process_16_packets { + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + typedef typename unpacket_traits::half>::half SResPacketQuarter; + typedef typename unpacket_traits::half>::half SLhsPacketQuarter; + typedef typename unpacket_traits::half>::half SRhsPacketQuarter; + typedef typename unpacket_traits::half>::half SAccPacketQuarter; + + SResPacketQuarter R = res.template gatherPacket(i, j2); + SResPacketQuarter alphav = pset1(alpha); + + if (depth - endk > 0) + { + // We have to handle the last row(s) of the rhs, which + // correspond to a half-packet + SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0)); + + for (Index kk = endk; kk < depth; kk++) + { + SLhsPacketQuarter a0; + SRhsPacketQuarter b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + straits.madd(a0,b0,c0,b0, fix<0>); + blB += SwappedTraits::LhsProgress/4; + blA += 1; + } + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R); + } + res.scatterPacket(i, j2, R); + } +}; + +template +struct lhs_process_one_packet +{ + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + __asm__ ("" : "+x,m" (*A0)); + #endif + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i(alpha); + + R0 = r0.template loadPacket(0); + R1 = r1.template loadPacket(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket(0); + R1 = r3.template loadPacket(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ + } while(false); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k(alpha); + R0 = r0.template loadPacket(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template +struct lhs_process_fraction_of_packet : lhs_process_one_packet +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + template EIGEN_DONT_INLINE void gebp_kernel @@ -904,10 +1415,12 @@ void gebp_kernel=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index prefetch_res_offset = 32/sizeof(ResScalar); + const int prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -965,36 +1478,48 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1014,7 +1539,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1036,9 +1562,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); @@ -1046,9 +1572,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r2.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); @@ -1056,9 +1582,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); @@ -1094,20 +1620,20 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1117,8 +1643,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1196,7 +1722,8 @@ void gebp_kernel=6 without FMA (bug 1637) @@ -1205,24 +1732,24 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1242,7 +1769,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r1.loadPacket(0 * Traits::ResPacketSize); - R3 = r1.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(0 * Traits::ResPacketSize); + R3 = r1.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); @@ -1265,10 +1793,10 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(0 * Traits::ResPacketSize); + R3 = r3.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); @@ -1313,8 +1841,8 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1327,8 +1855,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); @@ -1358,186 +1886,43 @@ void gebp_kernel=1*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) - for(Index i=peeled_mc2; i(alpha); - - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r1.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0 * Traits::ResPacketSize, R0); - r1.storePacket(0 * Traits::ResPacketSize, R1); - - R0 = r2.loadPacket(0 * Traits::ResPacketSize); - R1 = r3.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0 * Traits::ResPacketSize, R0); - r3.storePacket(0 * Traits::ResPacketSize, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - r0.storePacket(0 * Traits::ResPacketSize, R0); - } - } + lhs_process_one_packet p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc1::half>::size; + const int SResPacketQuarterSize = unpacket_traits::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && - (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr)) + (SwappedTraits::LhsProgress<=16) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); @@ -1560,15 +1945,15 @@ void gebp_kernel); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1581,7 +1966,7 @@ void gebp_kernel); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1591,7 +1976,7 @@ void gebp_kernel=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket(i, j2); @@ -1604,16 +1989,25 @@ void gebp_kernel); straits.acc(c0, alphav, R); } else { - straits.acc(predux_downto4(C0), alphav, R); + straits.acc(predux_half_dowto4(C0), alphav, R); } res.scatterPacket(i, j2, R); } + else if (SwappedTraits::LhsProgress==16) + { + // Special case where we have to first reduce the + // accumulation register C0. We specialize the block in + // template form, so that LhsProgress < 16 paths don't + // fail to compile + last_row_process_16_packets p; + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + } else { SResPacket R = res.template gatherPacket(i, j2); @@ -1636,14 +2030,14 @@ void gebp_kernel -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1717,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 - : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; Index i=0; @@ -1733,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); + C = lhs.template loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1753,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1771,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); } } - // Pack scalars - if(Pack21) + // Pack half packets + if(HasHalf && Pack1>=HalfPacketSize) { - for(; i(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); } } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). + if(Pack21) + { + for(; i -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1821,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; -// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; -// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; -// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - - int pack = Pack1; Index i = 0; + int pack = Pack1; + int psize = PacketSize; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/pack)*pack; + Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; for(; i=PacketSize) + if(pack>=psize && psize >= QuarterPacketSize) { - for(; k kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + if (psize == PacketSize) { + PacketBlock kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } } - count += PacketSize*pack; + count += psize*pack; } } + for(; k= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). + if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } } for(; i kernel; @@ -1979,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; - kernel.packet[0] = dm0.loadPacket(k); - kernel.packet[1%PacketSize] = dm1.loadPacket(k); - kernel.packet[2%PacketSize] = dm2.loadPacket(k); - kernel.packet[3%PacketSize] = dm3.loadPacket(k); + kernel.packet[0 ] = dm0.template loadPacket(k); + kernel.packet[1%PacketSize] = dm1.template loadPacket(k); + kernel.packet[2%PacketSize] = dm2.template loadPacket(k); + kernel.packet[3%PacketSize] = dm3.template loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); @@ -2023,94 +2501,104 @@ template { typedef typename packet_traits::type Packet; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; typedef typename DataMapper::LinearMapper LinearMapper; - enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); -}; - -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) -{ - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); - EIGEN_UNUSED_VARIABLE(stride); - EIGEN_UNUSED_VARIABLE(offset); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - conj_if::IsComplex && Conjugate> cj; - Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; - Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; - Index count = 0; - -// if(nr>=8) -// { -// for(Index j2=0; j2(&rhs[k*rhsStride + j2]); -// pstoreu(blockB+count, cj.pconj(A)); -// } else if (PacketSize==4) { -// Packet A = ploadu(&rhs[k*rhsStride + j2]); -// Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); -// pstoreu(blockB+count, cj.pconj(A)); -// pstoreu(blockB+count+PacketSize, cj.pconj(B)); -// } else { -// const Scalar* b0 = &rhs[k*rhsStride + j2]; -// blockB[count+0] = cj(b0[0]); -// blockB[count+1] = cj(b0[1]); -// blockB[count+2] = cj(b0[2]); -// blockB[count+3] = cj(b0[3]); -// blockB[count+4] = cj(b0[4]); -// blockB[count+5] = cj(b0[5]); -// blockB[count+6] = cj(b0[6]); -// blockB[count+7] = cj(b0[7]); -// } -// count += 8; -// } -// // skip what we have after -// if(PanelMode) count += 8 * (stride-offset-depth); -// } -// } - if(nr>=4) + enum { PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size}; + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) { - for(Index j2=packet_cols8; j2=depth && offset<=stride)); + const bool HasHalf = (int)HalfPacketSize < (int)PacketSize; + const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize; + conj_if::IsComplex && Conjugate> cj; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; + Index count = 0; + + // if(nr>=8) + // { + // for(Index j2=0; j2(&rhs[k*rhsStride + j2]); + // pstoreu(blockB+count, cj.pconj(A)); + // } else if (PacketSize==4) { + // Packet A = ploadu(&rhs[k*rhsStride + j2]); + // Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); + // pstoreu(blockB+count, cj.pconj(A)); + // pstoreu(blockB+count+PacketSize, cj.pconj(B)); + // } else { + // const Scalar* b0 = &rhs[k*rhsStride + j2]; + // blockB[count+0] = cj(b0[0]); + // blockB[count+1] = cj(b0[1]); + // blockB[count+2] = cj(b0[2]); + // blockB[count+3] = cj(b0[3]); + // blockB[count+4] = cj(b0[4]); + // blockB[count+5] = cj(b0[5]); + // blockB[count+6] = cj(b0[6]); + // blockB[count+7] = cj(b0[7]); + // } + // count += 8; + // } + // // skip what we have after + // if(PanelMode) count += 8 * (stride-offset-depth); + // } + // } + if(nr>=4) { - // skip what we have before - if(PanelMode) count += 4 * offset; + for(Index j2=packet_cols8; j2(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else if (HasHalf && HalfPacketSize==4) { + HalfPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += HalfPacketSize; + } else if (HasQuarter && QuarterPacketSize==4) { + QuarterPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += QuarterPacketSize; + } else { + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); + count += 4; + } + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); + } + } + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols4; j2 pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; @@ -110,7 +110,7 @@ static void run(Index rows, Index cols, Index depth, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} - info[tid].users += threads; + info[tid].users = threads; pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); @@ -148,7 +148,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 for(Index i=0; i template static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program + // to determine the following heuristic. + // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, + // unless it has been specialized by the user or for a given architecture. + // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. + // I'm not sure it is still required. + if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { @@ -441,7 +449,7 @@ struct generic_product_impl template static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); @@ -450,7 +458,7 @@ struct generic_product_impl template static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); @@ -463,11 +471,25 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; + if (dst.cols() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::ColXpr dst_vec(dst.col(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); + } + else if (dst.rows() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::RowXpr dst_vec(dst.row(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); + } + typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs); typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index d68d2f9657..6ba0d9bdb8 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -87,7 +87,7 @@ struct general_matrix_matrix_triangular_product pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; tribb_kernel sybb; @@ -302,13 +302,13 @@ struct general_product_to_triangular_selector template template -TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - + general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - + return derived(); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralMatrixVector.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralMatrixVector.h index a597c1f4ee..dfb6aebced 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/GeneralMatrixVector.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2009 Gael Guennebaud +// Copyright (C) 2008-2016 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -14,11 +14,57 @@ namespace Eigen { namespace internal { +enum GEMVPacketSizeType { + GEMVPacketFull = 0, + GEMVPacketHalf, + GEMVPacketQuarter +}; + +template +struct gemv_packet_cond { typedef T3 type; }; + +template +struct gemv_packet_cond { typedef T1 type; }; + +template +struct gemv_packet_cond { typedef T2 type; }; + +template +class gemv_traits +{ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename gemv_packet_cond::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## name ## Packet + + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); +#undef PACKET_DECL_COND_PREFIX + +public: + enum { + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && + unpacket_traits<_RhsPacket>::vectorizable && + int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size), + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1 + }; + + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; +}; + + /* Optimized col-major matrix * vector product: - * This algorithm processes 4 columns at onces that allows to both reduce - * the number of load/stores of the result by a factor 4 and to reduce - * the instruction dependency. Moreover, we know that all bands have the - * same alignment pattern. + * This algorithm processes the matrix per vertical panels, + * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments. * * Mixing type logic: C += alpha * A * B * | A | B |alpha| comments @@ -27,56 +73,30 @@ namespace internal { * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp * |cplx |real |real | optimal case, vectorization possible via real-cplx mul * - * Accesses to the matrix coefficients follow the following logic: - * - * - if all columns have the same alignment then - * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case) - * - otherwise perform unaligned loads only (-> NoneAligned case) - * - otherwise - * - if even columns have the same alignment then - * // odd columns are guaranteed to have the same alignment too - * - if even or odd columns have the same alignment as the result, then - * // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double) - * - perform half aligned and half unaligned loads (-> EvenAligned case) - * - otherwise perform unaligned loads only (-> NoneAligned case) - * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then - * - one over 4 consecutive columns is guaranteed to be aligned with the result vector, - * perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case) - * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h - * - otherwise, - * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats), - * // we currently fall back to the NoneAligned case - * * The same reasoning apply for the transposed case. - * - * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... - * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment - * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow - * compared to unaligned loads on a 4 byte boundary. - * */ template struct general_matrix_vector_product { + typedef gemv_traits Traits; + typedef gemv_traits HalfTraits; + typedef gemv_traits QuarterTraits; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; -enum { - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable - && int(packet_traits::size)==int(packet_traits::size), - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1 -}; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; -typedef typename packet_traits::type _LhsPacket; -typedef typename packet_traits::type _RhsPacket; -typedef typename packet_traits::type _ResPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; -typedef typename conditional::type LhsPacket; -typedef typename conditional::type RhsPacket; -typedef typename conditional::type ResPacket; + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -85,244 +105,187 @@ EIGEN_DONT_INLINE static void run( }; template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& lhs, + const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { EIGEN_UNUSED_VARIABLE(resIncr); eigen_internal_assert(resIncr==1); - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ - pstore(&res[j], \ - padd(pload(&res[j]), \ - padd( \ - padd(pcj.pmul(lhs0.template load(j), ptmp0), \ - pcj.pmul(lhs1.template load(j), ptmp1)), \ - padd(pcj.pmul(lhs2.template load(j), ptmp2), \ - pcj.pmul(lhs3.template load(j), ptmp3)) ))) - typedef typename LhsMapper::VectorMapper LhsScalars; + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate propoer code. + LhsMapper lhs(alhs); conj_helper cj; conj_helper pcj; - if(ConjugateRhs) - alpha = numext::conj(alpha); - - enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; - const Index columnsAtOnce = 4; - const Index peels = 2; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index ResPacketAlignedMask = ResPacketSize-1; -// const Index PeelAlignedMask = ResPacketSize*peels-1; - const Index size = rows; + conj_helper pcj_half; + conj_helper pcj_quarter; const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + ResPacketSizeHalf = HalfTraits::ResPacketSize, + ResPacketSizeQuarter = QuarterTraits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, + HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf + }; - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type. - Index alignedStart = internal::first_default_aligned(res,size); - Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; + const Index n8 = rows-8*ResPacketSize+1; + const Index n4 = rows-4*ResPacketSize+1; + const Index n3 = rows-3*ResPacketSize+1; + const Index n2 = rows-2*ResPacketSize+1; + const Index n1 = rows-1*ResPacketSize+1; + const Index n_half = rows-1*ResPacketSizeHalf+1; + const Index n_quarter = rows-1*ResPacketSizeQuarter+1; - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + // TODO: improve the following heuristic: + const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4); + ResPacket palpha = pset1(alpha); + ResPacketHalf palpha_half = pset1(alpha); + ResPacketQuarter palpha_quarter = pset1(alpha); - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(size); - - // find how many columns do we have to skip to be aligned with the result (if possible) - Index skipColumns = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) ) + for(Index j2=0; j2 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. - // Currently, it seems to be better to perform unaligned loads anyway - alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) - { - // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)), + c4 = pset1(ResScalar(0)), + c5 = pset1(ResScalar(0)), + c6 = pset1(ResScalar(0)), + c7 = pset1(ResScalar(0)); - /* eigen_internal_assert( (alignmentPattern==NoneAligned) - || (skipColumns + columnsAtOnce >= cols) - || LhsPacketSize > size - || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ - } - else if(Vectorizable) - { - alignedStart = 0; - alignedSize = size; - alignmentPattern = AllAligned; - } - - const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1; - const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3; - - Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; - for (Index i=skipColumns; i(alpha*rhs(i, 0)), - ptmp1 = pset1(alpha*rhs(i+offset1, 0)), - ptmp2 = pset1(alpha*rhs(i+2, 0)), - ptmp3 = pset1(alpha*rhs(i+offset3, 0)); - - // this helps a lot generating better binary code - const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), - lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); - - if (Vectorizable) - { - /* explicit vectorization */ - // process initial unaligned coeffs - for (Index j=0; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+LhsPacketSize*3,j),b0,c3); + c4 = pcj.pmadd(lhs.template load(i+LhsPacketSize*4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load(i+LhsPacketSize*5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load(i+LhsPacketSize*6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load(i+LhsPacketSize*7,j),b0,c7); } - - if (alignedSize>alignedStart) - { - switch(alignmentPattern) - { - case AllAligned: - for (Index j = alignedStart; j1) - { - LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; - ResPacket T0, T1; - - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - A00 = lhs0.template load(j); - A10 = lhs0.template load(j+LhsPacketSize); - T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); - T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); - - T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - T0 = pcj.pmadd(A03, ptmp3, T0); - pstore(&res[j],T0); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - T1 = pcj.pmadd(A11, ptmp1, T1); - T1 = pcj.pmadd(A12, ptmp2, T1); - T1 = pcj.pmadd(A13, ptmp3, T1); - pstore(&res[j+ResPacketSize],T1); - } - } - for (; j(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); + pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu(res+i+ResPacketSize*3))); + pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu(res+i+ResPacketSize*4))); + pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu(res+i+ResPacketSize*5))); + pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu(res+i+ResPacketSize*6))); + pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu(res+i+ResPacketSize*7))); + } + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)); + + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+LhsPacketSize*3,j),b0,c3); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); + pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu(res+i+ResPacketSize*3))); + + i+=ResPacketSize*4; + } + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)); + + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); + + i+=ResPacketSize*3; + } + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)); + + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + i+=ResPacketSize*2; + } + if(i(ResScalar(0)); + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + i+=ResPacketSize; + } + if(HasHalf && i(ResScalar(0)); + for(Index j=j2; j(rhs(j,0)); + c0 = pcj_half.pmadd(lhs.template load(i+0,j),b0,c0); + } + pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu(res+i+ResPacketSizeHalf*0))); + i+=ResPacketSizeHalf; + } + if(HasQuarter && i(ResScalar(0)); + for(Index j=j2; j(rhs(j,0)); + c0 = pcj_quarter.pmadd(lhs.template load(i+0,j),b0,c0); + } + pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu(res+i+ResPacketSizeQuarter*0))); + i+=ResPacketSizeQuarter; + } + for(;i(alpha*rhs(k, 0)); - const LhsScalars lhs0 = lhs.getVectorMapper(0, k); - - if (Vectorizable) - { - /* explicit vectorization */ - // process first unaligned result's coeffs - for (Index j=0; j(alignedStart)) - for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); - else - for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); - } - - // process remaining scalars (or all if no explicit vectorization) - for (Index i=alignedSize; i struct general_matrix_vector_product { -typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + typedef gemv_traits Traits; + typedef gemv_traits HalfTraits; + typedef gemv_traits QuarterTraits; -enum { - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable - && int(packet_traits::size)==int(packet_traits::size), - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1 -}; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; -typedef typename packet_traits::type _LhsPacket; -typedef typename packet_traits::type _RhsPacket; -typedef typename packet_traits::type _ResPacket; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; -typedef typename conditional::type LhsPacket; -typedef typename conditional::type RhsPacket; -typedef typename conditional::type ResPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; -EIGEN_DONT_INLINE static void run( + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -361,255 +324,191 @@ EIGEN_DONT_INLINE static void run( }; template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& lhs, + const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate propoer code. + LhsMapper lhs(alhs); + eigen_internal_assert(rhs.stride()==1); - - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ - RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ - ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ - ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ - ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } - conj_helper cj; conj_helper pcj; + conj_helper pcj_half; + conj_helper pcj_quarter; - typedef typename LhsMapper::VectorMapper LhsScalars; + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. + const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; + const Index n4 = rows-3; + const Index n2 = rows-1; - enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; - const Index rowsAtOnce = 4; - const Index peels = 2; - const Index RhsPacketAlignedMask = RhsPacketSize-1; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index depth = cols; - const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + ResPacketSizeHalf = HalfTraits::ResPacketSize, + ResPacketSizeQuarter = QuarterTraits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + LhsPacketSizeHalf = HalfTraits::LhsPacketSize, + LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize, + HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, + HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf + }; - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type - // if that's not the case then vectorization is discarded, see below. - Index alignedStart = rhs.firstAligned(depth); - Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; - - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(depth); - const Index rhsAlignmentOffset = rhs.firstAligned(rows); - - // find how many rows do we have to skip to be aligned with rhs (if possible) - Index skipRows = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || - (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || - (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) + Index i=0; + for(; i 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. - alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) - { - // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)), + c4 = pset1(ResScalar(0)), + c5 = pset1(ResScalar(0)), + c6 = pset1(ResScalar(0)), + c7 = pset1(ResScalar(0)); - while (skipRows(j,0); + + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); + c4 = pcj.pmadd(lhs.template load(i+4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load(i+5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load(i+6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load(i+7,j),b0,c7); } - else + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + ResScalar cc4 = predux(c4); + ResScalar cc5 = predux(c5); + ResScalar cc6 = predux(c6); + ResScalar cc7 = predux(c7); + for(; j= rows) - || LhsPacketSize > depth - || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ + res[(i+0)*resIncr] += alpha*cc0; + res[(i+1)*resIncr] += alpha*cc1; + res[(i+2)*resIncr] += alpha*cc2; + res[(i+3)*resIncr] += alpha*cc3; + res[(i+4)*resIncr] += alpha*cc4; + res[(i+5)*resIncr] += alpha*cc5; + res[(i+6)*resIncr] += alpha*cc6; + res[(i+7)*resIncr] += alpha*cc7; } - else if(Vectorizable) + for(; i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)); - const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1; - const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3; - - Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; - for (Index i=skipRows; i(ResScalar(0)), ptmp1 = pset1(ResScalar(0)), - ptmp2 = pset1(ResScalar(0)), ptmp3 = pset1(ResScalar(0)); + RhsPacket b0 = rhs.template load(j,0); - // process initial unaligned coeffs - // FIXME this loop get vectorized by the compiler ! - for (Index j=0; j(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + for(; jalignedStart) - { - switch(alignmentPattern) + cc0 += cj.pmul(lhs(i+0,j), b0); + cc1 += cj.pmul(lhs(i+1,j), b0); + cc2 += cj.pmul(lhs(i+2,j), b0); + cc3 += cj.pmul(lhs(i+3,j), b0); + } + res[(i+0)*resIncr] += alpha*cc0; + res[(i+1)*resIncr] += alpha*cc1; + res[(i+2)*resIncr] += alpha*cc2; + res[(i+3)*resIncr] += alpha*cc3; + } + for(; i(ResScalar(0)), + c1 = pset1(ResScalar(0)); + + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + { + RhsPacket b0 = rhs.template load(j,0); + + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + for(; j(ResScalar(0)); + ResPacketHalf c0_h = pset1(ResScalar(0)); + ResPacketQuarter c0_q = pset1(ResScalar(0)); + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + { + RhsPacket b0 = rhs.template load(j,0); + c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); + } + ResScalar cc0 = predux(c0); + if (HasHalf) { + for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) { - case AllAligned: - for (Index j = alignedStart; j1) - { - /* Here we proccess 4 rows with with two peeled iterations to hide - * the overhead of unaligned loads. Moreover unaligned loads are handled - * using special shift/move operations between the two aligned packets - * overlaping the desired unaligned packet. This is *much* more efficient - * than basic unaligned loads. - */ - LhsPacket A01, A02, A03, A11, A12, A13; - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(0); - A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); - ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - - b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); - ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); - ptmp1 = pcj.pmadd(A11, b, ptmp1); - ptmp2 = pcj.pmadd(A12, b, ptmp2); - ptmp3 = pcj.pmadd(A13, b, ptmp3); - } - } - for (; j(j,0); + c0_h = pcj_half.pmadd(lhs.template load(i,j),b0,c0_h); } - tmp0 += predux(ptmp0); - tmp1 += predux(ptmp1); - tmp2 += predux(ptmp2); - tmp3 += predux(ptmp3); - } - } // end explicit vectorization - - // process remaining coeffs (or all if no explicit vectorization) - // FIXME this loop get vectorized by the compiler ! - for (Index j=alignedSize; j(j,0); + c0_q = pcj_quarter.pmadd(lhs.template load(i,j),b0,c0_q); + } + cc0 += predux(c0_q); + } + for(; j(tmp0); - const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); - // process first unaligned result's coeffs - // FIXME this loop get vectorized by the compiler ! - for (Index j=0; jalignedStart) - { - // process aligned rhs coeffs - if (lhs0.template aligned(alignedStart)) - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - else - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - tmp0 += predux(ptmp0); - } - - // process remaining scalars - // FIXME this loop get vectorized by the compiler ! - for (Index j=alignedSize; j +#endif + namespace Eigen { namespace internal { @@ -18,7 +22,7 @@ namespace internal { inline void manage_multi_threading(Action action, int* v) { static int m_maxThreads = -1; - EIGEN_UNUSED_VARIABLE(m_maxThreads); + EIGEN_UNUSED_VARIABLE(m_maxThreads) if(action==SetAction) { @@ -76,8 +80,17 @@ template struct GemmParallelInfo { GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} + // volatile is not enough on all architectures (see bug 1572) + // to guarantee that when thread A says to thread B that it is + // done with packing a block, then all writes have been really + // carried out... C++11 memory model+atomic guarantees this. +#if EIGEN_HAS_CXX11_ATOMIC + std::atomic sync; + std::atomic users; +#else Index volatile sync; int volatile users; +#endif Index lhs_start; Index lhs_length; @@ -88,11 +101,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, { // TODO when EIGEN_USE_BLAS is defined, // we should still enable OMP for other scalar types -#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS) + // Without C++11, we have to disable GEMM's parallelization on + // non x86 architectures because there volatile is not enough for our purpose. + // See bug 1572. +#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64)) // FIXME the transpose variable is only needed to properly split // the matrix product when multithreading is enabled. This is a temporary // fix to support row-major destination matrices. This whole - // parallelizer mechanism has to be redisigned anyway. + // parallelizer mechanism has to be redesigned anyway. EIGEN_UNUSED_VARIABLE(depth); EIGEN_UNUSED_VARIABLE(transpose); func(0,rows, 0,cols); @@ -113,12 +129,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, double work = static_cast(rows) * static_cast(cols) * static_cast(depth); double kMinTaskSize = 50000; // FIXME improve this heuristic. - pb_max_threads = std::max(1, std::min(pb_max_threads, work / kMinTaskSize)); + pb_max_threads = std::max(1, std::min(pb_max_threads, static_cast( work / kMinTaskSize ) )); // compute the number of threads we are going to use Index threads = std::min(nbThreads(), pb_max_threads); - // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session, + // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session, // then abort multi-threading // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) @@ -151,10 +167,8 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, info[i].lhs_start = r0; info[i].lhs_length = actualBlockRows; - if(transpose) - func(c0, actualBlockCols, 0, rows, info); - else - func(0, rows, c0, actualBlockCols, info); + if(transpose) func(c0, actualBlockCols, 0, rows, info); + else func(0, rows, c0, actualBlockCols, info); } #endif } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 04c933480b..33ecf10f61 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -45,14 +45,23 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::type>::half HalfPacket; + typedef typename unpacket_traits::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i(blockA, lhs, cols, i, count); + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i(blockA, lhs, cols, i, count); + // do the same with mr==1 - for(Index i=peeled_mc1; i gebp_kernel; symm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -442,7 +459,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2 -EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::run( +EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void selfadjoint_matrix_vector_product::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, @@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product enum { LhsUpLo = LhsMode&(Upper|Lower) }; template - static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { typedef typename Dest::Scalar ResScalar; typedef typename Rhs::Scalar RhsScalar; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointProduct.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointProduct.h index ef12c98f6c..a21be80504 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointProduct.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointProduct.h @@ -111,7 +111,7 @@ struct selfadjoint_product_selector Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo> ::run(size, depth, - &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), + actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(), mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; @@ -120,7 +120,7 @@ struct selfadjoint_product_selector template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const Scalar& alpha) { selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointRank2Update.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointRank2Update.h index 2ae3641111..f752a0bf09 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector; template struct selfadjoint_rank2_update_selector { - static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { const Index size = u.size(); for (Index i=0; i struct conj_expr_if template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const Scalar& alpha) { typedef internal::blas_traits UBlasTraits; @@ -79,8 +80,8 @@ SelfAdjointView& SelfAdjointView if (IsRowMajor) actualAlpha = numext::conj(actualAlpha); - typedef typename internal::remove_all::type>::type UType; - typedef typename internal::remove_all::type>::type VType; + typedef typename internal::remove_all::type>::type UType; + typedef typename internal::remove_all::type>::type VType; internal::selfadjoint_rank2_update_selector ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/TriangularMatrixMatrix.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/TriangularMatrixMatrix.h index 2fb408d1d7..f0c60507ab 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -155,7 +155,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; @@ -226,7 +226,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, @@ -305,7 +305,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/TriangularSolverMatrix.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/TriangularSolverMatrix.h index e3ed2cd19e..6d879ba00f 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache @@ -136,7 +136,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix gebp_kernel; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k20) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - if(!(Mode & UnitDiag)) + if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) rhs[i] /= cjLhs(i,i); } } @@ -114,20 +114,23 @@ struct triangular_solve_vector0) - Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } } Index r = IsLower ? size - endBlock : startBlock; // remaining size if (r > 0) { // let's directly call the low level product function because: // 1 - it is faster to compile - // 2 - it is slighlty faster at runtime + // 2 - it is slightly faster at runtime general_matrix_vector_product::run( r, actualPanelWidth, LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/BlasUtil.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/BlasUtil.h index 3dff9bc9b3..e16a564980 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/BlasUtil.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/BlasUtil.h @@ -24,7 +24,7 @@ struct gebp_kernel; template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -39,90 +39,6 @@ template struct general_matrix_vector_product; - -template struct conj_if; - -template<> struct conj_if { - template - inline T operator()(const T& x) const { return numext::conj(x); } - template - inline T pconj(const T& x) const { return internal::pconj(x); } -}; - -template<> struct conj_if { - template - inline const T& operator()(const T& x) const { return x; } - template - inline const T& pconj(const T& x) const { return x; } -}; - -// Generic implementation for custom complex types. -template -struct conj_helper -{ - typedef typename ScalarBinaryOpTraits::ReturnType Scalar; - - EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const - { return conj_if()(x) * conj_if()(y); } -}; - -template struct conj_helper -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); } -}; - -template struct conj_helper, std::complex, false,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); } -}; - -template struct conj_helper, std::complex, true,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, std::complex, true,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, RealScalar, Conj,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return conj_if()(x)*y; } -}; - -template struct conj_helper, false,Conj> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x*conj_if()(y); } -}; - template struct get_factor { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); } }; @@ -159,11 +75,9 @@ template class BlasLinearMapper; template -class BlasLinearMapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - +class BlasLinearMapper +{ +public: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1) : m_data(data) { @@ -179,19 +93,17 @@ class BlasLinearMapper { return m_data[i]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return ploadt(m_data + i); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { + return ploadt(m_data + i); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return ploadt(m_data + i); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { + pstoret(m_data + i, p); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { - pstoret(m_data + i, p); - } - - protected: +protected: Scalar *m_data; }; @@ -199,13 +111,59 @@ class BlasLinearMapper { template class blas_data_mapper; +// TMP to help PacketBlock store implementation. +// There's currently no known use case for PacketBlock load. +// The default implementation assumes ColMajor order. +// It always store each packet sequentially one `stride` apart. +template +struct PacketBlockManagement +{ + PacketBlockManagement pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + pbm.store(to, stride, i, j, block); + pstoreu(to + i + (j + idx)*stride, block.packet[idx]); + } +}; + +// PacketBlockManagement specialization to take care of RowMajor order without ifs. +template +struct PacketBlockManagement +{ + PacketBlockManagement pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + pbm.store(to, stride, i, j, block); + pstoreu(to + j + (i + idx)*stride, block.packet[idx]); + } +}; + +template +struct PacketBlockManagement +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + +template +struct PacketBlockManagement +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + template class blas_data_mapper { public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; @@ -235,12 +193,14 @@ public: return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { + return ploadt(&operator()(i, j)); } template @@ -263,7 +223,12 @@ public: return internal::first_default_aligned(m_data, size); } - protected: + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock &block) const { + PacketBlockManagement pbm; + pbm.store(m_data, m_stride, i, j, block); + } +protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; }; @@ -275,9 +240,6 @@ template class BlasLinearMapper { public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { @@ -288,8 +250,9 @@ public: return m_data[i*m_incr.value()]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return pgather(m_data + i*m_incr.value(), m_incr.value()); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { + return pgather(m_data + i*m_incr.value(), m_incr.value()); } template @@ -306,9 +269,6 @@ template::type Packet; - typedef typename packet_traits::half HalfPacket; - typedef BlasLinearMapper LinearMapper; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {} @@ -327,8 +287,9 @@ public: return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return pgather(&operator()(i, j),m_incr.value()); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); } template @@ -346,6 +307,77 @@ public: return pgather(&operator()(i, j), m_stride); } + // storePacketBlock_helper defines a way to access values inside the PacketBlock, this is essentially required by the Complex types. + template + struct storePacketBlock_helper + { + storePacketBlock_helper spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup, i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + ScalarT *v = &sup->operator()(i+l, j+idx); + *v = block.packet[idx][l]; + } + } + }; + + template + struct storePacketBlock_helper, n, idx> + { + storePacketBlock_helper, n, idx-1> spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup,i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + std::complex *v = &sup->operator()(i+l, j+idx); + v->real(block.packet[idx].v[2*l+0]); + v->imag(block.packet[idx].v[2*l+1]); + } + } + }; + + template + struct storePacketBlock_helper, n, idx> + { + storePacketBlock_helper, n, idx-1> spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup,i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + std::complex *v = &sup->operator()(i+l, j+idx); + v->real(block.packet[idx].v[2*l+0]); + v->imag(block.packet[idx].v[2*l+1]); + } + } + }; + + template + struct storePacketBlock_helper + { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + + template + struct storePacketBlock_helper, n, -1> + { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + + template + struct storePacketBlock_helper, n, -1> + { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + // This function stores a PacketBlock on m_data, this approach is really quite slow compare to Incr=1 and should be avoided when possible. + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock&block) const { + storePacketBlock_helper spb; + spb.store(this, i,j,block); + } protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; @@ -379,14 +411,15 @@ template struct blas_traits HasUsableDirectAccess = ( (int(XprType::Flags)&DirectAccessBit) && ( bool(XprType::IsVectorAtCompileTime) || int(inner_stride_at_compile_time::ret) == 1) - ) ? 1 : 0 + ) ? 1 : 0, + HasScalarFactor = false }; typedef typename conditional::type DirectLinearAccessType; - static inline ExtractType extract(const XprType& x) { return x; } - static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; } + static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -411,17 +444,23 @@ template struct blas_traits, const CwiseNullaryOp,Plain>, NestedXpr> > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline Scalar extractScalarFactor(const XprType& x) + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template struct blas_traits, NestedXpr, const CwiseNullaryOp,Plain> > > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseBinaryOp, NestedXpr, const CwiseNullaryOp,Plain> > XprType; typedef typename Base::ExtractType ExtractType; @@ -440,6 +479,9 @@ template struct blas_traits, NestedXpr> > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseUnaryOp, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; @@ -476,7 +518,7 @@ struct blas_traits template::HasUsableDirectAccess> struct extract_data_selector { - static const typename T::Scalar* run(const T& m) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) { return blas_traits::extract(m).data(); } @@ -487,11 +529,53 @@ struct extract_data_selector { static typename T::Scalar* run(const T&) { return 0; } }; -template const typename T::Scalar* extract_data(const T& m) +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) { return extract_data_selector::run(m); } +/** + * \c combine_scalar_factors extracts and multiplies factors from GEMM and GEMV products. + * There is a specialization for booleans + */ +template +struct combine_scalar_factors_impl +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs) + { + return blas_traits::extractScalarFactor(lhs) * blas_traits::extractScalarFactor(rhs); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs) + { + return alpha * blas_traits::extractScalarFactor(lhs) * blas_traits::extractScalarFactor(rhs); + } +}; +template +struct combine_scalar_factors_impl +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs) + { + return blas_traits::extractScalarFactor(lhs) && blas_traits::extractScalarFactor(rhs); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs) + { + return alpha && blas_traits::extractScalarFactor(lhs) && blas_traits::extractScalarFactor(rhs); + } +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs) +{ + return combine_scalar_factors_impl::run(alpha, lhs, rhs); +} +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs) +{ + return combine_scalar_factors_impl::run(lhs, rhs); +} + + } // end namespace internal } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ConfigureVectorization.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ConfigureVectorization.h new file mode 100644 index 0000000000..af4e696238 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ConfigureVectorization.h @@ -0,0 +1,512 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2018 Gael Guennebaud +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CONFIGURE_VECTORIZATION_H +#define EIGEN_CONFIGURE_VECTORIZATION_H + +//------------------------------------------------------------------------------------------ +// Static and dynamic alignment control +// +// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES +// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. +// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, +// a default value is automatically computed based on architecture, compiler, and OS. +// +// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} +// to be used to declare statically aligned buffers. +//------------------------------------------------------------------------------------------ + + +/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. + * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, + * so that vectorization doesn't affect binary compatibility. + * + * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link + * vectorized and non-vectorized code. + * + * FIXME: this code can be cleaned up once we switch to proper C++11 only. + */ +#if (defined EIGEN_CUDACC) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_HAS_ALIGNAS + #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n) + #define EIGEN_ALIGNOF(x) alignof(x) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_MSVC + #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_SUNCC + // FIXME not sure about this one: + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) + #define EIGEN_ALIGNOF(x) __alignof(x) +#else + #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler +#endif + +// If the user explicitly disable vectorization, then we also disable alignment +#if defined(EIGEN_DONT_VECTORIZE) + #if defined(EIGEN_GPUCC) + // GPU code is always vectorized and requires memory alignment for + // statically allocated buffers. + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 + #else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 + #endif +#elif defined(__AVX512F__) + // 64 bytes static alignment is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 +#elif defined(__AVX__) + // 32 bytes static alignment is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 +#else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 +#endif + + +// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense +#define EIGEN_MIN_ALIGN_BYTES 16 + +// Defined the boundary (in bytes) on which the data needs to be aligned. Note +// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be +// aligned at all regardless of the value of this #define. + +#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. +#endif + +// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated +// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 +#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) + #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES + #undef EIGEN_MAX_STATIC_ALIGN_BYTES + #endif + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 +#endif + +#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES + + // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES + + // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable + // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always + // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in + // certain common platform (compiler+architecture combinations) to avoid these problems. + // Only static alignment is really problematic (relies on nonstandard compiler extensions), + // try to keep heap alignment even when we have to disable static alignment. + #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) + // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. + // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. + // 4.8 and newer seem definitely unaffected. + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #else + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 + #endif + + // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX + #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ + && !EIGEN_GCC3_OR_OLDER \ + && !EIGEN_COMP_SUNCC \ + && !EIGEN_OS_QNX + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 + #else + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 + #endif + + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT + #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES + #else + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 + #endif + +#endif + +// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES +#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. +// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES) +// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). +// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. + + +// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY +#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) +#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) +#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) +#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) +#else +#define EIGEN_ALIGN_MAX +#endif + + +// Dynamic alignment control + +#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 +#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. +#endif + +#ifdef EIGEN_DONT_ALIGN + #ifdef EIGEN_MAX_ALIGN_BYTES + #undef EIGEN_MAX_ALIGN_BYTES + #endif + #define EIGEN_MAX_ALIGN_BYTES 0 +#elif !defined(EIGEN_MAX_ALIGN_BYTES) + #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#endif + +#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#else +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES +#endif + + +#ifndef EIGEN_UNALIGNED_VECTORIZE +#define EIGEN_UNALIGNED_VECTORIZE 1 +#endif + +//---------------------------------------------------------------------- + +// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into +// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks +#if EIGEN_MAX_ALIGN_BYTES==0 + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif + + +// The following (except #include and _M_IX86_FP ??) can likely be +// removed as gcc 4.1 and msvc 2008 are not supported anyways. +#if EIGEN_COMP_MSVC + #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled + #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later + // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. + #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 + #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER + #endif + #endif +#else + #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) + #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC + #endif +#endif + +#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC)) + + #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) + + // Defines symbols for compile-time detection of which instructions are + // used. + // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_SSE + #define EIGEN_VECTORIZE_SSE2 + + // Detect sse3/ssse3/sse4: + // gcc and icc defines __SSE3__, ... + // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you + // want to force the use of those instructions with msvc. + #ifdef __SSE3__ + #define EIGEN_VECTORIZE_SSE3 + #endif + #ifdef __SSSE3__ + #define EIGEN_VECTORIZE_SSSE3 + #endif + #ifdef __SSE4_1__ + #define EIGEN_VECTORIZE_SSE4_1 + #endif + #ifdef __SSE4_2__ + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX__ + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX2__ + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__)) + // MSVC does not expose a switch dedicated for FMA + // For MSVC, AVX2 => FMA + #define EIGEN_VECTORIZE_FMA + #endif + #if defined(__AVX512F__) + #ifndef EIGEN_VECTORIZE_FMA + #if EIGEN_COMP_GNUC + #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638). + #else + #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). + #endif + #endif + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX512 + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_FMA + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #ifndef EIGEN_USE_SYCL + #ifdef __AVX512DQ__ + #define EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif + #ifdef __AVX512BF16__ + #define EIGEN_VECTORIZE_AVX512BF16 + #endif + #endif + #endif + + // Disable AVX support on broken xcode versions + #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 ) + // A nasty bug in the clang compiler shipped with xcode in a common compilation situation + // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1 + #ifdef EIGEN_VECTORIZE_AVX + #undef EIGEN_VECTORIZE_AVX + #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. " + #ifdef EIGEN_VECTORIZE_AVX2 + #undef EIGEN_VECTORIZE_AVX2 + #endif + #ifdef EIGEN_VECTORIZE_FMA + #undef EIGEN_VECTORIZE_FMA + #endif + #ifdef EIGEN_VECTORIZE_AVX512 + #undef EIGEN_VECTORIZE_AVX512 + #endif + #ifdef EIGEN_VECTORIZE_AVX512DQ + #undef EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef EIGEN_VECTORIZE_AVX512ER + #undef EIGEN_VECTORIZE_AVX512ER + #endif + #endif + // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with -macosx-version-min=10.15 and AVX + // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests + // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases + // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)" XCode 11.0 <- Produces many segfault and core dumping tests + // with -macosx-version-min=10.15 and AVX + // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with + // -macosx-version-min=10.15 and AVX + #endif + + // include files + + // This extern "C" works around a MINGW-w64 compilation issue + // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 + // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). + // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations + // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; + // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. + // notice that since these are C headers, the extern "C" is theoretically needed anyways. + extern "C" { + // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. + // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: + #if EIGEN_COMP_ICC >= 1110 + #include + #else + #include + #include + #include + #ifdef EIGEN_VECTORIZE_SSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_1 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_2 + #include + #endif + #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) + #include + #endif + #endif + } // end extern "C" + + #elif defined __VSX__ + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_VSX + #include + // We need to #undef all these ugly tokens defined in + // => use __vector instead of vector + #undef bool + #undef vector + #undef pixel + + #elif defined __ALTIVEC__ + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ALTIVEC + #include + // We need to #undef all these ugly tokens defined in + // => use __vector instead of vector + #undef bool + #undef vector + #undef pixel + + #elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE) + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_NEON + #include + + // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and + // will not select the backend automatically + #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE) + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_SVE + #include + + // Since we depend on knowing SVE vector lengths at compile-time, we need + // to ensure a fixed lengths is set + #if defined __ARM_FEATURE_SVE_BITS + #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS + #else +#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set." +#endif + +#elif (defined __s390x__ && defined __VEC__) + +#define EIGEN_VECTORIZE +#define EIGEN_VECTORIZE_ZVECTOR +#include + +#elif defined __mips_msa + +// Limit MSA optimizations to little-endian CPUs for now. +// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs? +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#if defined(__LP64__) +#define EIGEN_MIPS_64 +#else +#define EIGEN_MIPS_32 +#endif +#define EIGEN_VECTORIZE +#define EIGEN_VECTORIZE_MSA +#include +#endif + +#endif +#endif + +// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all +// compilers seem to follow this. We therefore include it explicitly. +// See also: https://bugs.llvm.org/show_bug.cgi?id=47955 +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + #include +#endif + +#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) + // We can use the optimized fp16 to float and float to fp16 conversion routines + #define EIGEN_HAS_FP16_C + + #if defined(EIGEN_COMP_CLANG) + // Workaround for clang: The FP16C intrinsics for clang are included by + // immintrin.h, as opposed to emmintrin.h as suggested by Intel: + // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 + #include + #endif +#endif + +#if defined EIGEN_CUDACC + #define EIGEN_VECTORIZE_GPU + #include + #if EIGEN_CUDA_SDK_VER >= 70500 + #define EIGEN_HAS_CUDA_FP16 + #endif +#endif + +#if defined(EIGEN_HAS_CUDA_FP16) + #include + #include +#endif + +#if defined(EIGEN_HIPCC) + #define EIGEN_VECTORIZE_GPU + #include + #define EIGEN_HAS_HIP_FP16 + #include +#endif + + +/** \brief Namespace containing all symbols from the %Eigen library. */ +namespace Eigen { + +inline static const char *SimdInstructionSetsInUse(void) { +#if defined(EIGEN_VECTORIZE_AVX512) + return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_AVX) + return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_1) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; +#elif defined(EIGEN_VECTORIZE_SSSE3) + return "SSE, SSE2, SSE3, SSSE3"; +#elif defined(EIGEN_VECTORIZE_SSE3) + return "SSE, SSE2, SSE3"; +#elif defined(EIGEN_VECTORIZE_SSE2) + return "SSE, SSE2"; +#elif defined(EIGEN_VECTORIZE_ALTIVEC) + return "AltiVec"; +#elif defined(EIGEN_VECTORIZE_VSX) + return "VSX"; +#elif defined(EIGEN_VECTORIZE_NEON) + return "ARM NEON"; +#elif defined(EIGEN_VECTORIZE_SVE) + return "ARM SVE"; +#elif defined(EIGEN_VECTORIZE_ZVECTOR) + return "S390X ZVECTOR"; +#elif defined(EIGEN_VECTORIZE_MSA) + return "MIPS MSA"; +#else + return "None"; +#endif +} + +} // end namespace Eigen + + +#endif // EIGEN_CONFIGURE_VECTORIZATION_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Constants.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Constants.h index 7587d68424..35dcaa7b38 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Constants.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Constants.h @@ -3,6 +3,7 @@ // // Copyright (C) 2008-2015 Gael Guennebaud // Copyright (C) 2007-2009 Benoit Jacob +// Copyright (C) 2020, Arm Limited and Contributors // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -25,6 +26,10 @@ const int Dynamic = -1; */ const int DynamicIndex = 0xffffff; +/** This value means that the increment to go from one value to another in a sequence is not constant for each step. + */ +const int UndefinedIncr = 0xfffffe; + /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm(). * The value Infinity there means the L-infinity norm. */ @@ -152,7 +157,7 @@ const unsigned int DirectAccessBit = 0x40; /** \deprecated \ingroup flags * * means the first coefficient packet is guaranteed to be aligned. - * An expression cannot has the AlignedBit without the PacketAccessBit flag. + * An expression cannot have the AlignedBit without the PacketAccessBit flag. * In other words, this means we are allow to perform an aligned packet access to the first element regardless * of the expression kind: * \code @@ -250,12 +255,6 @@ enum AlignmentType { #endif }; -/** \ingroup enums - * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */ -// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox -// TODO: find out what to do with that. Adapt the AlignedBox API ? -enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight }; - /** \ingroup enums * Enum containing possible values for the \p Direction parameter of * Reverse, PartialReduxExpr and VectorwiseOp. */ @@ -330,9 +329,20 @@ enum StorageOptions { * Enum for specifying whether to apply or solve on the left or right. */ enum SideType { /** Apply transformation on the left. */ - OnTheLeft = 1, + OnTheLeft = 1, /** Apply transformation on the right. */ - OnTheRight = 2 + OnTheRight = 2 +}; + +/** \ingroup enums + * Enum for specifying NaN-propagation behavior, e.g. for coeff-wise min/max. */ +enum NaNPropagationOptions { + /** Implementation defined behavior if NaNs are present. */ + PropagateFast = 0, + /** Always propagate NaNs. */ + PropagateNaN, + /** Always propagate not-NaNs. */ + PropagateNumbers }; /* the following used to be written as: @@ -464,6 +474,8 @@ namespace Architecture AltiVec = 0x2, VSX = 0x3, NEON = 0x4, + MSA = 0x5, + SVE = 0x6, #if defined EIGEN_VECTORIZE_SSE Target = SSE #elif defined EIGEN_VECTORIZE_ALTIVEC @@ -472,6 +484,10 @@ namespace Architecture Target = VSX #elif defined EIGEN_VECTORIZE_NEON Target = NEON +#elif defined EIGEN_VECTORIZE_SVE + Target = SVE +#elif defined EIGEN_VECTORIZE_MSA + Target = MSA #else Target = Generic #endif diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/DisableStupidWarnings.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/DisableStupidWarnings.h index 5fe86fa0d6..3bec07233f 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/DisableStupidWarnings.h @@ -4,7 +4,6 @@ #ifdef _MSC_VER // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable - // 4127 - conditional expression is constant // 4181 - qualifier applied to reference type ignored // 4211 - nonstandard extension used : redefined extern to static // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data @@ -20,7 +19,7 @@ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) + #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) @@ -42,6 +41,17 @@ #pragma clang diagnostic push #endif #pragma clang diagnostic ignored "-Wconstant-logical-operand" + #if __clang_major__ >= 3 && __clang_minor__ >= 5 + #pragma clang diagnostic ignored "-Wabsolute-value" + #endif + #if __clang_major__ >= 10 + #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" + #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif #elif defined __GNUC__ @@ -68,6 +78,7 @@ #endif #if defined __NVCC__ + #pragma diag_suppress boolean_controlling_expr_is_constant // Disable the "statement is unreachable" message #pragma diag_suppress code_is_unreachable // Disable the "dynamic initialization in unreachable code" message @@ -85,6 +96,7 @@ #pragma diag_suppress 2671 #pragma diag_suppress 2735 #pragma diag_suppress 2737 + #pragma diag_suppress 2739 #endif #else diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ForwardDeclarations.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ForwardDeclarations.h index 134544f964..2f9cc44917 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ForwardDeclarations.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ForwardDeclarations.h @@ -79,6 +79,8 @@ template class ForceAlignedAccess; template class SwapWrapper; template class Block; +template class IndexedView; +template class Reshaped; template class VectorBlock; template class Transpose; @@ -108,7 +110,7 @@ template class TranspositionsWrapper; template::has_write_access ? WriteAccessors : ReadOnlyAccessors > class MapBase; -template class Stride; +template class Stride; template class InnerStride; template class OuterStride; template > class Map; @@ -129,6 +131,10 @@ template class SolverBase; template class InnerIterator; namespace internal { +template class generic_randaccess_stl_iterator; +template class pointer_based_stl_iterator; +template class subvector_stl_iterator; +template class subvector_stl_reverse_iterator; template struct kernel_retval_base; template struct kernel_retval; template struct image_retval_base; @@ -174,14 +180,15 @@ template struct scalar_sum_op; template struct scalar_difference_op; template struct scalar_conj_product_op; -template struct scalar_min_op; -template struct scalar_max_op; +template struct scalar_min_op; +template struct scalar_max_op; template struct scalar_opposite_op; template struct scalar_conjugate_op; template struct scalar_real_op; template struct scalar_imag_op; template struct scalar_abs_op; template struct scalar_abs2_op; +template struct scalar_absolute_difference_op; template struct scalar_sqrt_op; template struct scalar_rsqrt_op; template struct scalar_exp_op; @@ -198,7 +205,7 @@ template struct scalar_cast_op; template struct scalar_random_op; template struct scalar_constant_op; template struct scalar_identity_op; -template struct scalar_sign_op; +template struct scalar_sign_op; template struct scalar_pow_op; template struct scalar_hypot_op; template struct scalar_product_op; @@ -209,11 +216,27 @@ template struct scalar_lgamma_op; template struct scalar_digamma_op; template struct scalar_erf_op; template struct scalar_erfc_op; +template struct scalar_ndtri_op; template struct scalar_igamma_op; template struct scalar_igammac_op; template struct scalar_zeta_op; template struct scalar_betainc_op; +// Bessel functions in SpecialFunctions module +template struct scalar_bessel_i0_op; +template struct scalar_bessel_i0e_op; +template struct scalar_bessel_i1_op; +template struct scalar_bessel_i1e_op; +template struct scalar_bessel_j0_op; +template struct scalar_bessel_y0_op; +template struct scalar_bessel_j1_op; +template struct scalar_bessel_y1_op; +template struct scalar_bessel_k0_op; +template struct scalar_bessel_k0e_op; +template struct scalar_bessel_k1_op; +template struct scalar_bessel_k1e_op; + + } // end namespace internal struct IOFormat; @@ -251,6 +274,7 @@ template class HouseholderQR; template class ColPivHouseholderQR; template class FullPivHouseholderQR; template class CompleteOrthogonalDecomposition; +template class SVDBase; template class JacobiSVD; template class BDCSVD; template class LLT; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/IndexedViewHelper.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/IndexedViewHelper.h new file mode 100644 index 0000000000..f85de305f2 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/IndexedViewHelper.h @@ -0,0 +1,186 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_INDEXED_VIEW_HELPER_H +#define EIGEN_INDEXED_VIEW_HELPER_H + +namespace Eigen { + +namespace internal { +struct symbolic_last_tag {}; +} + +/** \var last + * \ingroup Core_Module + * + * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns + * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * This symbolic placeholder supports standard arithmetic operations. + * + * A typical usage example would be: + * \code + * using namespace Eigen; + * using Eigen::last; + * VectorXd v(n); + * v(seq(2,last-2)).setOnes(); + * \endcode + * + * \sa end + */ +static const symbolic::SymbolExpr last; // PLEASE use Eigen::last instead of Eigen::placeholders::last + +/** \var lastp1 + * \ingroup Core_Module + * + * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically + * reference the last+1 element/row/columns of the underlying vector or matrix once + * passed to DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * This symbolic placeholder supports standard arithmetic operations. + * It is essentially an alias to last+fix<1>. + * + * \sa last + */ +#ifdef EIGEN_PARSED_BY_DOXYGEN +static const auto lastp1 = last+fix<1>; +#else +// Using a FixedExpr<1> expression is important here to make sure the compiler +// can fully optimize the computation starting indices with zero overhead. +static const symbolic::AddExpr,symbolic::ValueExpr > > lastp1(last+fix<1>()); +#endif + +namespace internal { + + // Replace symbolic last/end "keywords" by their true runtime value +inline Index eval_expr_given_size(Index x, Index /* size */) { return x; } + +template +FixedInt eval_expr_given_size(FixedInt x, Index /*size*/) { return x; } + +template +Index eval_expr_given_size(const symbolic::BaseExpr &x, Index size) +{ + return x.derived().eval(last=size-1); +} + +// Extract increment/step at compile time +template struct get_compile_time_incr { + enum { value = UndefinedIncr }; +}; + +// Analogue of std::get<0>(x), but tailored for our needs. +template +EIGEN_CONSTEXPR Index first(const T& x) EIGEN_NOEXCEPT { return x.first(); } + +// IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by MatrixSlice +// The generic implementation is a no-op +template +struct IndexedViewCompatibleType { + typedef T type; +}; + +template +const T& makeIndexedViewCompatible(const T& x, Index /*size*/, Q) { return x; } + +//-------------------------------------------------------------------------------- +// Handling of a single Index +//-------------------------------------------------------------------------------- + +struct SingleRange { + enum { + SizeAtCompileTime = 1 + }; + SingleRange(Index val) : m_value(val) {} + Index operator[](Index) const { return m_value; } + static EIGEN_CONSTEXPR Index size() EIGEN_NOEXCEPT { return 1; } + Index first() const EIGEN_NOEXCEPT { return m_value; } + Index m_value; +}; + +template<> struct get_compile_time_incr { + enum { value = 1 }; // 1 or 0 ?? +}; + +// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods) +template +struct IndexedViewCompatibleType::value>::type> { + // Here we could simply use Array, but maybe it's less work for the compiler to use + // a simpler wrapper as SingleRange + //typedef Eigen::Array type; + typedef SingleRange type; +}; + +template +struct IndexedViewCompatibleType::value>::type> { + typedef SingleRange type; +}; + + +template +typename enable_if::value,SingleRange>::type +makeIndexedViewCompatible(const T& id, Index size, SpecializedType) { + return eval_expr_given_size(id,size); +} + +//-------------------------------------------------------------------------------- +// Handling of all +//-------------------------------------------------------------------------------- + +struct all_t { all_t() {} }; + +// Convert a symbolic 'all' into a usable range type +template +struct AllRange { + enum { SizeAtCompileTime = XprSize }; + AllRange(Index size = XprSize) : m_size(size) {} + EIGEN_CONSTEXPR Index operator[](Index i) const EIGEN_NOEXCEPT { return i; } + EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_size.value(); } + EIGEN_CONSTEXPR Index first() const EIGEN_NOEXCEPT { return 0; } + variable_if_dynamic m_size; +}; + +template +struct IndexedViewCompatibleType { + typedef AllRange type; +}; + +template +inline AllRange::value> makeIndexedViewCompatible(all_t , XprSizeType size, SpecializedType) { + return AllRange::value>(size); +} + +template struct get_compile_time_incr > { + enum { value = 1 }; +}; + +} // end namespace internal + + +/** \var all + * \ingroup Core_Module + * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns + */ +static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all + + +namespace placeholders { + typedef symbolic::SymbolExpr last_t; + typedef symbolic::AddExpr,symbolic::ValueExpr > > end_t; + typedef Eigen::internal::all_t all_t; + + EIGEN_DEPRECATED static const all_t all = Eigen::all; // PLEASE use Eigen::all instead of Eigen::placeholders::all + EIGEN_DEPRECATED static const last_t last = Eigen::last; // PLEASE use Eigen::last instead of Eigen::placeholders::last + EIGEN_DEPRECATED static const end_t end = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end +} + +} // end namespace Eigen + +#endif // EIGEN_INDEXED_VIEW_HELPER_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/IntegralConstant.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/IntegralConstant.h new file mode 100644 index 0000000000..945d426ea8 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/IntegralConstant.h @@ -0,0 +1,272 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_INTEGRAL_CONSTANT_H +#define EIGEN_INTEGRAL_CONSTANT_H + +namespace Eigen { + +namespace internal { + +template class FixedInt; +template class VariableAndFixedInt; + +/** \internal + * \class FixedInt + * + * This class embeds a compile-time integer \c N. + * + * It is similar to c++11 std::integral_constant but with some additional features + * such as: + * - implicit conversion to int + * - arithmetic and some bitwise operators: -, +, *, /, %, &, | + * - c++98/14 compatibility with fix and fix() syntax to define integral constants. + * + * It is strongly discouraged to directly deal with this class FixedInt. Instances are expcected to + * be created by the user using Eigen::fix or Eigen::fix(). In C++98-11, the former syntax does + * not create a FixedInt instance but rather a point to function that needs to be \em cleaned-up + * using the generic helper: + * \code + * internal::cleanup_index_type::type + * internal::cleanup_index_type::type + * \endcode + * where T can a FixedInt, a pointer to function FixedInt (*)(), or numerous other integer-like representations. + * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values. + * + * For convenience, you can extract the compile-time value \c N in a generic way using the following helper: + * \code + * internal::get_fixed_value::value + * \endcode + * that will give you \c N if T equals FixedInt or FixedInt (*)(), and \c DefaultVal if T does not embed any compile-time value (e.g., T==int). + * + * \sa fix, class VariableAndFixedInt + */ +template class FixedInt +{ +public: + static const int value = N; + EIGEN_CONSTEXPR operator int() const { return value; } + FixedInt() {} + FixedInt( VariableAndFixedInt other) { + #ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(other); + #endif + eigen_internal_assert(int(other)==N); + } + + FixedInt<-N> operator-() const { return FixedInt<-N>(); } + template + FixedInt operator+( FixedInt) const { return FixedInt(); } + template + FixedInt operator-( FixedInt) const { return FixedInt(); } + template + FixedInt operator*( FixedInt) const { return FixedInt(); } + template + FixedInt operator/( FixedInt) const { return FixedInt(); } + template + FixedInt operator%( FixedInt) const { return FixedInt(); } + template + FixedInt operator|( FixedInt) const { return FixedInt(); } + template + FixedInt operator&( FixedInt) const { return FixedInt(); } + +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + // Needed in C++14 to allow fix(): + FixedInt operator() () const { return *this; } + + VariableAndFixedInt operator() (int val) const { return VariableAndFixedInt(val); } +#else + FixedInt ( FixedInt (*)() ) {} +#endif + +#if EIGEN_HAS_CXX11 + FixedInt(std::integral_constant) {} +#endif +}; + +/** \internal + * \class VariableAndFixedInt + * + * This class embeds both a compile-time integer \c N and a runtime integer. + * Both values are supposed to be equal unless the compile-time value \c N has a special + * value meaning that the runtime-value should be used. Depending on the context, this special + * value can be either Eigen::Dynamic (for positive quantities) or Eigen::DynamicIndex (for + * quantities that can be negative). + * + * It is the return-type of the function Eigen::fix(int), and most of the time this is the only + * way it is used. It is strongly discouraged to directly deal with instances of VariableAndFixedInt. + * Indeed, in order to write generic code, it is the responsibility of the callee to properly convert + * it to either a true compile-time quantity (i.e. a FixedInt), or to a runtime quantity (e.g., an Index) + * using the following generic helper: + * \code + * internal::cleanup_index_type::type + * internal::cleanup_index_type::type + * \endcode + * where T can be a template instantiation of VariableAndFixedInt or numerous other integer-like representations. + * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values. + * + * For convenience, you can also extract the compile-time value \c N using the following helper: + * \code + * internal::get_fixed_value::value + * \endcode + * that will give you \c N if T equals VariableAndFixedInt, and \c DefaultVal if T does not embed any compile-time value (e.g., T==int). + * + * \sa fix(int), class FixedInt + */ +template class VariableAndFixedInt +{ +public: + static const int value = N; + operator int() const { return m_value; } + VariableAndFixedInt(int val) { m_value = val; } +protected: + int m_value; +}; + +template struct get_fixed_value { + static const int value = Default; +}; + +template struct get_fixed_value,Default> { + static const int value = N; +}; + +#if !EIGEN_HAS_CXX14 +template struct get_fixed_value (*)(),Default> { + static const int value = N; +}; +#endif + +template struct get_fixed_value,Default> { + static const int value = N ; +}; + +template +struct get_fixed_value,Default> { + static const int value = N; +}; + +template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } +#if !EIGEN_HAS_CXX14 +template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } +#endif + +// Cleanup integer/FixedInt/VariableAndFixedInt/etc types: + +// By default, no cleanup: +template struct cleanup_index_type { typedef T type; }; + +// Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index +template struct cleanup_index_type::value>::type> { typedef Index type; }; + +#if !EIGEN_HAS_CXX14 +// In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: +template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; +#endif + +// If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value: +template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; +// If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index): +template struct cleanup_index_type, DynamicKey> { typedef Index type; }; + +#if EIGEN_HAS_CXX11 +template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; +#endif + +} // end namespace internal + +#ifndef EIGEN_PARSED_BY_DOXYGEN + +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES +template +static const internal::FixedInt fix{}; +#else +template +inline internal::FixedInt fix() { return internal::FixedInt(); } + +// The generic typename T is mandatory. Otherwise, a code like fix could refer to either the function above or this next overload. +// This way a code like fix can only refer to the previous function. +template +inline internal::VariableAndFixedInt fix(T val) { return internal::VariableAndFixedInt(internal::convert_index(val)); } +#endif + +#else // EIGEN_PARSED_BY_DOXYGEN + +/** \var fix() + * \ingroup Core_Module + * + * This \em identifier permits to construct an object embedding a compile-time integer \c N. + * + * \tparam N the compile-time integer value + * + * It is typically used in conjunction with the Eigen::seq and Eigen::seqN functions to pass compile-time values to them: + * \code + * seqN(10,fix<4>,fix<-3>) // <=> [10 7 4 1] + * \endcode + * + * See also the function fix(int) to pass both a compile-time and runtime value. + * + * In c++14, it is implemented as: + * \code + * template static const internal::FixedInt fix{}; + * \endcode + * where internal::FixedInt is an internal template class similar to + * \c std::integral_constant + * Here, \c fix is thus an object of type \c internal::FixedInt. + * + * In c++98/11, it is implemented as a function: + * \code + * template inline internal::FixedInt fix(); + * \endcode + * Here internal::FixedInt is thus a pointer to function. + * + * If for some reason you want a true object in c++98 then you can write: \code fix() \endcode which is also valid in c++14. + * + * \sa fix(int), seq, seqN + */ +template +static const auto fix(); + +/** \fn fix(int) + * \ingroup Core_Module + * + * This function returns an object embedding both a compile-time integer \c N, and a fallback runtime value \a val. + * + * \tparam N the compile-time integer value + * \param val the fallback runtime integer value + * + * This function is a more general version of the \ref fix identifier/function that can be used in template code + * where the compile-time value could turn out to actually mean "undefined at compile-time". For positive integers + * such as a size or a dimension, this case is identified by Eigen::Dynamic, whereas runtime signed integers + * (e.g., an increment/stride) are identified as Eigen::DynamicIndex. In such a case, the runtime value \a val + * will be used as a fallback. + * + * A typical use case would be: + * \code + * template void foo(const MatrixBase &mat) { + * const int N = Derived::RowsAtCompileTime==Dynamic ? Dynamic : Derived::RowsAtCompileTime/2; + * const int n = mat.rows()/2; + * ... mat( seqN(0,fix(n) ) ...; + * } + * \endcode + * In this example, the function Eigen::seqN knows that the second argument is expected to be a size. + * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dissmissed, and converted to an Eigen::Index of value \c n. + * Otherwise, the runtime-value \c n will be dissmissed, and the returned ArithmeticSequence will be of the exact same type as seqN(0,fix) . + * + * \sa fix, seqN, class ArithmeticSequence + */ +template +static const auto fix(int val); + +#endif // EIGEN_PARSED_BY_DOXYGEN + +} // end namespace Eigen + +#endif // EIGEN_INTEGRAL_CONSTANT_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Macros.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Macros.h index 6b0399ebcc..986c3d44db 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Macros.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Macros.h @@ -11,19 +11,56 @@ #ifndef EIGEN_MACROS_H #define EIGEN_MACROS_H +//------------------------------------------------------------------------------------------ +// Eigen version and basic defaults +//------------------------------------------------------------------------------------------ + #define EIGEN_WORLD_VERSION 3 -#define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 9 +#define EIGEN_MAJOR_VERSION 4 +#define EIGEN_MINOR_VERSION 0 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ EIGEN_MINOR_VERSION>=z)))) +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor +#else +#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor +#endif + +#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t +#endif + +// Upperbound on the C++ version to use. +// Expected values are 03, 11, 14, 17, etc. +// By default, let's use an arbitrarily large C++ version. +#ifndef EIGEN_MAX_CPP_VER +#define EIGEN_MAX_CPP_VER 99 +#endif + +/** Allows to disable some optimizations which might affect the accuracy of the result. + * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. + * They currently include: + * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. + */ +#ifndef EIGEN_FAST_MATH +#define EIGEN_FAST_MATH 1 +#endif + +#ifndef EIGEN_STACK_ALLOCATION_LIMIT +// 131072 == 128 KB +#define EIGEN_STACK_ALLOCATION_LIMIT 131072 +#endif + +//------------------------------------------------------------------------------------------ // Compiler identification, EIGEN_COMP_* +//------------------------------------------------------------------------------------------ /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC #ifdef __GNUC__ - #define EIGEN_COMP_GNUC 1 + #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__) #else #define EIGEN_COMP_GNUC 0 #endif @@ -35,6 +72,12 @@ #define EIGEN_COMP_CLANG 0 #endif +/// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML +#if defined(__castxml__) + #define EIGEN_COMP_CASTXML 1 +#else + #define EIGEN_COMP_CASTXML 0 +#endif /// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm #if defined(__llvm__) @@ -71,14 +114,44 @@ #define EIGEN_COMP_MSVC 0 #endif +#if defined(__NVCC__) +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) + #define EIGEN_COMP_NVCC __CUDACC_VER__ +#else + #error "NVCC did not define compiler version." +#endif +#else + #define EIGEN_COMP_NVCC 0 +#endif + // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC: -// name ver MSC_VER -// 2008 9 1500 -// 2010 10 1600 -// 2012 11 1700 -// 2013 12 1800 -// 2015 14 1900 -// "15" 15 1900 +// name ver MSC_VER +// 2008 9 1500 +// 2010 10 1600 +// 2012 11 1700 +// 2013 12 1800 +// 2015 14 1900 +// "15" 15 1900 +// 2017-14.1 15.0 1910 +// 2017-14.11 15.3 1911 +// 2017-14.12 15.5 1912 +// 2017-14.13 15.6 1913 +// 2017-14.14 15.7 1914 + +/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise. +#if defined(_MSVC_LANG) + #define EIGEN_COMP_MSVC_LANG _MSVC_LANG +#else + #define EIGEN_COMP_MSVC_LANG 0 +#endif + +// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG: +// MSVC option Standard MSVC_LANG +// /std:c++14 (default as of VS 2019) C++14 201402L +// /std:c++17 C++17 201703L +// /std:c++latest >C++17 >201703L /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) @@ -87,16 +160,21 @@ #define EIGEN_COMP_MSVC_STRICT 0 #endif -/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++ -#if defined(__IBMCPP__) || defined(__xlc__) - #define EIGEN_COMP_IBM 1 +/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++ +// XLC version +// 3.1 0x0301 +// 4.5 0x0405 +// 5.0 0x0500 +// 12.1 0x0C01 +#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__) + #define EIGEN_COMP_IBM __xlC__ #else #define EIGEN_COMP_IBM 0 #endif -/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler +/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler #if defined(__PGI) - #define EIGEN_COMP_PGI 1 + #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__) #else #define EIGEN_COMP_PGI 0 #endif @@ -108,7 +186,7 @@ #define EIGEN_COMP_ARM 0 #endif -/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler +/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler #if defined(__EMSCRIPTEN__) #define EIGEN_COMP_EMSCRIPTEN 1 #else @@ -142,9 +220,13 @@ #endif -// Architecture identification, EIGEN_ARCH_* -#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64) +//------------------------------------------------------------------------------------------ +// Architecture identification, EIGEN_ARCH_* +//------------------------------------------------------------------------------------------ + + +#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__amd64) #define EIGEN_ARCH_x86_64 1 #else #define EIGEN_ARCH_x86_64 0 @@ -170,18 +252,61 @@ #endif /// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64 -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) #define EIGEN_ARCH_ARM64 1 #else #define EIGEN_ARCH_ARM64 0 #endif +/// \internal EIGEN_ARCH_ARM_OR_ARM64 set to 1 if the architecture is ARM or ARM64 #if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 #define EIGEN_ARCH_ARM_OR_ARM64 1 #else #define EIGEN_ARCH_ARM_OR_ARM64 0 #endif +/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater. +#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8 +#define EIGEN_ARCH_ARMV8 1 +#else +#define EIGEN_ARCH_ARMV8 0 +#endif + + +/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE +/// compliant Arm fp16 type +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16 + #if defined(__ARM_FP16_FORMAT_IEEE) + #define EIGEN_HAS_ARM64_FP16 1 + #else + #define EIGEN_HAS_ARM64_FP16 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture +/// supports Neon vector intrinsics for fp16. +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 + #else + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture +/// supports Neon scalar intrinsics for fp16. +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 + #endif + #endif +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -212,7 +337,9 @@ +//------------------------------------------------------------------------------------------ // Operating system identification, EIGEN_OS_* +//------------------------------------------------------------------------------------------ /// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant #if defined(__unix__) || defined(__unix) @@ -299,9 +426,17 @@ #define EIGEN_OS_WIN_STRICT 0 #endif -/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN +/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN +// compiler solaris __SUNPRO_C +// version studio +// 5.7 10 0x570 +// 5.8 11 0x580 +// 5.9 12 0x590 +// 5.10 12.1 0x5100 +// 5.11 12.2 0x5110 +// 5.12 12.3 0x5120 #if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__)) - #define EIGEN_OS_SUN 1 + #define EIGEN_OS_SUN __SUNPRO_C #else #define EIGEN_OS_SUN 0 #endif @@ -314,6 +449,131 @@ #endif +//------------------------------------------------------------------------------------------ +// Detect GPU compilers and architectures +//------------------------------------------------------------------------------------------ + +// NVCC is not supported as the target platform for HIPCC +// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive +#if defined(__NVCC__) && defined(__HIPCC__) + #error "NVCC as the target platform for HIPCC is currently not supported." +#endif + +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) + // Means the compiler is either nvcc or clang with CUDA enabled + #define EIGEN_CUDACC __CUDACC__ +#endif + +#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) + // Means we are generating code for the device + #define EIGEN_CUDA_ARCH __CUDA_ARCH__ +#endif + +#if defined(EIGEN_CUDACC) +#include + #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10) +#else + #define EIGEN_CUDA_SDK_VER 0 +#endif + +#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) + // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP) + #define EIGEN_HIPCC __HIPCC__ + + // We need to include hip_runtime.h here because it pulls in + // ++ hip_common.h which contains the define for __HIP_DEVICE_COMPILE__ + // ++ host_defines.h which contains the defines for the __host__ and __device__ macros + #include + + #if defined(__HIP_DEVICE_COMPILE__) + // analogous to EIGEN_CUDA_ARCH, but for HIP + #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ + #endif + + // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute + // value to 1024. The compiler assigns a default value of 256 when the attribute is not + // specified. This results in failures on the HIP platform, for cases when a GPU kernel + // without an explicit launch_bounds attribute is called with a threads_per_block value + // greater than 256. + // + // This is a regression in functioanlity and is expected to be fixed within the next + // couple of ROCm releases (compiler will go back to using 1024 value as the default) + // + // In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds + // attribute. + + #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024) + +#endif + +#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024) +#define EIGEN_HIP_LAUNCH_BOUNDS_1024 +#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024) + +// Unify CUDA/HIPCC + +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC +// +#define EIGEN_GPUCC +// +// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels +// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels +// +// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels. +// For those cases, the corresponding code should be guarded with +// #if defined(EIGEN_GPUCC) +// instead of +// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDACC) +// +#endif + +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE +// +#define EIGEN_GPU_COMPILE_PHASE +// +// GPU compilers (HIPCC, NVCC) typically do two passes over the source code, +// + one to compile the source for the "host" (ie CPU) +// + another to compile the source for the "device" (ie. GPU) +// +// Code that needs to enabled only during the either the "host" or "device" compilation phase +// needs to be guarded with a macro that indicates the current compilation phase +// +// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP +// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA +// +// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA +// For those cases, the code should be guarded with +// #if defined(EIGEN_GPU_COMPILE_PHASE) +// instead of +// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDA_ARCH) +// +#endif + +#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__) +// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro. +// In most cases we want to check if both macros are defined which can be done using the define below. +#define SYCL_DEVICE_ONLY +#endif + +//------------------------------------------------------------------------------------------ +// Detect Compiler/Architecture/OS specific features +//------------------------------------------------------------------------------------------ #if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG // see bug 89 @@ -322,20 +582,6 @@ #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1 #endif -// This macro can be used to prevent from macro expansion, e.g.: -// std::max EIGEN_NOT_A_MACRO(a,b) -#define EIGEN_NOT_A_MACRO - -#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR -#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor -#else -#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor -#endif - -#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t -#endif - // Cross compiler wrapper around LLVM's __has_builtin #ifdef __has_builtin # define EIGEN_HAS_BUILTIN(x) __has_builtin(x) @@ -349,26 +595,79 @@ # define __has_feature(x) 0 #endif -// Upperbound on the C++ version to use. -// Expected values are 03, 11, 14, 17, etc. -// By default, let's use an arbitrarily large C++ version. -#ifndef EIGEN_MAX_CPP_VER -#define EIGEN_MAX_CPP_VER 99 +// Some old compilers do not support template specializations like: +// template void foo(const T x[N]); +#if !( EIGEN_COMP_CLANG && ( (EIGEN_COMP_CLANG<309) \ + || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000))) \ + || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1 +#else +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0 #endif -#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) +// The macro EIGEN_CPLUSPLUS is a replacement for __cplusplus/_MSVC_LANG that +// works for both platforms, indicating the C++ standard version number. +// +// With MSVC, without defining /Zc:__cplusplus, the __cplusplus macro will +// report 199711L regardless of the language standard specified via /std. +// We need to rely on _MSVC_LANG instead, which is only available after +// VS2015.3. +#if EIGEN_COMP_MSVC_LANG > 0 +#define EIGEN_CPLUSPLUS EIGEN_COMP_MSVC_LANG +#elif EIGEN_COMP_MSVC >= 1900 +#define EIGEN_CPLUSPLUS 201103L +#elif defined(__cplusplus) +#define EIGEN_CPLUSPLUS __cplusplus +#else +#define EIGEN_CPLUSPLUS 0 +#endif + +// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler. +// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER +// is defined to 17. +#if EIGEN_CPLUSPLUS > 201703L + #define EIGEN_COMP_CXXVER 20 +#elif EIGEN_CPLUSPLUS > 201402L + #define EIGEN_COMP_CXXVER 17 +#elif EIGEN_CPLUSPLUS > 201103L + #define EIGEN_COMP_CXXVER 14 +#elif EIGEN_CPLUSPLUS >= 201103L + #define EIGEN_COMP_CXXVER 11 +#else + #define EIGEN_COMP_CXXVER 03 +#endif + +#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14 + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1 + #else + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0 + #endif +#endif + + +// The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features +// but in practice we should not rely on them but rather on the availabilty of +// individual features as defined later. +// This is why there is no EIGEN_HAS_CXX17. +// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11. +#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11 #define EIGEN_HAS_CXX11 1 #else #define EIGEN_HAS_CXX11 0 #endif +#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14 +#define EIGEN_HAS_CXX14 1 +#else +#define EIGEN_HAS_CXX14 0 +#endif // Do we support r-value references? #ifndef EIGEN_HAS_RVALUE_REFERENCES #if EIGEN_MAX_CPP_VER>=11 && \ (__has_feature(cxx_rvalue_references) || \ - (defined(__cplusplus) && __cplusplus >= 201103L) || \ - (EIGEN_COMP_MSVC >= 1600)) + (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600)) #define EIGEN_HAS_RVALUE_REFERENCES 1 #else #define EIGEN_HAS_RVALUE_REFERENCES 0 @@ -376,12 +675,14 @@ #endif // Does the compiler support C99? +// Need to include to make sure _GLIBCXX_USE_C99 gets defined +#include #ifndef EIGEN_HAS_C99_MATH #if EIGEN_MAX_CPP_VER>=11 && \ ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \ - || (EIGEN_COMP_MSVC >= 1900) ) + || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY)) #define EIGEN_HAS_C99_MATH 1 #else #define EIGEN_HAS_C99_MATH 0 @@ -389,14 +690,50 @@ #endif // Does the compiler support result_of? +// result_of was deprecated in c++17 and removed in c++ 20 #ifndef EIGEN_HAS_STD_RESULT_OF -#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))) +#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17 #define EIGEN_HAS_STD_RESULT_OF 1 #else #define EIGEN_HAS_STD_RESULT_OF 0 #endif #endif +// Does the compiler support std::hash? +#ifndef EIGEN_HAS_STD_HASH +// The std::hash struct is defined in C++11 but is not labelled as a __device__ +// function and is not constexpr, so cannot be used on device. +#if EIGEN_HAS_CXX11 && !defined(EIGEN_GPU_COMPILE_PHASE) +#define EIGEN_HAS_STD_HASH 1 +#else +#define EIGEN_HAS_STD_HASH 0 +#endif +#endif // EIGEN_HAS_STD_HASH + +#ifndef EIGEN_HAS_STD_INVOKE_RESULT +#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17 +#define EIGEN_HAS_STD_INVOKE_RESULT 1 +#else +#define EIGEN_HAS_STD_INVOKE_RESULT 0 +#endif +#endif + +#ifndef EIGEN_HAS_ALIGNAS +#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 && \ + ( __has_feature(cxx_alignas) \ + || EIGEN_HAS_CXX14 \ + || (EIGEN_COMP_MSVC >= 1800) \ + || (EIGEN_GNUC_AT_LEAST(4,8)) \ + || (EIGEN_COMP_CLANG>=305) \ + || (EIGEN_COMP_ICC>=1500) \ + || (EIGEN_COMP_PGI>=1500) \ + || (EIGEN_COMP_SUNCC>=0x5130)) +#define EIGEN_HAS_ALIGNAS 1 +#else +#define EIGEN_HAS_ALIGNAS 0 +#endif +#endif + // Does the compiler support type_traits? // - full support of type traits was added only to GCC 5.1.0. // - 20150626 corresponds to the last release of 4.x libstdc++ @@ -413,11 +750,13 @@ // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES -#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) \ + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#elif EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) && defined(SYCL_DEVICE_ONLY) +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 #else #define EIGEN_HAS_VARIADIC_TEMPLATES 0 #endif @@ -425,27 +764,33 @@ // Does the compiler fully support const expressions? (as in c++14) #ifndef EIGEN_HAS_CONSTEXPR + #if defined(EIGEN_CUDACC) + // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above + #if EIGEN_MAX_CPP_VER>=14 && (EIGEN_COMP_CXXVER >= 11 && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500)) + #define EIGEN_HAS_CONSTEXPR 1 + #endif + #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (EIGEN_COMP_CXXVER >= 14) || \ + (EIGEN_GNUC_AT_LEAST(4,8) && (EIGEN_COMP_CXXVER >= 11)) || \ + (EIGEN_COMP_CLANG >= 306 && (EIGEN_COMP_CXXVER >= 11))) + #define EIGEN_HAS_CONSTEXPR 1 + #endif -#ifdef __CUDACC__ -// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) - #define EIGEN_HAS_CONSTEXPR 1 -#endif -#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ - (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L))) -#define EIGEN_HAS_CONSTEXPR 1 -#endif + #ifndef EIGEN_HAS_CONSTEXPR + #define EIGEN_HAS_CONSTEXPR 0 + #endif -#ifndef EIGEN_HAS_CONSTEXPR -#define EIGEN_HAS_CONSTEXPR 0 -#endif +#endif // EIGEN_HAS_CONSTEXPR +#if EIGEN_HAS_CONSTEXPR +#define EIGEN_CONSTEXPR constexpr +#else +#define EIGEN_CONSTEXPR #endif // Does the compiler support C++11 math? // Let's be conservative and enable the default C++11 implementation only if we are sure it exists #ifndef EIGEN_HAS_CXX11_MATH - #if EIGEN_MAX_CPP_VER>=11 && ((__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + #if EIGEN_MAX_CPP_VER>=11 && ((EIGEN_COMP_CXXVER > 11) || (EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)) #define EIGEN_HAS_CXX11_MATH 1 #else @@ -456,9 +801,8 @@ // Does the compiler support proper C++11 containers? #ifndef EIGEN_HAS_CXX11_CONTAINERS #if EIGEN_MAX_CPP_VER>=11 && \ - ((__cplusplus > 201103L) \ - || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ - || EIGEN_COMP_MSVC >= 1900) + ((EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400))) #define EIGEN_HAS_CXX11_CONTAINERS 1 #else #define EIGEN_HAS_CXX11_CONTAINERS 0 @@ -469,24 +813,88 @@ #ifndef EIGEN_HAS_CXX11_NOEXCEPT #if EIGEN_MAX_CPP_VER>=11 && \ (__has_feature(cxx_noexcept) \ - || (__cplusplus > 201103L) \ - || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ - || EIGEN_COMP_MSVC >= 1900) + || (EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400))) #define EIGEN_HAS_CXX11_NOEXCEPT 1 #else #define EIGEN_HAS_CXX11_NOEXCEPT 0 #endif #endif -/** Allows to disable some optimizations which might affect the accuracy of the result. - * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. - * They currently include: - * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. - */ -#ifndef EIGEN_FAST_MATH -#define EIGEN_FAST_MATH 1 +#ifndef EIGEN_HAS_CXX11_ATOMIC + #if EIGEN_MAX_CPP_VER>=11 && \ + (__has_feature(cxx_atomic) \ + || (EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700))) + #define EIGEN_HAS_CXX11_ATOMIC 1 + #else + #define EIGEN_HAS_CXX11_ATOMIC 0 + #endif #endif +#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL + #if EIGEN_MAX_CPP_VER>=11 && \ + (EIGEN_COMP_CXXVER >= 11 || EIGEN_COMP_MSVC >= 1700) + #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1 + #else + #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0 + #endif +#endif + +// NOTE: the required Apple's clang version is very conservative +// and it could be that XCode 9 works just fine. +// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support +// and not tested. +#ifndef EIGEN_HAS_CXX17_OVERALIGN +#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && ( \ + (EIGEN_COMP_MSVC >= 1912) \ + || (EIGEN_GNUC_AT_LEAST(7,0)) \ + || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500)) \ + || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \ + ) +#define EIGEN_HAS_CXX17_OVERALIGN 1 +#else +#define EIGEN_HAS_CXX17_OVERALIGN 0 +#endif +#endif + +#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR + // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules + #if defined(__NVCC__) + // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr + #ifdef __CUDACC_RELAXED_CONSTEXPR__ + #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC + #endif + #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr) + // clang++ always considers constexpr functions as implicitly __host__ __device__ + #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC + #endif +#endif + +// Does the compiler support the __int128 and __uint128_t extensions for 128-bit +// integer arithmetic? +// +// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported, +// but we avoid using them in certain cases: +// +// * Building using Clang for Windows, where the Clang runtime library has +// 128-bit support only on LP64 architectures, but Windows is LLP64. +#ifndef EIGEN_HAS_BUILTIN_INT128 +#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG) +#define EIGEN_HAS_BUILTIN_INT128 1 +#else +#define EIGEN_HAS_BUILTIN_INT128 0 +#endif +#endif + +//------------------------------------------------------------------------------------------ +// Preprocessor programming helpers +//------------------------------------------------------------------------------------------ + +// This macro can be used to prevent from macro expansion, e.g.: +// std::max EIGEN_NOT_A_MACRO(a,b) +#define EIGEN_NOT_A_MACRO + #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl; // concatenate two tokens @@ -503,7 +911,7 @@ // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline // but GCC is still doing fine with just inline. #ifndef EIGEN_STRONG_INLINE -#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC +#if (EIGEN_COMP_MSVC || EIGEN_COMP_ICC) && !defined(EIGEN_GPUCC) #define EIGEN_STRONG_INLINE __forceinline #else #define EIGEN_STRONG_INLINE inline @@ -518,7 +926,7 @@ // Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval Eigen::MatrixBase::eval() const' // : function body not available // See also bug 1367 -#if EIGEN_GNUC_AT_LEAST(4,2) +#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY) #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline #else #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE @@ -538,12 +946,43 @@ #define EIGEN_PERMISSIVE_EXPR #endif +// GPU stuff + +// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC) +#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC) + // Do not try asserts on device code + #ifndef EIGEN_NO_DEBUG + #define EIGEN_NO_DEBUG + #endif + + #ifdef EIGEN_INTERNAL_DEBUGGING + #undef EIGEN_INTERNAL_DEBUGGING + #endif + + #ifdef EIGEN_EXCEPTIONS + #undef EIGEN_EXCEPTIONS + #endif +#endif + +#if defined(SYCL_DEVICE_ONLY) + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif + #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) +// All functions callable from CUDA/HIP code must be qualified with __device__ +#elif defined(EIGEN_GPUCC) + #define EIGEN_DEVICE_FUNC __host__ __device__ +#else + #define EIGEN_DEVICE_FUNC +#endif + + // this macro allows to get rid of linking errors about multiply defined functions. // - static is not very good because it prevents definitions from different object files to be merged. // So static causes the resulting linked executable to be bloated with multiple copies of the same function. // - inline is not perfect either as it unwantedly hints the compiler toward inlining the function. -#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline #ifdef NDEBUG # ifndef EIGEN_NO_DEBUG @@ -553,7 +992,11 @@ // eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89 #ifdef EIGEN_NO_DEBUG - #define eigen_plain_assert(x) + #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device + #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x) + #else + #define eigen_plain_assert(x) + #endif #else #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO namespace Eigen { @@ -627,7 +1070,7 @@ // Suppresses 'unused variable' warnings. namespace Eigen { namespace internal { - template EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {} } } #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); @@ -641,169 +1084,75 @@ namespace Eigen { #endif -//------------------------------------------------------------------------------------------ -// Static and dynamic alignment control +// Acts as a barrier preventing operations involving `X` from crossing. This +// occurs, for example, in the fast rounding trick where a magic constant is +// added then subtracted, which is otherwise compiled away with -ffast-math. // -// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES -// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. -// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, -// a default value is automatically computed based on architecture, compiler, and OS. -// -// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} -// to be used to declare statically aligned buffers. -//------------------------------------------------------------------------------------------ - - -/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. - * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, - * so that vectorization doesn't affect binary compatibility. - * - * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link - * vectorized and non-vectorized code. - */ -#if (defined __CUDACC__) - #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) -#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) -#elif EIGEN_COMP_MSVC - #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) -#elif EIGEN_COMP_SUNCC - // FIXME not sure about this one: - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) -#else - #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler -#endif - -// If the user explicitly disable vectorization, then we also disable alignment -#if defined(EIGEN_DONT_VECTORIZE) - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 -#elif defined(EIGEN_VECTORIZE_AVX512) - // 64 bytes static alignmeent is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 -#elif defined(__AVX__) - // 32 bytes static alignmeent is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 -#else - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 -#endif - - -// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense -#define EIGEN_MIN_ALIGN_BYTES 16 - -// Defined the boundary (in bytes) on which the data needs to be aligned. Note -// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be -// aligned at all regardless of the value of this #define. - -#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. -#endif - -// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated -// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 -#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) - #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES - #undef EIGEN_MAX_STATIC_ALIGN_BYTES - #endif - #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 -#endif - -#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES - - // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES - - // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable - // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always - // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in - // certain common platform (compiler+architecture combinations) to avoid these problems. - // Only static alignment is really problematic (relies on nonstandard compiler extensions), - // try to keep heap alignment even when we have to disable static alignment. - #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64) - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 - #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) - // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. - // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. - // 4.8 and newer seem definitely unaffected. - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 +// See bug 1674 +#if !defined(EIGEN_OPTIMIZATION_BARRIER) + #if EIGEN_COMP_GNUC + // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html: + // X: Any operand whatsoever. + // r: A register operand is allowed provided that it is in a general + // register. + // g: Any register, memory or immediate integer operand is allowed, except + // for registers that are not general registers. + // w: (AArch32/AArch64) Floating point register, Advanced SIMD vector + // register or SVE vector register. + // x: (SSE) Any SSE register. + // (AArch64) Like w, but restricted to registers 0 to 15 inclusive. + // v: (PowerPC) An Altivec vector register. + // wa:(PowerPC) A VSX register. + // + // "X" (uppercase) should work for all cases, though this seems to fail for + // some versions of GCC for arm/aarch64 with + // "error: inconsistent operand constraints in an 'asm'" + // Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and + // vectors, otherwise + // "error: non-trivial scalar-to-vector conversion, possible invalid + // constraint for vector type" + // + // GCC for ppc64le generates an internal compiler error with x/X/g. + // GCC for AVX generates an internal compiler error with X. + // + // Tested on icc/gcc/clang for sse, avx, avx2, avx512dq + // gcc for arm, aarch64, + // gcc for ppc64le, + // both vectors and scalars. + // + // Note that this is restricted to plain types - this will not work + // directly for std::complex, Eigen::half, Eigen::bfloat16. For these, + // you will need to apply to the underlying POD type. + #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT + // This seems to be broken on clang. Packet4f is loaded into a single + // register rather than a vector, zeroing out some entries. Integer + // types also generate a compile error. + // General, Altivec, VSX. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); + #elif EIGEN_ARCH_ARM_OR_ARM64 + // General, NEON. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #elif EIGEN_ARCH_i386_OR_x86_64 + // General, SSE. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); + #else + // Not implemented for other architectures. + #define EIGEN_OPTIMIZATION_BARRIER(X) + #endif #else - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 + // Not implemented for other compilers. + #define EIGEN_OPTIMIZATION_BARRIER(X) #endif - - // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX - #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ - && !EIGEN_GCC3_OR_OLDER \ - && !EIGEN_COMP_SUNCC \ - && !EIGEN_OS_QNX - #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 - #else - #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 - #endif - - #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT - #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES - #else - #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 - #endif - #endif -// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES -#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. -// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES) -// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). -// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. - - -// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY -#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) -#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) -#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) -#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) +#if EIGEN_COMP_MSVC + // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362. + // This workaround is ugly, but it does the job. +# define EIGEN_CONST_CONDITIONAL(cond) (void)0, cond #else -#define EIGEN_ALIGN_MAX +# define EIGEN_CONST_CONDITIONAL(cond) cond #endif - -// Dynamic alignment control - -#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 -#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. -#endif - -#ifdef EIGEN_DONT_ALIGN - #ifdef EIGEN_MAX_ALIGN_BYTES - #undef EIGEN_MAX_ALIGN_BYTES - #endif - #define EIGEN_MAX_ALIGN_BYTES 0 -#elif !defined(EIGEN_MAX_ALIGN_BYTES) - #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES -#endif - -#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES -#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES -#else -#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES -#endif - - -#ifndef EIGEN_UNALIGNED_VECTORIZE -#define EIGEN_UNALIGNED_VECTORIZE 1 -#endif - -//---------------------------------------------------------------------- - - #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD #define EIGEN_RESTRICT #endif @@ -811,10 +1160,6 @@ namespace Eigen { #define EIGEN_RESTRICT __restrict #endif -#ifndef EIGEN_STACK_ALLOCATION_LIMIT -// 131072 == 128 KB -#define EIGEN_STACK_ALLOCATION_LIMIT 131072 -#endif #ifndef EIGEN_DEFAULT_IO_FORMAT #ifdef EIGEN_MAKING_DOCS @@ -829,8 +1174,23 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) - // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) + +// When compiling CUDA/HIP device code with NVCC or HIPCC +// pull in math functions from the global namespace. +// In host mode, and when device code is compiled with clang, +// use the std versions. +#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_USING_STD(FUNC) using ::FUNC; +#else + #define EIGEN_USING_STD(FUNC) using std::FUNC; +#endif + +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC)) + // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary, + // otherwise we get duplicate definition errors + // For later MSVC versions, we require explicit operator= definition, otherwise we get + // use of implicitly deleted operator errors. + // (cf Bugs 920, 1000, 1324, 2291) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) @@ -856,7 +1216,7 @@ namespace Eigen { * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. */ #if EIGEN_HAS_CXX11 -#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default; +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default; #else #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) #endif @@ -881,12 +1241,12 @@ namespace Eigen { */ #if EIGEN_HAS_CXX11 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - EIGEN_DEVICE_FUNC Derived() = default; \ - EIGEN_DEVICE_FUNC ~Derived() = default; + Derived() = default; \ + ~Derived() = default; #else #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - EIGEN_DEVICE_FUNC Derived() {}; \ - /* EIGEN_DEVICE_FUNC ~Derived() {}; */ + Derived() {}; \ + /* ~Derived() {}; */ #endif @@ -908,7 +1268,8 @@ namespace Eigen { typedef typename Eigen::internal::ref_selector::type Nested; \ typedef typename Eigen::internal::traits::StorageKind StorageKind; \ typedef typename Eigen::internal::traits::StorageIndex StorageIndex; \ - enum { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ + enum CompileTimeTraits \ + { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ ColsAtCompileTime = Eigen::internal::traits::ColsAtCompileTime, \ Flags = Eigen::internal::traits::Flags, \ SizeAtCompileTime = Base::SizeAtCompileTime, \ @@ -953,6 +1314,14 @@ namespace Eigen { #define EIGEN_IMPLIES(a,b) (!(a) || (b)) +#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC +#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false)) +#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true)) +#else +#define EIGEN_PREDICT_FALSE(x) (x) +#define EIGEN_PREDICT_TRUE(x) (x) +#endif + // the expression type of a standard coefficient wise binary operation #define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \ CwiseBinaryOp< \ @@ -984,14 +1353,14 @@ namespace Eigen { const typename internal::plain_constant_type::type, const EXPR> // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010") -#if EIGEN_COMP_MSVC_STRICT<=1600 +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600) #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if::type #else #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X #endif #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC inline \ + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,OPNAME))\ (METHOD)(const T& scalar) const { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -1000,7 +1369,7 @@ namespace Eigen { } #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC inline friend \ + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,OPNAME)) \ (METHOD)(const T& scalar, const StorageBaseType& matrix) { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -1013,15 +1382,23 @@ namespace Eigen { EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_EXCEPTIONS +#endif + + #ifdef EIGEN_EXCEPTIONS # define EIGEN_THROW_X(X) throw X # define EIGEN_THROW throw # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# ifdef __CUDA_ARCH__ +# if defined(EIGEN_CUDA_ARCH) # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") +# elif defined(EIGEN_HIP_DEVICE_COMPILE) +# define EIGEN_THROW_X(X) asm("s_trap 0") +# define EIGEN_THROW asm("s_trap 0") # else # define EIGEN_THROW_X(X) std::abort() # define EIGEN_THROW std::abort() @@ -1041,13 +1418,47 @@ namespace Eigen { # define EIGEN_NOEXCEPT # define EIGEN_NOEXCEPT_IF(x) # define EIGEN_NO_THROW throw() -# if EIGEN_COMP_MSVC +# if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17 // MSVC does not support exception specifications (warning C4290), - // and they are deprecated in c++11 anyway. + // and they are deprecated in c++11 anyway. This is even an error in c++17. # define EIGEN_EXCEPTION_SPEC(X) throw() # else # define EIGEN_EXCEPTION_SPEC(X) throw(X) # endif #endif +#if EIGEN_HAS_VARIADIC_TEMPLATES +// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input. +namespace Eigen { +namespace internal { + +inline bool all(){ return true; } + +template +bool all(T t, Ts ... ts){ return t && all(ts...); } + +} +} +#endif + +#if EIGEN_HAS_CXX11_OVERRIDE_FINAL +// provide override and final specifiers if they are available: +# define EIGEN_OVERRIDE override +# define EIGEN_FINAL final +#else +# define EIGEN_OVERRIDE +# define EIGEN_FINAL +#endif + +// Wrapping #pragma unroll in a macro since it is required for SYCL +#if defined(SYCL_DEVICE_ONLY) + #if defined(_MSC_VER) + #define EIGEN_UNROLL_LOOP __pragma(unroll) + #else + #define EIGEN_UNROLL_LOOP _Pragma("unroll") + #endif +#else + #define EIGEN_UNROLL_LOOP +#endif + #endif // EIGEN_MACROS_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Memory.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Memory.h index 291383c581..875318cdb1 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Memory.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Memory.h @@ -63,14 +63,28 @@ namespace Eigen { namespace internal { -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc() { #ifdef EIGEN_EXCEPTIONS throw std::bad_alloc(); #else std::size_t huge = static_cast(-1); - ::operator new(huge); + #if defined(EIGEN_HIPCC) + // + // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining), + // and as a consequence the code in the #else block triggers the hipcc warning : + // "no overloaded function has restriction specifiers that are compatible with the ambient context" + // + // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects + // the same on "operator new" + // Reverting code back to the old version in this #if block for the hipcc compiler + // + new int[huge]; + #else + void* unused = ::operator new(huge); + EIGEN_UNUSED_VARIABLE(unused); + #endif #endif } @@ -83,19 +97,26 @@ inline void throw_std_bad_alloc() /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned. * Fast, but wastes 16 additional bytes of memory. Does not throw any exception. */ -inline void* handmade_aligned_malloc(std::size_t size) +EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) { - void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES); + eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2"); + + EIGEN_USING_STD(malloc) + void *original = malloc(size+alignment); + if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(alignment-1))) + alignment); *(reinterpret_cast(aligned) - 1) = original; return aligned; } /** \internal Frees memory allocated with handmade_aligned_malloc */ -inline void handmade_aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr) { - if (ptr) std::free(*(reinterpret_cast(ptr) - 1)); + if (ptr) { + EIGEN_USING_STD(free) + free(*(reinterpret_cast(ptr) - 1)); + } } /** \internal @@ -114,7 +135,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); - + *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -142,7 +163,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } -#else +#else EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif @@ -156,9 +177,12 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - result = std::malloc(size); + + EIGEN_USING_STD(malloc) + result = malloc(size); + #if EIGEN_DEFAULT_ALIGN_BYTES==16 - eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); + eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator."); #endif #else result = handmade_aligned_malloc(size); @@ -174,7 +198,10 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - std::free(ptr); + + EIGEN_USING_STD(free) + free(ptr); + #else handmade_aligned_free(ptr); #endif @@ -187,7 +214,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) */ inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size) { - EIGEN_UNUSED_VARIABLE(old_size); + EIGEN_UNUSED_VARIABLE(old_size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED @@ -218,7 +245,9 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std: { check_that_malloc_is_allowed(); - void *result = std::malloc(size); + EIGEN_USING_STD(malloc) + void *result = malloc(size); + if(!result && size) throw_std_bad_alloc(); return result; @@ -232,7 +261,8 @@ template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { - std::free(ptr); + EIGEN_USING_STD(free) + free(ptr); } template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) @@ -331,7 +361,7 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size) { destruct_elements_of_array(ptr, size); - aligned_free(ptr); + Eigen::internal::aligned_free(ptr); } /** \internal Deletes objects constructed with conditional_aligned_new @@ -471,8 +501,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index } /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size - */ -template + */ +template inline Index first_multiple(Index size, Index base) { return ((size+base-1)/base)*base; @@ -493,7 +523,8 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - std::memcpy(target, start, size); + EIGEN_USING_STD(memcpy) + memcpy(target, start, size); } }; @@ -502,7 +533,7 @@ template struct smart_copy_helper { { std::copy(start, end, target); } }; -// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. +// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. template struct smart_memmove_helper; template void smart_memmove(const T* start, const T* end, T* target) @@ -522,19 +553,30 @@ template struct smart_memmove_helper { template struct smart_memmove_helper { static inline void run(const T* start, const T* end, T* target) - { + { if (UIntPtr(target) < UIntPtr(start)) { std::copy(start, end, target); } - else + else { std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T); - std::copy_backward(start, end, target + count); + std::copy_backward(start, end, target + count); } } }; +#if EIGEN_HAS_RVALUE_REFERENCES +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::move(start, end, target); +} +#else +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::copy(start, end, target); +} +#endif /***************************************************************************** *** Implementation of runtime stack allocation (falling back to malloc) *** @@ -542,7 +584,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#ifndef EIGEN_ALLOCA +#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC @@ -550,6 +592,15 @@ template struct smart_memmove_helper { #endif #endif +// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is +// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because +// the compiler still emits bad code because stack allocation checks use "<=". +// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772 +// is fixed. +#if defined(__clang__) && defined(__thumb__) + #undef EIGEN_ALLOCA +#endif + // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions. template class aligned_stack_memory_handler : noncopyable @@ -561,12 +612,14 @@ template class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed. **/ + EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) Eigen::internal::construct_elements_of_array(m_ptr, size); } + EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() { if(NumTraits::RequireInitialization && m_ptr) @@ -580,6 +633,60 @@ template class aligned_stack_memory_handler : noncopyable bool m_deallocate; }; +#ifdef EIGEN_ALLOCA + +template::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic + > +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = false; + typedef typename Xpr::Scalar Scalar; + typedef typename nested_eval::type ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) + { + EIGEN_UNUSED_VARIABLE(ptr); + eigen_internal_assert(ptr==0); + } +}; + +template +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = true; + typedef typename Xpr::Scalar Scalar; + typedef typename plain_object_eval::type PlainObject; + typedef Map ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) + : object(ptr==0 ? reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()), + m_deallocate(ptr==0) + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::construct_elements_of_array(object.data(), object.size()); + object = xpr; + } + + EIGEN_DEVICE_FUNC + ~local_nested_eval_wrapper() + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::destruct_elements_of_array(object.data(), object.size()); + if(m_deallocate) + Eigen::internal::aligned_free(object.data()); + } + +private: + bool m_deallocate; +}; + +#endif // EIGEN_ALLOCA + template class scoped_array : noncopyable { T* m_ptr; @@ -603,13 +710,15 @@ template void swap(scoped_array &a,scoped_array &b) { std::swap(a.ptr(),b.ptr()); } - + } // end namespace internal /** \internal - * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack - * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform - * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap. + * + * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, + * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack + * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. * Here is an example: @@ -620,9 +729,17 @@ template void swap(scoped_array &a,scoped_array &b) * } * \endcode * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. + * + * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to + * \code + * typename internal::nested_eval::type NAME(XPR); + * \endcode + * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. + * This is accomplished through alloca if this later is supported and if the required number of bytes + * is below EIGEN_STACK_ALLOCATION_LIMIT. */ #ifdef EIGEN_ALLOCA - + #if EIGEN_DEFAULT_ALIGN_BYTES>0 // We always manually re-align the result of EIGEN_ALLOCA. // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment. @@ -639,13 +756,23 @@ template void swap(scoped_array &a,scoped_array &b) : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) + + #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \ + Eigen::internal::local_nested_eval_wrapper EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast( \ + ( (Eigen::internal::local_nested_eval_wrapper::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \ + ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \ + typename Eigen::internal::local_nested_eval_wrapper::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object) + #else #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ Eigen::internal::check_size_for_overflow(SIZE); \ TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true) - + + +#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval::type NAME(XPR) + #endif @@ -653,32 +780,56 @@ template void swap(scoped_array &a,scoped_array &b) *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] *** *****************************************************************************/ -#if EIGEN_MAX_ALIGN_BYTES!=0 +#if EIGEN_HAS_CXX17_OVERALIGN + +// C++17 -> no need to bother about alignment anymore :) + +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) + +#else + +// HIP does not support new/delete on device. +#if EIGEN_MAX_ALIGN_BYTES!=0 && !defined(EIGEN_HIP_DEVICE_COMPILE) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc(size); } \ EIGEN_CATCH (...) { return 0; } \ } #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void *operator new(std::size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ + EIGEN_DEVICE_FUNC \ void *operator new[](std::size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ /* in-place new and delete. since (at least afaik) there is no actual */ \ /* memory allocated we can safely let the default implementation handle */ \ /* this particular case. */ \ + EIGEN_DEVICE_FUNC \ static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \ + EIGEN_DEVICE_FUNC \ static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \ /* nothrow-new (returns zero instead of std::bad_alloc) */ \ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \ Eigen::internal::conditional_aligned_free(ptr); \ } \ @@ -688,8 +839,14 @@ template void swap(scoped_array &a,scoped_array &b) #endif #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0))) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool( \ + ((Size)!=Eigen::Dynamic) && \ + (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES )==0)) || \ + ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) || \ + ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0)) ))) + +#endif /****************************************************************************/ @@ -703,13 +860,13 @@ template void swap(scoped_array &a,scoped_array &b) * - 32 bytes alignment if AVX is enabled. * - 64 bytes alignment if AVX512 is enabled. * -* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented * \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: * \code * // Matrix4f requires 16 bytes alignment: -* std::map< int, Matrix4f, std::less, +* std::map< int, Matrix4f, std::less, * aligned_allocator > > my_map_mat4; * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator: * std::map< int, Vector3f > my_map_vec3; @@ -744,18 +901,19 @@ public: ~aligned_allocator() {} + #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) + // In gcc std::allocator::max_size() is bugged making gcc triggers a warning: + // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 + size_type max_size() const { + return (std::numeric_limits::max)()/sizeof(T); + } + #endif + pointer allocate(size_type num, const void* /*hint*/ = 0) { internal::check_size_for_overflow(num); - size_type size = num * sizeof(T); -#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) - // workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 - // It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 - if(size>=std::size_t((std::numeric_limits::max)())) - return 0; - else -#endif - return static_cast( internal::aligned_malloc(size) ); + return static_cast( internal::aligned_malloc(num * sizeof(T)) ); } void deallocate(pointer p, size_type /*num*/) @@ -914,20 +1072,32 @@ inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs) { if(max_std_funcs>=4) queryCacheSizes_intel_direct(l1,l2,l3); - else + else if(max_std_funcs>=2) queryCacheSizes_intel_codes(l1,l2,l3); + else + l1 = l2 = l3 = 0; } inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) { int abcd[4]; abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; - EIGEN_CPUID(abcd,0x80000005,0); - l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB - abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; - EIGEN_CPUID(abcd,0x80000006,0); - l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB - l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB + + // First query the max supported function. + EIGEN_CPUID(abcd,0x80000000,0); + if(static_cast(abcd[0]) >= static_cast(0x80000006)) + { + EIGEN_CPUID(abcd,0x80000005,0); + l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB + abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; + EIGEN_CPUID(abcd,0x80000006,0); + l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB + l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB + } + else + { + l1 = l2 = l3 = 0; + } } #endif @@ -943,7 +1113,7 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) // identify the CPU vendor EIGEN_CPUID(abcd,0x0,0); - int max_std_funcs = abcd[1]; + int max_std_funcs = abcd[0]; if(cpuid_is_vendor(abcd,GenuineIntel)) queryCacheSizes_intel(l1,l2,l3,max_std_funcs); else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_)) diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Meta.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Meta.h index 9b61ff037a..7badfdcc92 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Meta.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/Meta.h @@ -11,13 +11,54 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(__CUDA_ARCH__) -#include -#include +#if defined(EIGEN_GPU_COMPILE_PHASE) + + #include + + #if defined(EIGEN_CUDA_ARCH) + #include + #endif + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" + #endif + #endif -#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L +// Recent versions of ICC require for pointer types below. +#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11) + +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT #include +namespace Eigen { +namespace numext { +typedef std::uint8_t uint8_t; +typedef std::int8_t int8_t; +typedef std::uint16_t uint16_t; +typedef std::int16_t int16_t; +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without c++11, all compilers able to compile Eigen also +// provide the C99 stdint.h header file. +#include +namespace Eigen { +namespace numext { +typedef ::uint8_t uint8_t; +typedef ::int8_t int8_t; +typedef ::uint16_t uint16_t; +typedef ::int16_t int16_t; +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} #endif namespace Eigen { @@ -43,26 +84,33 @@ namespace internal { // Only recent versions of ICC complain about using ptrdiff_t to hold pointers, // and older versions do not provide *intptr_t types. -#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L +#if EIGEN_ICC_NEEDS_CSTDINT typedef std::intptr_t IntPtr; typedef std::uintptr_t UIntPtr; #else typedef std::ptrdiff_t IntPtr; typedef std::size_t UIntPtr; #endif +#undef EIGEN_ICC_NEEDS_CSTDINT struct true_type { enum { value = 1 }; }; struct false_type { enum { value = 0 }; }; +template +struct bool_constant; + +template<> +struct bool_constant : true_type {}; + +template<> +struct bool_constant : false_type {}; + template struct conditional { typedef Then type; }; template struct conditional { typedef Else type; }; -template struct is_same { enum { value = 0 }; }; -template struct is_same { enum { value = 1 }; }; - template struct remove_reference { typedef T type; }; template struct remove_reference { typedef T type; }; @@ -97,23 +145,31 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +template struct is_same { enum { value = 0 }; }; +template struct is_same { enum { value = 1 }; }; + +template< class T > +struct is_void : is_same::type> {}; + #if EIGEN_HAS_CXX11 +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; using std::is_integral; #else -template struct is_integral { enum { value = false }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; +template struct is_integral { enum { value = false }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; #if EIGEN_COMP_MSVC -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral{ enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; #endif #endif @@ -137,6 +193,16 @@ template<> struct make_unsigned { typedef unsigned long type; template<> struct make_unsigned { typedef unsigned __int64 type; }; template<> struct make_unsigned { typedef unsigned __int64 type; }; #endif + +// Some platforms define int64_t as `long long` even for C++03, where +// `long long` is not guaranteed by the standard. In this case we are missing +// the definition for make_unsigned. If we just define it, we run into issues +// where `long long` doesn't exist in some compilers for C++03. We therefore add +// the specialization for these platforms only. +#if EIGEN_OS_MAC || EIGEN_COMP_MINGW +template<> struct make_unsigned { typedef unsigned long long type; }; +template<> struct make_unsigned { typedef unsigned long long type; }; +#endif #endif template struct add_const { typedef const T type; }; @@ -151,6 +217,11 @@ template struct add_const_on_value_type { typedef T const template struct add_const_on_value_type { typedef T const* const type; }; template struct add_const_on_value_type { typedef T const* const type; }; +#if EIGEN_HAS_CXX11 + +using std::is_convertible; + +#else template struct is_convertible_impl @@ -164,16 +235,19 @@ private: struct yes {int a[1];}; struct no {int a[2];}; - static yes test(const To&, int); + template + static yes test(T, int); + + template static no test(any_conversion, ...); public: - static From ms_from; + static typename internal::remove_reference::type* ms_from; #ifdef __INTEL_COMPILER #pragma warning push #pragma warning ( disable : 2259 ) #endif - enum { value = sizeof(test(ms_from, 0))==sizeof(yes) }; + enum { value = sizeof(test(*ms_from, 0))==sizeof(yes) }; #ifdef __INTEL_COMPILER #pragma warning pop #endif @@ -182,10 +256,17 @@ public: template struct is_convertible { - enum { value = is_convertible_impl::type, - typename remove_all::type>::value }; + enum { value = is_convertible_impl::value }; }; +template +struct is_convertible { enum { value = false }; }; + +template +struct is_convertible { enum { value = true }; }; + +#endif + /** \internal Allows to enable/disable an overload * according to a compile time condition. */ @@ -194,7 +275,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -205,7 +286,7 @@ namespace device { template struct numeric_limits { EIGEN_DEVICE_FUNC - static T epsilon() { return 0; } + static EIGEN_CONSTEXPR T epsilon() { return 0; } static T (max)() { assert(false && "Highest not supported for this type"); } static T (min)() { assert(false && "Lowest not supported for this type"); } static T infinity() { assert(false && "Infinity not supported for this type"); } @@ -213,91 +294,130 @@ template struct numeric_limits }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { return CUDART_MAX_NORMAL_F; } - EIGEN_DEVICE_FUNC + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF_F; } + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } EIGEN_DEVICE_FUNC - static float quiet_NaN() { return CUDART_NAN_F; } + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double epsilon() { return __DBL_EPSILON__; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double (max)() { return DBL_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { return CUDART_INF; } + static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } EIGEN_DEVICE_FUNC - static double quiet_NaN() { return CUDART_NAN; } + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int (max)() { return INT_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int (min)() { return INT_MIN; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int (max)() { return UINT_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int (min)() { return 0; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long (max)() { return LONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long (min)() { return LONG_MIN; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long (max)() { return ULONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long (min)() { return 0; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long (max)() { return LLONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long (min)() { return LLONG_MIN; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long (max)() { return ULLONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long (min)() { return 0; } }; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool epsilon() { return false; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool (max)() { return true; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool (min)() { return false; } +}; } -#endif +#endif // defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 /** \internal - * A base class do disable default copy ctor and copy assignement operator. + * A base class do disable default copy ctor and copy assignment operator. */ class noncopyable { @@ -309,13 +429,82 @@ protected: }; /** \internal - * Convenient struct to get the result type of a unary or binary functor. + * Provides access to the number of elements in the object of as a compile-time constant expression. + * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default). * - * It supports both the current STL mechanism (using the result_type member) as well as - * upcoming next STL generation (using a templated result member). - * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack. + * Similar to std::tuple_size, but more general. + * + * It currently supports: + * - any types T defining T::SizeAtCompileTime + * - plain C arrays as T[N] + * - std::array (c++11) + * - some internal types such as SingleRange and AllRange + * + * The second template parameter eases SFINAE-based specializations. */ -#if EIGEN_HAS_STD_RESULT_OF +template struct array_size { + enum { value = Dynamic }; +}; + +template struct array_size::type> { + enum { value = T::SizeAtCompileTime }; +}; + +template struct array_size { + enum { value = N }; +}; +template struct array_size { + enum { value = N }; +}; + +#if EIGEN_HAS_CXX11 +template struct array_size > { + enum { value = N }; +}; +template struct array_size > { + enum { value = N }; +}; +#endif + +/** \internal + * Analogue of the std::size free function. + * It returns the size of the container or view \a x of type \c T + * + * It currently supports: + * - any types T defining a member T::size() const + * - plain C arrays as T[N] + * + */ +template +EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } + +template +EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } + +/** \internal + * Convenient struct to get the result type of a nullary, unary, binary, or + * ternary functor. + * + * Pre C++11: + * Supports both a Func::result_type member and templated + * Func::result::type member. + * + * If none of these members is provided, then the type of the first + * argument is returned. + * + * Post C++11: + * This uses std::result_of. However, note the `type` member removes + * const and converts references/pointers to their corresponding value type. + */ +#if EIGEN_HAS_STD_INVOKE_RESULT +template struct result_of; + +template +struct result_of { + typedef typename std::invoke_result::type type1; + typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_STD_RESULT_OF template struct result_of { typedef typename std::result_of::type type1; typedef typename remove_all::type type; @@ -327,6 +516,28 @@ struct has_none {int a[1];}; struct has_std_result_type {int a[2];}; struct has_tr1_result {int a[3];}; +template +struct nullary_result_of_select {}; + +template +struct nullary_result_of_select {typedef typename Func::result_type type;}; + +template +struct nullary_result_of_select {typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename nullary_result_of_select::type type; +}; + template struct unary_result_of_select {typedef typename internal::remove_all::type type;}; @@ -396,6 +607,45 @@ struct result_of { enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; typedef typename ternary_result_of_select::type type; }; + +#endif + +#if EIGEN_HAS_STD_INVOKE_RESULT +template +struct invoke_result { + typedef typename std::invoke_result::type type1; + typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_CXX11 +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; +#else +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; #endif struct meta_yes { char a[1]; }; @@ -405,10 +655,10 @@ struct meta_no { char a[2]; }; template struct has_ReturnType { - template static meta_yes testFunctor(typename C::ReturnType const *); - template static meta_no testFunctor(...); + template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); + template static meta_no testFunctor(...); - enum { value = sizeof(testFunctor(0)) == sizeof(meta_yes) }; + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; }; template const T* return_ptr(); @@ -465,20 +715,25 @@ class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? /** \internal Computes the least common multiple of two positive integer A and B - * at compile-time. It implements a naive algorithm testing all multiples of A. - * It thus works better if A>=B. + * at compile-time. */ -template +template=B)> struct meta_least_common_multiple { enum { ret = meta_least_common_multiple::ret }; }; +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; template -struct meta_least_common_multiple +struct meta_least_common_multiple { enum { ret = A*K }; }; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template struct scalar_product_traits { @@ -491,17 +746,27 @@ template struct scalar_product_traits // typedef typename scalar_product_traits::type, typename remove_all::type>::ReturnType type; // }; +/** \internal Obtains a POD type suitable to use as storage for an object of a size + * of at most Len bytes, aligned as specified by \c Align. + */ +template +struct aligned_storage { + struct type { + EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len]; + }; +}; + } // end namespace internal namespace numext { - -#if defined(__CUDA_ARCH__) + +#if defined(EIGEN_GPU_COMPILE_PHASE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 using internal::device::numeric_limits; #else using std::numeric_limits; @@ -510,6 +775,7 @@ using std::numeric_limits; // Integer division with rounding up. // T is assumed to be an integer type with a>=0, and b>0 template +EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; @@ -517,52 +783,30 @@ T div_ceil(const T &a, const T &b) // The aim of the following functions is to bypass -Wfloat-equal warnings // when we really want a strict equality comparison on floating points. -template EIGEN_STRONG_INLINE +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -template<> EIGEN_STRONG_INLINE +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } -template<> EIGEN_STRONG_INLINE +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } +#endif -template EIGEN_STRONG_INLINE +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -template<> EIGEN_STRONG_INLINE +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } -template<> EIGEN_STRONG_INLINE +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } +#endif } // end namespace numext } // end namespace Eigen -// Define portable (u)int{32,64} types -#if EIGEN_HAS_CXX11 -#include -namespace Eigen { -namespace numext { -typedef std::uint32_t uint32_t; -typedef std::int32_t int32_t; -typedef std::uint64_t uint64_t; -typedef std::int64_t int64_t; -} -} -#else -// Without c++11, all compilers able to compile Eigen also -// provides the C99 stdint.h header file. -#include -namespace Eigen { -namespace numext { -typedef ::uint32_t uint32_t; -typedef ::int32_t int32_t; -typedef ::uint64_t uint64_t; -typedef ::int64_t int64_t; -} -} -#endif - - #endif // EIGEN_META_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ReshapedHelper.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ReshapedHelper.h new file mode 100644 index 0000000000..412432132c --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/ReshapedHelper.h @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_RESHAPED_HELPER_H +#define EIGEN_RESHAPED_HELPER_H + +namespace Eigen { + +enum AutoSize_t { AutoSize }; +const int AutoOrder = 2; + +namespace internal { + +template +struct get_compiletime_reshape_size { + enum { value = get_fixed_value::value }; +}; + +template +Index get_runtime_reshape_size(SizeType size, Index /*other*/, Index /*total*/) { + return internal::get_runtime_value(size); +} + +template +struct get_compiletime_reshape_size { + enum { + other_size = get_fixed_value::value, + value = (TotalSize==Dynamic || other_size==Dynamic) ? Dynamic : TotalSize / other_size }; +}; + +inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { + return total/other; +} + +template +struct get_compiletime_reshape_order { + enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order }; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_RESHAPED_HELPER_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/StaticAssert.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/StaticAssert.h index 500e47792a..c45de59016 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/StaticAssert.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/StaticAssert.h @@ -27,7 +27,7 @@ #ifndef EIGEN_STATIC_ASSERT #ifndef EIGEN_NO_STATIC_ASSERT - #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)) + #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600)) // if native static_assert is enabled, let's use it #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); @@ -103,7 +103,10 @@ STORAGE_KIND_MUST_MATCH=1, STORAGE_INDEX_MUST_MATCH=1, CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, - SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1 + SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, + INVALID_TEMPLATE_PARAMETER=1, + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1, + THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1 }; }; @@ -182,7 +185,7 @@ ) #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \ - EIGEN_STATIC_ASSERT(!NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) + EIGEN_STATIC_ASSERT(!Eigen::NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes @@ -192,8 +195,8 @@ YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES) #define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \ - EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \ - (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \ + EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) && \ + (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \ THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS) #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/SymbolicIndex.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/SymbolicIndex.h new file mode 100644 index 0000000000..354dd9add3 --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/SymbolicIndex.h @@ -0,0 +1,293 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SYMBOLIC_INDEX_H +#define EIGEN_SYMBOLIC_INDEX_H + +namespace Eigen { + +/** \namespace Eigen::symbolic + * \ingroup Core_Module + * + * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index. + * Here is a simple example: + * + * \code + * // First step, defines symbols: + * struct x_tag {}; static const symbolic::SymbolExpr x; + * struct y_tag {}; static const symbolic::SymbolExpr y; + * struct z_tag {}; static const symbolic::SymbolExpr z; + * + * // Defines an expression: + * auto expr = (x+3)/y+z; + * + * // And evaluate it: (c++14) + * std::cout << expr.eval(x=6,y=3,z=-13) << "\n"; + * + * // In c++98/11, only one symbol per expression is supported for now: + * auto expr98 = (3-x)/2; + * std::cout << expr98.eval(x=6) << "\n"; + * \endcode + * + * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN. + * + */ +namespace symbolic { + +template class Symbol; +template class NegateExpr; +template class AddExpr; +template class ProductExpr; +template class QuotientExpr; + +// A simple wrapper around an integral value to provide the eval method. +// We could also use a free-function symbolic_eval... +template +class ValueExpr { +public: + ValueExpr(IndexType val) : m_value(val) {} + template + IndexType eval_impl(const T&) const { return m_value; } +protected: + IndexType m_value; +}; + +// Specialization for compile-time value, +// It is similar to ValueExpr(N) but this version helps the compiler to generate better code. +template +class ValueExpr > { +public: + ValueExpr() {} + template + EIGEN_CONSTEXPR Index eval_impl(const T&) const { return N; } +}; + + +/** \class BaseExpr + * \ingroup Core_Module + * Common base class of any symbolic expressions + */ +template +class BaseExpr +{ +public: + const Derived& derived() const { return *static_cast(this); } + + /** Evaluate the expression given the \a values of the symbols. + * + * \param values defines the values of the symbols, it can either be a SymbolValue or a std::tuple of SymbolValue + * as constructed by SymbolExpr::operator= operator. + * + */ + template + Index eval(const T& values) const { return derived().eval_impl(values); } + +#if EIGEN_HAS_CXX14 + template + Index eval(Types&&... values) const { return derived().eval_impl(std::make_tuple(values...)); } +#endif + + NegateExpr operator-() const { return NegateExpr(derived()); } + + AddExpr > operator+(Index b) const + { return AddExpr >(derived(), b); } + AddExpr > operator-(Index a) const + { return AddExpr >(derived(), -a); } + ProductExpr > operator*(Index a) const + { return ProductExpr >(derived(),a); } + QuotientExpr > operator/(Index a) const + { return QuotientExpr >(derived(),a); } + + friend AddExpr > operator+(Index a, const BaseExpr& b) + { return AddExpr >(b.derived(), a); } + friend AddExpr,ValueExpr<> > operator-(Index a, const BaseExpr& b) + { return AddExpr,ValueExpr<> >(-b.derived(), a); } + friend ProductExpr,Derived> operator*(Index a, const BaseExpr& b) + { return ProductExpr,Derived>(a,b.derived()); } + friend QuotientExpr,Derived> operator/(Index a, const BaseExpr& b) + { return QuotientExpr,Derived>(a,b.derived()); } + + template + AddExpr > > operator+(internal::FixedInt) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + AddExpr > > operator-(internal::FixedInt) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + ProductExpr > > operator*(internal::FixedInt) const + { return ProductExpr > >(derived(),ValueExpr >()); } + template + QuotientExpr > > operator/(internal::FixedInt) const + { return QuotientExpr > >(derived(),ValueExpr >()); } + + template + friend AddExpr > > operator+(internal::FixedInt, const BaseExpr& b) + { return AddExpr > >(b.derived(), ValueExpr >()); } + template + friend AddExpr,ValueExpr > > operator-(internal::FixedInt, const BaseExpr& b) + { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } + template + friend ProductExpr >,Derived> operator*(internal::FixedInt, const BaseExpr& b) + { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } + template + friend QuotientExpr >,Derived> operator/(internal::FixedInt, const BaseExpr& b) + { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } + +#if (!EIGEN_HAS_CXX14) + template + AddExpr > > operator+(internal::FixedInt (*)()) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + AddExpr > > operator-(internal::FixedInt (*)()) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + ProductExpr > > operator*(internal::FixedInt (*)()) const + { return ProductExpr > >(derived(),ValueExpr >()); } + template + QuotientExpr > > operator/(internal::FixedInt (*)()) const + { return QuotientExpr > >(derived(),ValueExpr >()); } + + template + friend AddExpr > > operator+(internal::FixedInt (*)(), const BaseExpr& b) + { return AddExpr > >(b.derived(), ValueExpr >()); } + template + friend AddExpr,ValueExpr > > operator-(internal::FixedInt (*)(), const BaseExpr& b) + { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } + template + friend ProductExpr >,Derived> operator*(internal::FixedInt (*)(), const BaseExpr& b) + { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } + template + friend QuotientExpr >,Derived> operator/(internal::FixedInt (*)(), const BaseExpr& b) + { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } +#endif + + + template + AddExpr operator+(const BaseExpr &b) const + { return AddExpr(derived(), b.derived()); } + + template + AddExpr > operator-(const BaseExpr &b) const + { return AddExpr >(derived(), -b.derived()); } + + template + ProductExpr operator*(const BaseExpr &b) const + { return ProductExpr(derived(), b.derived()); } + + template + QuotientExpr operator/(const BaseExpr &b) const + { return QuotientExpr(derived(), b.derived()); } +}; + +template +struct is_symbolic { + // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr. + enum { value = internal::is_convertible >::value }; +}; + +/** Represents the actual value of a symbol identified by its tag + * + * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used. + */ +template +class SymbolValue +{ +public: + /** Default constructor from the value \a val */ + SymbolValue(Index val) : m_value(val) {} + + /** \returns the stored value of the symbol */ + Index value() const { return m_value; } +protected: + Index m_value; +}; + +/** Expression of a symbol uniquely identified by the template parameter type \c tag */ +template +class SymbolExpr : public BaseExpr > +{ +public: + /** Alias to the template parameter \c tag */ + typedef tag Tag; + + SymbolExpr() {} + + /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag. + * + * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified runtime-time value. + */ + SymbolValue operator=(Index val) const { + return SymbolValue(val); + } + + Index eval_impl(const SymbolValue &values) const { return values.value(); } + +#if EIGEN_HAS_CXX14 + // C++14 versions suitable for multiple symbols + template + Index eval_impl(const std::tuple& values) const { return std::get >(values).value(); } +#endif +}; + +template +class NegateExpr : public BaseExpr > +{ +public: + NegateExpr(const Arg0& arg0) : m_arg0(arg0) {} + + template + Index eval_impl(const T& values) const { return -m_arg0.eval_impl(values); } +protected: + Arg0 m_arg0; +}; + +template +class AddExpr : public BaseExpr > +{ +public: + AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + + template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) + m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +template +class ProductExpr : public BaseExpr > +{ +public: + ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + + template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) * m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +template +class QuotientExpr : public BaseExpr > +{ +public: + QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + + template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) / m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +} // end namespace symbolic + +} // end namespace Eigen + +#endif // EIGEN_SYMBOLIC_INDEX_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/XprHelper.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/XprHelper.h index 6bb4970828..71c32b8a11 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/XprHelper.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Core/util/XprHelper.h @@ -49,6 +49,12 @@ template struct is_valid_index_type }; }; +// true if both types are not valid index types +template +struct valid_indexed_view_overload { + enum { value = !(internal::is_valid_index_type::value && internal::is_valid_index_type::value) }; +}; + // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar // Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression. @@ -123,19 +129,23 @@ struct promote_index_type template class variable_if_dynamic { public: - EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + operator T() const { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } }; template class variable_if_dynamic { T m_value; - EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); } public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; } }; @@ -146,8 +156,10 @@ template class variable_if_dynamicindex public: EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void setValue(T) {} }; template class variable_if_dynamicindex @@ -172,16 +184,7 @@ template struct functor_traits template struct packet_traits; -template struct unpacket_traits -{ - typedef T type; - typedef T half; - enum - { - size = 1, - alignment = 1 - }; -}; +template struct unpacket_traits; template::size)==0 || is_same::half>::value> @@ -400,7 +403,7 @@ template struct plain_matrix_type_row_major typedef Matrix::Scalar, Rows, Cols, - (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor, + (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor, MaxRows, MaxCols > type; @@ -417,7 +420,7 @@ struct ref_selector T const&, const T >::type type; - + typedef typename conditional< bool(traits::Flags & NestByRefBit), T &, @@ -455,7 +458,7 @@ template { enum { ScalarReadCost = NumTraits::Scalar>::ReadCost, - CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory? + CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary? // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1. // This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON // for all evaluator creating a temporary. This flag is then propagated by the parent evaluators. @@ -596,14 +599,14 @@ template MatrixRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType; typedef Array ArrayRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType; typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixRowType, - ArrayRowType + ArrayRowType >::type type; }; @@ -618,7 +621,7 @@ struct plain_col_type typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixColType, - ArrayColType + ArrayColType >::type type; }; @@ -634,7 +637,7 @@ struct plain_diag_type typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixDiagType, - ArrayDiagType + ArrayDiagType >::type type; }; @@ -671,24 +674,39 @@ template struct is_diagonal > template struct is_diagonal > { enum { ret = true }; }; + +template struct is_identity +{ enum { value = false }; }; + +template struct is_identity, T> > +{ enum { value = true }; }; + + template struct glue_shapes; template<> struct glue_shapes { typedef TriangularShape type; }; template -bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::ret&&has_direct_access::ret, T1>::type * = 0) +struct possibly_same_dense { + enum { value = has_direct_access::ret && has_direct_access::ret && is_same::value }; +}; + +template +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template -bool is_same_dense(const T1 &, const T2 &, typename enable_if::ret&&has_direct_access::ret), T1>::type * = 0) +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) { return false; } // Internal helper defining the cost of a scalar division for the type T. // The default heuristic can be specialized for each scalar type and architecture. -template +template struct scalar_div_cost { enum { value = 8*NumTraits::MulCost }; }; @@ -735,7 +753,7 @@ std::string demangle_flags(int f) if(f&DirectAccessBit) res += " | Direct"; if(f&NestByRefBit) res += " | NestByRef"; if(f&NoPreferredStorageOrderBit) res += " | NoPreferredStorageOrderBit"; - + return res; } #endif @@ -832,7 +850,7 @@ struct ScalarBinaryOpTraits #define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \ EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType >::value), \ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - + } // end namespace Eigen #endif // EIGEN_XPRHELPER_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/ComplexEigenSolver.h index dc5fae06a3..081e918f13 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -214,7 +214,7 @@ template class ComplexEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/ComplexSchur.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/ComplexSchur.h index 4354e4018f..fc71468f8d 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/ComplexSchur.h @@ -212,7 +212,7 @@ template class ComplexSchur /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/EigenSolver.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/EigenSolver.h index f205b185de..572b29e4e9 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/EigenSolver.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/EigenSolver.h @@ -110,7 +110,7 @@ template class EigenSolver * * \sa compute() for an example. */ - EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {} + EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {} /** \brief Default constructor with memory preallocation * @@ -277,7 +277,7 @@ template class EigenSolver template EigenSolver& compute(const EigenBase& matrix, bool computeEigenvectors = true); - /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h index 5f6bb82898..d0f9091beb 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h @@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT * * \returns Reference to \c *this * - * Accoring to \p options, this function computes eigenvalues and (if requested) + * According to \p options, this function computes eigenvalues and (if requested) * the eigenvectors of one of the following three generalized eigenproblems: * - \c Ax_lBx: \f$ Ax = \lambda B x \f$ * - \c ABx_lx: \f$ ABx = \lambda x \f$ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/HessenbergDecomposition.h index f647f69b06..1f21139346 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -267,7 +267,7 @@ template class HessenbergDecomposition private: - typedef Matrix VectorType; + typedef Matrix VectorType; typedef typename NumTraits::Real RealScalar; static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp); @@ -315,7 +315,7 @@ void HessenbergDecomposition::_compute(MatrixType& matA, CoeffVector // A = A H' matA.rightCols(remainingSize) - .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0)); + .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0)); } } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index e4e4260711..66e5a3dbb0 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -84,7 +84,7 @@ MatrixBase::eigenvalues() const * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues() */ template -inline typename SelfAdjointView::EigenvaluesReturnType +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { PlainObject thisAsMatrix(*this); @@ -147,7 +147,7 @@ MatrixBase::operatorNorm() const * \sa eigenvalues(), MatrixBase::operatorNorm() */ template -inline typename SelfAdjointView::RealScalar +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::RealScalar SelfAdjointView::operatorNorm() const { return eigenvalues().cwiseAbs().maxCoeff(); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/RealQZ.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/RealQZ.h index b3a910dd9f..509130184b 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/RealQZ.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/RealQZ.h @@ -90,8 +90,9 @@ namespace Eigen { m_Z(size, size), m_workspace(size*2), m_maxIters(400), - m_isInitialized(false) - { } + m_isInitialized(false), + m_computeQZ(true) + {} /** \brief Constructor; computes real QZ decomposition of given matrices * @@ -108,9 +109,11 @@ namespace Eigen { m_Z(A.rows(),A.cols()), m_workspace(A.rows()*2), m_maxIters(400), - m_isInitialized(false) { - compute(A, B, computeQZ); - } + m_isInitialized(false), + m_computeQZ(true) + { + compute(A, B, computeQZ); + } /** \brief Returns matrix Q in the QZ decomposition. * @@ -161,7 +164,7 @@ namespace Eigen { /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/RealSchur.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/RealSchur.h index 9191519abe..7304ef3449 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/RealSchur.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/RealSchur.h @@ -190,7 +190,7 @@ template class RealSchur RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU); /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { @@ -270,8 +270,13 @@ RealSchur& RealSchur::compute(const EigenBase // Step 1. Reduce to Hessenberg form m_hess.compute(matrix.derived()/scale); - // Step 2. Reduce to real Schur form - computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU); + // Step 2. Reduce to real Schur form + // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg + // to be able to pass our working-space buffer for the Householder to Dense evaluation. + m_workspaceVector.resize(matrix.cols()); + if(computeU) + m_hess.matrixQ().evalTo(m_matU, m_workspaceVector); + computeFromHessenberg(m_hess.matrixH(), m_matU, computeU); m_matT *= scale; @@ -284,13 +289,13 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa using std::abs; m_matT = matrixH; - if(computeU) + m_workspaceVector.resize(m_matT.cols()); + if(computeU && !internal::is_same_dense(m_matU,matrixQ)) m_matU = matrixQ; Index maxIters = m_maxIters; if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrixH.rows(); - m_workspaceVector.resize(m_matT.cols()); Scalar* workspace = &m_workspaceVector.coeffRef(0); // The matrix m_matT is divided in three parts. diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index d37656fa20..14692365ff 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template struct direct_selfadjoint_eigenvalues; + template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec); } @@ -42,10 +44,14 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag * \f$ v \f$ such that \f$ Av = \lambda v \f$. The eigenvalues of a * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the - * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$ (for selfadjoint - * matrices, the matrix \f$ V \f$ is always invertible). This is called the + * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$. This is called the * eigendecomposition. * + * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal + * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then + * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is + * equal to its transpose, \f$ V^{-1} = V^T \f$. + * * The algorithm exploits the fact that the matrix is selfadjoint, making it * faster and more accurate than the general purpose eigenvalue algorithms * implemented in EigenSolver and ComplexEigenSolver. @@ -119,7 +125,10 @@ template class SelfAdjointEigenSolver : m_eivec(), m_eivalues(), m_subdiag(), - m_isInitialized(false) + m_hcoeffs(), + m_info(InvalidInput), + m_isInitialized(false), + m_eigenvectorsOk(false) { } /** \brief Constructor, pre-allocates memory for dynamic-size matrices. @@ -139,7 +148,9 @@ template class SelfAdjointEigenSolver : m_eivec(size, size), m_eivalues(size), m_subdiag(size > 1 ? size - 1 : 1), - m_isInitialized(false) + m_hcoeffs(size > 1 ? size - 1 : 1), + m_isInitialized(false), + m_eigenvectorsOk(false) {} /** \brief Constructor; computes eigendecomposition of given matrix. @@ -163,7 +174,9 @@ template class SelfAdjointEigenSolver : m_eivec(matrix.rows(), matrix.cols()), m_eivalues(matrix.cols()), m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1), - m_isInitialized(false) + m_hcoeffs(matrix.cols() > 1 ? matrix.cols() - 1 : 1), + m_isInitialized(false), + m_eigenvectorsOk(false) { compute(matrix.derived(), options); } @@ -250,6 +263,11 @@ template class SelfAdjointEigenSolver * matrix \f$ A \f$, then the matrix returned by this function is the * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$. * + * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal + * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then + * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is + * equal to its transpose, \f$ V^{-1} = V^T \f$. + * * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out * @@ -337,7 +355,7 @@ template class SelfAdjointEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ EIGEN_DEVICE_FUNC ComputationInfo info() const @@ -354,7 +372,8 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static void check_template_parameters() + static EIGEN_DEVICE_FUNC + void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -362,6 +381,7 @@ template class SelfAdjointEigenSolver EigenvectorsType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; + typename TridiagonalizationType::CoeffVectorType m_hcoeffs; ComputationInfo m_info; bool m_isInitialized; bool m_eigenvectorsOk; @@ -403,7 +423,7 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver const InputType &matrix(a_matrix.derived()); - using std::abs; + EIGEN_USING_STD(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -434,7 +454,8 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver if(scale==RealScalar(0)) scale = RealScalar(1); mat.template triangularView() /= scale; m_subdiag.resize(n-1); - internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors); + m_hcoeffs.resize(n-1); + internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, computeEigenvectors); m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec); @@ -479,10 +500,9 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - using std::abs; - ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -493,15 +513,23 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag typedef typename DiagType::RealScalar RealScalar; const RealScalar considerAsZero = (std::numeric_limits::min)(); - const RealScalar precision = RealScalar(2)*NumTraits::epsilon(); - + const RealScalar precision_inv = RealScalar(1)/NumTraits::epsilon(); while (end>0) { - for (Index i = start; i0 && subdiag[end-1]==RealScalar(0)) { end--; @@ -535,7 +563,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - std::swap(diag[i], diag[k+i]); + numext::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -566,10 +594,10 @@ template struct direct_selfadjoint_eigenvalues struct direct_selfadjoint_eigenvalues res, Ref representative) { - EIGEN_USING_STD_MATH(sqrt) - EIGEN_USING_STD_MATH(abs) + EIGEN_USING_STD(abs); + EIGEN_USING_STD(sqrt); Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -720,7 +748,7 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void computeRoots(const MatrixType& m, VectorType& roots) { - using std::sqrt; + EIGEN_USING_STD(sqrt); const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0))); const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1)); roots(0) = t1 - t0; @@ -730,8 +758,8 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void run(SolverType& solver, const MatrixType& mat, int options) { - EIGEN_USING_STD_MATH(sqrt); - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(sqrt); + EIGEN_USING_STD(abs); eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 @@ -804,32 +832,38 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver } namespace internal { + +// Francis implicit QR step. template EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - using std::abs; + // Wilkinson Shift. RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still // underflow thus leading to inf/NaN values when using the following commented code: -// RealScalar e2 = numext::abs2(subdiag[end-1]); -// RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2)); + // RealScalar e2 = numext::abs2(subdiag[end-1]); + // RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2)); // This explain the following, somewhat more complicated, version: RealScalar mu = diag[end]; - if(td==RealScalar(0)) - mu -= abs(e); - else - { - RealScalar e2 = numext::abs2(subdiag[end-1]); - RealScalar h = numext::hypot(td,e); - if(e2==RealScalar(0)) mu -= (e / (td + (td>RealScalar(0) ? RealScalar(1) : RealScalar(-1)))) * (e / h); - else mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); + if(td==RealScalar(0)) { + mu -= numext::abs(e); + } else if (e != RealScalar(0)) { + const RealScalar e2 = numext::abs2(e); + const RealScalar h = numext::hypot(td,e); + if(e2 == RealScalar(0)) { + mu -= e / ((td + (td>RealScalar(0) ? h : -h)) / e); + } else { + mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); + } } - + RealScalar x = diag[start] - mu; RealScalar z = subdiag[start]; - for (Index k = start; k < end; ++k) + // If z ever becomes zero, the Givens rotation will be the identity and + // z will stay zero for all future iterations. + for (Index k = start; k < end && z != RealScalar(0); ++k) { JacobiRotation rot; rot.makeGivens(x, z); @@ -842,12 +876,11 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta diag[k+1] = rot.s() * sdk + rot.c() * dkp1; subdiag[k] = rot.c() * sdk - rot.s() * dkp1; - if (k > start) subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z; + // "Chasing the bulge" to return to triangular form. x = subdiag[k]; - if (k < end - 1) { z = -rot.s() * subdiag[k+1]; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/Tridiagonalization.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/Tridiagonalization.h index 1d102c17bc..674c92a39b 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -11,10 +11,10 @@ #ifndef EIGEN_TRIDIAGONALIZATION_H #define EIGEN_TRIDIAGONALIZATION_H -namespace Eigen { +namespace Eigen { namespace internal { - + template struct TridiagonalizationMatrixTReturnType; template struct traits > @@ -25,6 +25,7 @@ struct traits > }; template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -344,6 +345,7 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -352,7 +354,7 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) Index n = matA.rows(); eigen_assert(n==matA.cols()); eigen_assert(n==hCoeffs.size()+1 || n==1); - + for (Index i = 0; i -void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) +template +EIGEN_DEVICE_FUNC +void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, + CoeffVectorType& hcoeffs, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); - tridiagonalization_inplace_selector::run(mat, diag, subdiag, extractQ); + tridiagonalization_inplace_selector::run(mat, diag, subdiag, hcoeffs, extractQ); } /** \internal @@ -439,10 +443,10 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { - CoeffVectorType hCoeffs(mat.cols()-1); - tridiagonalization_inplace(mat,hCoeffs); + tridiagonalization_inplace(mat, hCoeffs); diag = mat.diagonal().real(); subdiag = mat.template diagonal<-1>().real(); if(extractQ) @@ -462,8 +466,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + template + static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, bool extractQ) { using std::sqrt; const RealScalar tol = (std::numeric_limits::min)(); @@ -507,8 +511,9 @@ struct tridiagonalization_inplace_selector { typedef typename MatrixType::Scalar Scalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + template + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) @@ -542,8 +547,8 @@ template struct TridiagonalizationMatrixTReturnType result.template diagonal<-1>() = m_matrix.template diagonal<-1>(); } - Index rows() const { return m_matrix.rows(); } - Index cols() const { return m_matrix.cols(); } + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } protected: typename MatrixType::Nested m_matrix; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/BlockHouseholder.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/BlockHouseholder.h index 01a7ed1884..39ce1c2a0e 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/BlockHouseholder.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/BlockHouseholder.h @@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() * vectors.bottomRightCorner(rs, rt).template triangularView(); - // FIXME add .noalias() once the triangular product can work inplace - triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); + // FIXME use the following line with .noalias() once the triangular product can work inplace + // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); + for(Index j=nbVecs-1; j>i; --j) + { + typename TriangularFactorType::Scalar z = triFactor(i,j); + triFactor(i,j) = z * triFactor(j,j); + if(nbVecs-j-1>0) + triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1); + } } triFactor(i,i) = hCoeffs(i); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/Householder.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/Householder.h index 80de2c3052..5bc037f00d 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/Householder.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/Householder.h @@ -39,6 +39,7 @@ template struct decrement_size * MatrixBase::applyHouseholderOnTheRight() */ template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock::ret> essentialPart(derived(), 1, size()-1); @@ -62,6 +63,7 @@ void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -103,13 +105,14 @@ void MatrixBase::makeHouseholder( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->cols() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheRight() */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -140,13 +143,14 @@ void MatrixBase::applyHouseholderOnTheLeft( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->rows() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheLeft() */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, @@ -160,10 +164,10 @@ void MatrixBase::applyHouseholderOnTheRight( { Map::type> tmp(workspace,rows()); Block right(derived(), 0, 1, rows(), cols()-1); - tmp.noalias() = right * essential.conjugate(); + tmp.noalias() = right * essential; tmp += this->col(0); this->col(0) -= tau * tmp; - right.noalias() -= tau * tmp * essential.transpose(); + right.noalias() -= tau * tmp * essential.adjoint(); } } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/HouseholderSequence.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/HouseholderSequence.h index 3ce0a693d9..022f6c3dba 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/HouseholderSequence.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Householder/HouseholderSequence.h @@ -11,7 +11,7 @@ #ifndef EIGEN_HOUSEHOLDER_SEQUENCE_H #define EIGEN_HOUSEHOLDER_SEQUENCE_H -namespace Eigen { +namespace Eigen { /** \ingroup Householder_Module * \householder_module @@ -34,8 +34,8 @@ namespace Eigen { * form \f$ H = \prod_{i=0}^{n-1} H_i \f$ where the i-th Householder reflection is \f$ H_i = I - h_i v_i * v_i^* \f$. The i-th Householder coefficient \f$ h_i \f$ is a scalar and the i-th Householder vector \f$ * v_i \f$ is a vector of the form - * \f[ - * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. + * \f[ + * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. * \f] * The last \f$ n-i \f$ entries of \f$ v_i \f$ are called the essential part of the Householder vector. * @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block EssentialVectorType; typedef HouseholderSequence HouseholderSequenceType; - static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block(h.m_vectors, start, k, h.rows()-start, 1); @@ -120,7 +120,7 @@ template class HouseholderS : public EigenBase > { typedef typename internal::hseq_side_dependent_impl::EssentialVectorType EssentialVectorType; - + public: enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, @@ -140,6 +140,28 @@ template class HouseholderS Side > ConjugateReturnType; + typedef HouseholderSequence< + VectorsType, + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + CoeffsType>::type, + Side + > AdjointReturnType; + + typedef HouseholderSequence< + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + VectorsType>::type, + CoeffsType, + Side + > TransposeReturnType; + + typedef HouseholderSequence< + typename internal::add_const::type, + typename internal::add_const::type, + Side + > ConstHouseholderSequence; + /** \brief Constructor. * \param[in] v %Matrix containing the essential parts of the Householder vectors * \param[in] h Vector containing the Householder coefficients @@ -157,33 +179,37 @@ template class HouseholderS * * \sa setLength(), setShift() */ + EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) - : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()), + : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) { } /** \brief Copy constructor. */ + EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), - m_trans(other.m_trans), + m_reverse(other.m_reverse), m_length(other.m_length), m_shift(other.m_shift) { } /** \brief Number of rows of transformation viewed as a matrix. - * \returns Number of rows + * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ - Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ - Index cols() const { return rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return rows(); } /** \brief Essential part of a Householder vector. * \param[in] k Index of Householder reflection @@ -191,14 +217,15 @@ template class HouseholderS * * This function returns the essential part of the Householder vector \f$ v_i \f$. This is a vector of * length \f$ n-i \f$ containing the last \f$ n-i \f$ entries of the vector - * \f[ - * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. + * \f[ + * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. * \f] * The index \f$ i \f$ equals \p k + shift(), corresponding to the k-th column of the matrix \p v * passed to the constructor. * * \sa setShift(), shift() */ + EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -206,31 +233,51 @@ template class HouseholderS } /** \brief %Transpose of the Householder sequence. */ - HouseholderSequence transpose() const + TransposeReturnType transpose() const { - return HouseholderSequence(*this).setTrans(!m_trans); + return TransposeReturnType(m_vectors.conjugate(), m_coeffs) + .setReverseFlag(!m_reverse) + .setLength(m_length) + .setShift(m_shift); } /** \brief Complex conjugate of the Householder sequence. */ ConjugateReturnType conjugate() const { return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate()) - .setTrans(m_trans) + .setReverseFlag(m_reverse) .setLength(m_length) .setShift(m_shift); } - /** \brief Adjoint (conjugate transpose) of the Householder sequence. */ - ConjugateReturnType adjoint() const + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const { - return conjugate().setTrans(!m_trans); + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_vectors.template conjugateIf(), m_coeffs.template conjugateIf()); + } + + /** \brief Adjoint (conjugate transpose) of the Householder sequence. */ + AdjointReturnType adjoint() const + { + return AdjointReturnType(m_vectors, m_coeffs.conjugate()) + .setReverseFlag(!m_reverse) + .setLength(m_length) + .setShift(m_shift); } /** \brief Inverse of the Householder sequence (equals the adjoint). */ - ConjugateReturnType inverse() const { return adjoint(); } + AdjointReturnType inverse() const { return adjoint(); } /** \internal */ - template inline void evalTo(DestType& dst) const + template + inline EIGEN_DEVICE_FUNC + void evalTo(DestType& dst) const { Matrix workspace(rows()); @@ -239,6 +286,7 @@ template class HouseholderS /** \internal */ template + EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -251,7 +299,7 @@ template class HouseholderS for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else @@ -265,18 +313,26 @@ template class HouseholderS for(Index k = 0; kBlockSize) + { + dst.setIdentity(rows(), rows()); + if(m_reverse) + applyThisOnTheLeft(dst,workspace,true); + else + applyThisOnTheLeft(dst,workspace,true); + } else { dst.setIdentity(rows(), rows()); for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data()); } } } @@ -295,42 +351,52 @@ template class HouseholderS workspace.resize(dst.rows()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? m_length-k-1 : k; + Index actual_k = m_reverse ? m_length-k-1 : k; dst.rightCols(rows()-m_shift-actual_k) .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } /** \internal */ - template inline void applyThisOnTheLeft(Dest& dst) const + template inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const { Matrix workspace; - applyThisOnTheLeft(dst, workspace); + applyThisOnTheLeft(dst, workspace, inputIsIdentity); } /** \internal */ template - inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const + inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const { - const Index BlockSize = 48; + if(inputIsIdentity && m_reverse) + inputIsIdentity = false; // if the entries are large enough, then apply the reflectors by block if(m_length>=BlockSize && dst.cols()>1) { - for(Index i = 0; i < m_length; i+=BlockSize) + // Make sure we have at least 2 useful blocks, otherwise it is point-less: + Index blockSize = m_length::type,Dynamic,Dynamic> SubVectorsType; SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side==OnTheRight ? k : start, Side==OnTheRight ? start : k, Side==OnTheRight ? bs : m_vectors.rows()-start, Side==OnTheRight ? m_vectors.cols()-start : bs); typename internal::conditional, SubVectorsType&>::type sub_vecs(sub_vecs1); - Block sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols()); - apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans); + + Index dstStart = dst.rows()-rows()+m_shift+k; + Index dstRows = rows()-m_shift-k; + Block sub_dst(dst, + dstStart, + inputIsIdentity ? dstStart : 0, + dstRows, + inputIsIdentity ? dstRows : dst.cols()); + apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); } } else @@ -338,8 +404,9 @@ template class HouseholderS workspace.resize(dst.cols()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? k : m_length-k-1; - dst.bottomRows(rows()-m_shift-actual_k) + Index actual_k = m_reverse ? k : m_length-k-1; + Index dstStart = rows()-m_shift-actual_k; + dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols()) .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } @@ -357,7 +424,7 @@ template class HouseholderS { typename internal::matrix_type_times_scalar_type::Type res(other.template cast::ResultScalar>()); - applyThisOnTheLeft(res); + applyThisOnTheLeft(res, internal::is_identity::value && res.rows()==res.cols()); return res; } @@ -372,6 +439,7 @@ template class HouseholderS * * \sa length() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -389,13 +457,17 @@ template class HouseholderS * * \sa shift() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } + EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ + + EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */ /* Necessary for .adjoint() and .conjugate() */ @@ -403,27 +475,30 @@ template class HouseholderS protected: - /** \brief Sets the transpose flag. - * \param [in] trans New value of the transpose flag. + /** \internal + * \brief Sets the reverse flag. + * \param [in] reverse New value of the reverse flag. * - * By default, the transpose flag is not set. If the transpose flag is set, then this object represents - * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * By default, the reverse flag is not set. If the reverse flag is set, then this object represents + * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$. * - * \sa trans() + * \sa reverseFlag(), transpose(), adjoint() */ - HouseholderSequence& setTrans(bool trans) + HouseholderSequence& setReverseFlag(bool reverse) { - m_trans = trans; + m_reverse = reverse; return *this; } - bool trans() const { return m_trans; } /**< \brief Returns the transpose flag. */ + bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. */ typename VectorsType::Nested m_vectors; typename CoeffsType::Nested m_coeffs; - bool m_trans; + bool m_reverse; Index m_length; Index m_shift; + enum { BlockSize = 48 }; }; /** \brief Computes the product of a matrix with a Householder sequence. @@ -444,7 +519,7 @@ typename internal::matrix_type_times_scalar_type @@ -454,7 +529,7 @@ HouseholderSequence householderSequence(const VectorsTyp } /** \ingroup Householder_Module \householder_module - * \brief Convenience function for constructing a Householder sequence. + * \brief Convenience function for constructing a Householder sequence. * \returns A HouseholderSequence constructed from the specified arguments. * \details This function differs from householderSequence() in that the template argument \p OnTheSide of * the constructed HouseholderSequence is set to OnTheRight, instead of the default OnTheLeft. diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/Jacobi/Jacobi.h b/wpimath/src/main/native/eigeninclude/Eigen/src/Jacobi/Jacobi.h index 1998c63227..76668a574a 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/Jacobi/Jacobi.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/Jacobi/Jacobi.h @@ -11,7 +11,7 @@ #ifndef EIGEN_JACOBI_H #define EIGEN_JACOBI_H -namespace Eigen { +namespace Eigen { /** \ingroup Jacobi_Module * \jacobi_module @@ -37,17 +37,20 @@ template class JacobiRotation typedef typename NumTraits::Real RealScalar; /** Default constructor without any initialization. */ + EIGEN_DEVICE_FUNC JacobiRotation() {} /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */ + EIGEN_DEVICE_FUNC JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {} - Scalar& c() { return m_c; } - Scalar c() const { return m_c; } - Scalar& s() { return m_s; } - Scalar s() const { return m_s; } + EIGEN_DEVICE_FUNC Scalar& c() { return m_c; } + EIGEN_DEVICE_FUNC Scalar c() const { return m_c; } + EIGEN_DEVICE_FUNC Scalar& s() { return m_s; } + EIGEN_DEVICE_FUNC Scalar s() const { return m_s; } /** Concatenates two planar rotation */ + EIGEN_DEVICE_FUNC JacobiRotation operator*(const JacobiRotation& other) { using numext::conj; @@ -56,19 +59,26 @@ template class JacobiRotation } /** Returns the transposed transformation */ + EIGEN_DEVICE_FUNC JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); } /** Returns the adjoint transformation */ + EIGEN_DEVICE_FUNC JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); } template + EIGEN_DEVICE_FUNC bool makeJacobi(const MatrixBase&, Index p, Index q); + EIGEN_DEVICE_FUNC bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0); protected: + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type); + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type); Scalar m_c, m_s; @@ -80,10 +90,12 @@ template class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) { using std::sqrt; using std::abs; + RealScalar deno = RealScalar(2)*abs(y); if(deno < (std::numeric_limits::min)()) { @@ -123,6 +135,7 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co */ template template +EIGEN_DEVICE_FUNC inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -145,6 +158,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) { makeGivens(p, q, r, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); @@ -153,12 +167,13 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for complexes template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; using std::abs; using numext::conj; - + if(q==Scalar(0)) { m_c = numext::real(p)<0 ? Scalar(-1) : Scalar(1); @@ -212,6 +227,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -257,12 +273,13 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar namespace internal { /** \jacobi_module - * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y: + * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y: * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right ) = J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$ * * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j); } @@ -274,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRotation& j) { RowXpr x(this->row(p)); @@ -289,6 +307,7 @@ inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRo */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiRotation& j) { ColXpr x(this->col(p)); @@ -302,7 +321,8 @@ template struct apply_rotation_in_the_plane_selector { - static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + static EIGEN_DEVICE_FUNC + inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) { for(Index i=0; i +EIGEN_DEVICE_FUNC void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; - const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit) + const bool Vectorizable = (int(VectorX::Flags) & int(VectorY::Flags) & PacketAccessBit) && (int(packet_traits::size) == int(packet_traits::size)); eigen_assert(xpr_x.size() == xpr_y.size()); @@ -442,7 +463,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0); Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0); - + OtherScalar c = j.c(); OtherScalar s = j.s(); if (c==OtherScalar(1) && s==OtherScalar(0)) diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/Determinant.h b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/Determinant.h index d6a3c1e5a5..3a41e6fcba 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/Determinant.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase& matrix, int a, int b, int c) { @@ -22,14 +23,6 @@ inline const typename Derived::Scalar bruteforce_det3_helper * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b)); } -template -const typename Derived::Scalar bruteforce_det4_helper -(const MatrixBase& matrix, int j, int k, int m, int n) -{ - return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1)) - * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3)); -} - template struct determinant_impl @@ -44,7 +37,8 @@ template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 +46,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +55,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,15 +66,34 @@ template struct determinant_impl template struct determinant_impl { - static typename traits::Scalar run(const Derived& m) + typedef typename traits::Scalar Scalar; + static EIGEN_DEVICE_FUNC + Scalar run(const Derived& m) { - // trick by Martin Costabel to compute 4x4 det with only 30 muls - return bruteforce_det4_helper(m,0,1,2,3) - - bruteforce_det4_helper(m,0,2,1,3) - + bruteforce_det4_helper(m,0,3,1,2) - + bruteforce_det4_helper(m,1,2,0,3) - - bruteforce_det4_helper(m,1,3,0,2) - + bruteforce_det4_helper(m,2,3,0,1); + Scalar d2_01 = det2(m, 0, 1); + Scalar d2_02 = det2(m, 0, 2); + Scalar d2_03 = det2(m, 0, 3); + Scalar d2_12 = det2(m, 1, 2); + Scalar d2_13 = det2(m, 1, 3); + Scalar d2_23 = det2(m, 2, 3); + Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12); + Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02); + Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01); + Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01); + return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) + + internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3); + } +protected: + static EIGEN_DEVICE_FUNC + Scalar det2(const Derived& m, Index i0, Index i1) + { + return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1); + } + + static EIGEN_DEVICE_FUNC + Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2) + { + return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2)); } }; @@ -89,6 +104,7 @@ template struct determinant_impl * \returns the determinant of this matrix */ template +EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar MatrixBase::determinant() const { eigen_assert(rows() == cols()); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/FullPivLU.h b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/FullPivLU.h index 03b6af7061..ba1749fa69 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/FullPivLU.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/FullPivLU.h @@ -18,6 +18,7 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -48,12 +49,12 @@ template struct traits > * The data of the LU decomposition can be directly accessed through the methods matrixLU(), * permutationP(), permutationQ(). * - * As an exemple, here is how the original matrix can be retrieved: + * As an example, here is how the original matrix can be retrieved: * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ template class FullPivLU @@ -62,9 +63,9 @@ template class FullPivLU public: typedef _MatrixType MatrixType; typedef SolverBase Base; + friend class SolverBase; EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -218,6 +219,7 @@ template class FullPivLU return internal::image_retval(*this, originalMatrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \return a solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -237,14 +239,10 @@ template class FullPivLU * * \sa TriangularView::solve(), kernel(), inverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LU is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -320,7 +318,7 @@ template class FullPivLU return m_usePrescribedThreshold ? m_prescribedThreshold // this formula comes from experimenting (see "LU precision tuning" thread on the list) // and turns out to be identical to Higham's formula used already in LDLt. - : NumTraits::epsilon() * m_lu.diagonalSize(); + : NumTraits::epsilon() * RealScalar(m_lu.diagonalSize()); } /** \returns the rank of the matrix of which *this is the LU decomposition. @@ -406,16 +404,16 @@ template class FullPivLU MatrixType reconstructedMatrix() const; - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template - EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif @@ -531,8 +529,8 @@ void FullPivLU::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; ++i) { - m_rowsTranspositions.coeffRef(i) = i; - m_colsTranspositions.coeffRef(i) = i; + m_rowsTranspositions.coeffRef(i) = internal::convert_index(i); + m_colsTranspositions.coeffRef(i) = internal::convert_index(i); } break; } @@ -543,8 +541,8 @@ void FullPivLU::computeInPlace() // Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). - m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner; - m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner; + m_rowsTranspositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); + m_colsTranspositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner)); ++number_of_transpositions; @@ -757,7 +755,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == rows); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -807,7 +804,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == cols); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -821,29 +817,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType // Step 1 c = permutationQ().inverse() * rhs; - if (Conjugate) { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .adjoint() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .adjoint() - .solveInPlace(c.topRows(smalldim)); - } else { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .transpose() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .transpose() - .solveInPlace(c.topRows(smalldim)); - } + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose() + .template conjugateIf() + .solveInPlace(c.topRows(nonzero_pivots)); + + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + .template triangularView() + .transpose() + .template conjugateIf() + .solveInPlace(c.topRows(smalldim)); // Step 4 PermutationPType invp = permutationP().inverse().eval(); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/InverseImpl.h b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/InverseImpl.h index f49f233600..a40cefa9e7 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/InverseImpl.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/InverseImpl.h @@ -77,10 +77,11 @@ inline void compute_inverse_size2_helper( const MatrixType& matrix, const typename ResultType::Scalar& invdet, ResultType& result) { + typename ResultType::Scalar temp = matrix.coeff(0,0); result.coeffRef(0,0) = matrix.coeff(1,1) * invdet; result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet; result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet; - result.coeffRef(1,1) = matrix.coeff(0,0) * invdet; + result.coeffRef(1,1) = temp * invdet; } template @@ -143,13 +144,18 @@ inline void compute_inverse_size3_helper( const Matrix& cofactors_col0, ResultType& result) { - result.row(0) = cofactors_col0 * invdet; - result.coeffRef(1,0) = cofactor_3x3(matrix) * invdet; - result.coeffRef(1,1) = cofactor_3x3(matrix) * invdet; + // Compute cofactors in a way that avoids aliasing issues. + typedef typename ResultType::Scalar Scalar; + const Scalar c01 = cofactor_3x3(matrix) * invdet; + const Scalar c11 = cofactor_3x3(matrix) * invdet; + const Scalar c02 = cofactor_3x3(matrix) * invdet; result.coeffRef(1,2) = cofactor_3x3(matrix) * invdet; - result.coeffRef(2,0) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,1) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,2) = cofactor_3x3(matrix) * invdet; + result.coeffRef(1,0) = c01; + result.coeffRef(1,1) = c11; + result.coeffRef(2,0) = c02; + result.row(0) = cofactors_col0 * invdet; } template @@ -181,14 +187,13 @@ struct compute_inverse_and_det_with_check bool& invertible ) { - using std::abs; typedef typename ResultType::Scalar Scalar; Matrix cofactors_col0; cofactors_col0.coeffRef(0) = cofactor_3x3(matrix); cofactors_col0.coeffRef(1) = cofactor_3x3(matrix); cofactors_col0.coeffRef(2) = cofactor_3x3(matrix); determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); - invertible = abs(determinant) > absDeterminantThreshold; + invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold; if(!invertible) return; const Scalar invdet = Scalar(1) / determinant; compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); @@ -273,7 +278,13 @@ struct compute_inverse_and_det_with_check using std::abs; determinant = matrix.determinant(); invertible = abs(determinant) > absDeterminantThreshold; - if(invertible) compute_inverse::run(matrix, inverse); + if(invertible && extract_data(matrix) != extract_data(inverse)) { + compute_inverse::run(matrix, inverse); + } + else if(invertible) { + MatrixType matrix_t = matrix; + compute_inverse::run(matrix_t, inverse); + } } }; @@ -290,6 +301,7 @@ template struct Assignment, internal::assign_op, Dense2Dense> { typedef Inverse SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -332,6 +344,7 @@ struct Assignment, internal::assign_op +EIGEN_DEVICE_FUNC inline const Inverse MatrixBase::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) @@ -345,6 +358,8 @@ inline const Inverse MatrixBase::inverse() const * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param determinant Reference to the variable in which to store the determinant. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. @@ -385,6 +400,8 @@ inline void MatrixBase::computeInverseAndDetWithCheck( * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/PartialPivLU.h b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/PartialPivLU.h index 6b10f39fab..34aed72494 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/PartialPivLU.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/PartialPivLU.h @@ -19,6 +19,7 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; typedef traits<_MatrixType> BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, @@ -69,7 +70,7 @@ struct enable_if_ref,Derived> { * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP(). * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU */ template class PartialPivLU @@ -79,8 +80,9 @@ template class PartialPivLU typedef _MatrixType MatrixType; typedef SolverBase Base; + friend class SolverBase; + EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -152,6 +154,7 @@ template class PartialPivLU return m_p; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method returns the solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -169,14 +172,10 @@ template class PartialPivLU * * \sa TriangularView::solve(), inverse(), computeInverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -217,8 +216,8 @@ template class PartialPivLU MatrixType reconstructedMatrix() const; - inline Index rows() const { return m_lu.rows(); } - inline Index cols() const { return m_lu.cols(); } + EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template @@ -231,8 +230,6 @@ template class PartialPivLU * Step 3: replace c by the solution x to Ux = c. */ - eigen_assert(rhs.rows() == m_lu.rows()); - // Step 1 dst = permutationP() * rhs; @@ -246,26 +243,21 @@ template class PartialPivLU template EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const { - /* The decomposition PA = LU can be rewritten as A = P^{-1} L U. + /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P. * So we proceed as follows: - * Step 1: compute c = Pb. - * Step 2: replace c by the solution x to Lx = c. - * Step 3: replace c by the solution x to Ux = c. + * Step 1: compute c as the solution to L^T c = b + * Step 2: replace c by the solution x to U^T x = c. + * Step 3: update c = P^-1 c. */ eigen_assert(rhs.rows() == m_lu.cols()); - if (Conjugate) { - // Step 1 - dst = m_lu.template triangularView().adjoint().solve(rhs); - // Step 2 - m_lu.template triangularView().adjoint().solveInPlace(dst); - } else { - // Step 1 - dst = m_lu.template triangularView().transpose().solve(rhs); - // Step 2 - m_lu.template triangularView().transpose().solveInPlace(dst); - } + // Step 1 + dst = m_lu.template triangularView().transpose() + .template conjugateIf().solve(rhs); + // Step 2 + m_lu.template triangularView().transpose() + .template conjugateIf().solveInPlace(dst); // Step 3 dst = permutationP().transpose() * dst; } @@ -339,17 +331,18 @@ PartialPivLU::PartialPivLU(EigenBase& matrix) namespace internal { /** \internal This is the blocked version of fullpivlu_unblocked() */ -template +template struct partial_lu_impl { - // FIXME add a stride to Map, so that the following mapping becomes easier, - // another option would be to create an expression being able to automatically - // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly - // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix, - // and Block. - typedef Map > MapLU; - typedef Block MatrixType; - typedef Block BlockType; + static const int UnBlockedBound = 16; + static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; + static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; + // Remaining rows and columns at compile-time: + static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic; + static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic; + typedef Matrix MatrixType; + typedef Ref MatrixTypeRef; + typedef Ref > BlockType; typedef typename MatrixType::RealScalar RealScalar; /** \internal performs the LU decomposition in-place of the matrix \a lu @@ -362,19 +355,22 @@ struct partial_lu_impl * * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. */ - static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) + static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) { typedef scalar_score_coeff_op Scoring; typedef typename Scoring::result_type Score; const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows,cols); + // For small compile-time matrices it is worth processing the last row separately: + // speedup: +100% for 2x2, +10% for others. + const Index endk = UnBlockedAtCompileTime ? size-1 : size; nb_transpositions = 0; Index first_zero_pivot = -1; - for(Index k = 0; k < size; ++k) + for(Index k = 0; k < endk; ++k) { - Index rrows = rows-k-1; - Index rcols = cols-k-1; + int rrows = internal::convert_index(rows-k-1); + int rcols = internal::convert_index(cols-k-1); Index row_of_biggest_in_col; Score biggest_in_corner @@ -391,9 +387,7 @@ struct partial_lu_impl ++nb_transpositions; } - // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k) - // overflow but not the actual quotient? - lu.col(k).tail(rrows) /= lu.coeff(k,k); + lu.col(k).tail(fix(rrows)) /= lu.coeff(k,k); } else if(first_zero_pivot==-1) { @@ -403,8 +397,18 @@ struct partial_lu_impl } if(k(rrows),fix(rcols)).noalias() -= lu.col(k).tail(fix(rrows)) * lu.row(k).tail(fix(rcols)); } + + // special handling of the last entry + if(UnBlockedAtCompileTime) + { + Index k = endk; + row_transpositions[k] = PivIndex(k); + if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1) + first_zero_pivot = k; + } + return first_zero_pivot; } @@ -420,18 +424,17 @@ struct partial_lu_impl * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * * \note This very low level interface using pointers, etc. is to: - * 1 - reduce the number of instanciations to the strict minimum - * 2 - avoid infinite recursion of the instanciations with Block > > + * 1 - reduce the number of instantiations to the strict minimum + * 2 - avoid infinite recursion of the instantiations with Block > > */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { - MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols); - MatrixType lu(lu1,0,0,rows,cols); + MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride)); const Index size = (std::min)(rows,cols); // if the matrix is too small, no blocking: - if(size<=16) + if(UnBlockedAtCompileTime || size<=UnBlockedBound) { return unblocked_lu(lu, row_transpositions, nb_transpositions); } @@ -457,12 +460,12 @@ struct partial_lu_impl // A00 | A01 | A02 // lu = A_0 | A_1 | A_2 = A10 | A11 | A12 // A20 | A21 | A22 - BlockType A_0(lu,0,0,rows,k); - BlockType A_2(lu,0,k+bs,rows,tsize); - BlockType A11(lu,k,k,bs,bs); - BlockType A12(lu,k,k+bs,bs,tsize); - BlockType A21(lu,k+bs,k,trows,bs); - BlockType A22(lu,k+bs,k+bs,trows,tsize); + BlockType A_0 = lu.block(0,0,rows,k); + BlockType A_2 = lu.block(0,k+bs,rows,tsize); + BlockType A11 = lu.block(k,k,bs,bs); + BlockType A12 = lu.block(k,k+bs,bs,tsize); + BlockType A21 = lu.block(k+bs,k,trows,bs); + BlockType A22 = lu.block(k+bs,k+bs,trows,tsize); PivIndex nb_transpositions_in_panel; // recursively call the blocked LU algorithm on [A11^T A21^T]^T @@ -501,11 +504,18 @@ struct partial_lu_impl template void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions) { + // Special-case of zero matrix. + if (lu.rows() == 0 || lu.cols() == 0) { + nb_transpositions = 0; + return; + } eigen_assert(lu.cols() == row_transpositions.size()); - eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); + eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl - + < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, + typename TranspositionType::StorageIndex, + EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)> ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/arch/InverseSize4.h b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/arch/InverseSize4.h new file mode 100644 index 0000000000..a232ffc0aa --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/arch/InverseSize4.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2001 Intel Corporation +// Copyright (C) 2010 Gael Guennebaud +// Copyright (C) 2009 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath. +// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M, +// adjugate of M and determinant of M respectively. M# is computed block-wise +// using specific formulae. For proof, see: +// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html +// Variable names are adopted from \src\LU\Inverse_SSE.h. +// +// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h +// comes from the following Intel's library: +// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ +// +// Here is the respective copyright and license statement: +// +// Copyright (c) 2001 Intel Corporation. +// +// Permition is granted to use, copy, distribute and prepare derivative works +// of this library for any purpose and without fee, provided, that the above +// copyright notice and this statement appear in all copies. +// Intel makes no representations about the suitability of this software for +// any purpose, and specifically disclaims all warranties. +// See LEGAL.TXT for all the legal information. +// +// TODO: Unify implementations of different data types (i.e. float and double). +#ifndef EIGEN_INVERSE_SIZE_4_H +#define EIGEN_INVERSE_SIZE_4_H + +namespace Eigen +{ +namespace internal +{ +template +struct compute_inverse_size4 +{ + enum + { + MatrixAlignment = traits::Alignment, + ResultAlignment = traits::Alignment, + StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) + }; + typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType; + + static void run(const MatrixType &mat, ResultType &result) + { + ActualMatrixType matrix(mat); + + const float* data = matrix.data(); + const Index stride = matrix.innerStride(); + Packet4f _L1 = ploadt(data); + Packet4f _L2 = ploadt(data + stride*4); + Packet4f _L3 = ploadt(data + stride*8); + Packet4f _L4 = ploadt(data + stride*12); + + // Four 2x2 sub-matrices of the input matrix + // input = [[A, B], + // [C, D]] + Packet4f A, B, C, D; + + if (!StorageOrdersMatch) + { + A = vec4f_unpacklo(_L1, _L2); + B = vec4f_unpacklo(_L3, _L4); + C = vec4f_unpackhi(_L1, _L2); + D = vec4f_unpackhi(_L3, _L4); + } + else + { + A = vec4f_movelh(_L1, _L2); + B = vec4f_movehl(_L2, _L1); + C = vec4f_movelh(_L3, _L4); + D = vec4f_movehl(_L4, _L3); + } + + Packet4f AB, DC; + + // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. + AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B); + AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1))); + + // DC = D#*C + DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C); + DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1))); + + // determinants of the sub-matrices + Packet4f dA, dB, dC, dD; + + dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A); + dA = psub(dA, vec4f_movehl(dA, dA)); + + dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B); + dB = psub(dB, vec4f_movehl(dB, dB)); + + dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C); + dC = psub(dC, vec4f_movehl(dC, dC)); + + dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D); + dD = psub(dD, vec4f_movehl(dD, dD)); + + Packet4f d, d1, d2; + + d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB); + d = padd(d, vec4f_movehl(d, d)); + d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0)); + d1 = pmul(dA, dD); + d2 = pmul(dB, dC); + + // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) + Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0); + + // reciprocal of the determinant of the input matrix, rd = 1/det + Packet4f rd = pdiv(pset1(1.0f), det); + + // Four sub-matrices of the inverse + Packet4f iA, iB, iC, iD; + + // iD = D*|A| - C*A#*B + iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB)); + iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB))); + iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD); + + // iA = A*|D| - B*D#*C + iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC)); + iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC))); + iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA); + + // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A + iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0)); + iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1))); + iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB); + + // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D + iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0)); + iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1))); + iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC); + + const float sign_mask[4] = {0.0f, numext::bit_cast(0x80000000u), numext::bit_cast(0x80000000u), 0.0f}; + const Packet4f p4f_sign_PNNP = ploadu(sign_mask); + rd = pxor(rd, p4f_sign_PNNP); + iA = pmul(iA, rd); + iB = pmul(iB, rd); + iC = pmul(iC, rd); + iD = pmul(iD, rd); + + Index res_stride = result.outerStride(); + float *res = result.data(); + + pstoret(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1)); + pstoret(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0)); + pstoret(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1)); + pstoret(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0)); + } +}; + +#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG)) +// same algorithm as above, except that each operand is split into +// halves for two registers to hold. +template +struct compute_inverse_size4 +{ + enum + { + MatrixAlignment = traits::Alignment, + ResultAlignment = traits::Alignment, + StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) + }; + typedef typename conditional<(MatrixType::Flags & LinearAccessBit), + MatrixType const &, + typename MatrixType::PlainObject>::type + ActualMatrixType; + + static void run(const MatrixType &mat, ResultType &result) + { + ActualMatrixType matrix(mat); + + // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower + // row e.g. A1, upper row of A, A2, lower row of A + // input = [[A, B], = [[[A1, [B1, + // [C, D]] A2], B2]], + // [[C1, [D1, + // C2], D2]]] + + Packet2d A1, A2, B1, B2, C1, C2, D1, D2; + + const double* data = matrix.data(); + const Index stride = matrix.innerStride(); + if (StorageOrdersMatch) + { + A1 = ploadt(data + stride*0); + B1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + B2 = ploadt(data + stride*6); + C1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + C2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); + } + else + { + Packet2d temp; + A1 = ploadt(data + stride*0); + C1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + C2 = ploadt(data + stride*6); + temp = A1; + A1 = vec2d_unpacklo(A1, A2); + A2 = vec2d_unpackhi(temp, A2); + + temp = C1; + C1 = vec2d_unpacklo(C1, C2); + C2 = vec2d_unpackhi(temp, C2); + + B1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + B2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); + + temp = B1; + B1 = vec2d_unpacklo(B1, B2); + B2 = vec2d_unpackhi(temp, B2); + + temp = D1; + D1 = vec2d_unpacklo(D1, D2); + D2 = vec2d_unpackhi(temp, D2); + } + + // determinants of the sub-matrices + Packet2d dA, dB, dC, dD; + + dA = vec2d_swizzle2(A2, A2, 1); + dA = pmul(A1, dA); + dA = psub(dA, vec2d_duplane(dA, 1)); + + dB = vec2d_swizzle2(B2, B2, 1); + dB = pmul(B1, dB); + dB = psub(dB, vec2d_duplane(dB, 1)); + + dC = vec2d_swizzle2(C2, C2, 1); + dC = pmul(C1, dC); + dC = psub(dC, vec2d_duplane(dC, 1)); + + dD = vec2d_swizzle2(D2, D2, 1); + dD = pmul(D1, dD); + dD = psub(dD, vec2d_duplane(dD, 1)); + + Packet2d DC1, DC2, AB1, AB2; + + // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. + AB1 = pmul(B1, vec2d_duplane(A2, 1)); + AB2 = pmul(B2, vec2d_duplane(A1, 0)); + AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1))); + AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0))); + + // DC = D#*C + DC1 = pmul(C1, vec2d_duplane(D2, 1)); + DC2 = pmul(C2, vec2d_duplane(D1, 0)); + DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1))); + DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0))); + + Packet2d d1, d2; + + // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) + Packet2d det; + + // reciprocal of the determinant of the input matrix, rd = 1/det + Packet2d rd; + + d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0)); + d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3)); + rd = padd(d1, d2); + rd = padd(rd, vec2d_duplane(rd, 1)); + + d1 = pmul(dA, dD); + d2 = pmul(dB, dC); + + det = padd(d1, d2); + det = psub(det, rd); + det = vec2d_duplane(det, 0); + rd = pdiv(pset1(1.0), det); + + // rows of four sub-matrices of the inverse + Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2; + + // iD = D*|A| - C*A#*B + iD1 = pmul(AB1, vec2d_duplane(C1, 0)); + iD2 = pmul(AB1, vec2d_duplane(C2, 0)); + iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1))); + iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1))); + dA = vec2d_duplane(dA, 0); + iD1 = psub(pmul(D1, dA), iD1); + iD2 = psub(pmul(D2, dA), iD2); + + // iA = A*|D| - B*D#*C + iA1 = pmul(DC1, vec2d_duplane(B1, 0)); + iA2 = pmul(DC1, vec2d_duplane(B2, 0)); + iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1))); + iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1))); + dD = vec2d_duplane(dD, 0); + iA1 = psub(pmul(A1, dD), iA1); + iA2 = psub(pmul(A2, dD), iA2); + + // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A + iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1)); + iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1)); + iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2))); + iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2))); + dB = vec2d_duplane(dB, 0); + iB1 = psub(pmul(C1, dB), iB1); + iB2 = psub(pmul(C2, dB), iB2); + + // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D + iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1)); + iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1)); + iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2))); + iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2))); + dC = vec2d_duplane(dC, 0); + iC1 = psub(pmul(B1, dC), iC1); + iC2 = psub(pmul(B2, dC), iC2); + + const double sign_mask1[2] = {0.0, numext::bit_cast(0x8000000000000000ull)}; + const double sign_mask2[2] = {numext::bit_cast(0x8000000000000000ull), 0.0}; + const Packet2d sign_PN = ploadu(sign_mask1); + const Packet2d sign_NP = ploadu(sign_mask2); + d1 = pxor(rd, sign_PN); + d2 = pxor(rd, sign_NP); + + Index res_stride = result.outerStride(); + double *res = result.data(); + pstoret(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1)); + pstoret(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2)); + pstoret(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1)); + pstoret(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2)); + pstoret(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1)); + pstoret(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2)); + pstoret(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1)); + pstoret(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2)); + } +}; +#endif +} // namespace internal +} // namespace Eigen +#endif diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/arch/Inverse_SSE.h b/wpimath/src/main/native/eigeninclude/Eigen/src/LU/arch/Inverse_SSE.h deleted file mode 100644 index 4dce2ef20e..0000000000 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/LU/arch/Inverse_SSE.h +++ /dev/null @@ -1,338 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2001 Intel Corporation -// Copyright (C) 2010 Gael Guennebaud -// Copyright (C) 2009 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// The SSE code for the 4x4 float and double matrix inverse in this file -// comes from the following Intel's library: -// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ -// -// Here is the respective copyright and license statement: -// -// Copyright (c) 2001 Intel Corporation. -// -// Permition is granted to use, copy, distribute and prepare derivative works -// of this library for any purpose and without fee, provided, that the above -// copyright notice and this statement appear in all copies. -// Intel makes no representations about the suitability of this software for -// any purpose, and specifically disclaims all warranties. -// See LEGAL.TXT for all the legal information. - -#ifndef EIGEN_INVERSE_SSE_H -#define EIGEN_INVERSE_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000)); - - // Load the full matrix into registers - __m128 _L1 = matrix.template packet( 0); - __m128 _L2 = matrix.template packet( 4); - __m128 _L3 = matrix.template packet( 8); - __m128 _L4 = matrix.template packet(12); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register holds four matrix element, the smaller matrices are - // represented as a registers. Hence we get a better locality of the - // calculations. - - __m128 A, B, C, D; // the four sub-matrices - if(!StorageOrdersMatch) - { - A = _mm_unpacklo_ps(_L1, _L2); - B = _mm_unpacklo_ps(_L3, _L4); - C = _mm_unpackhi_ps(_L1, _L2); - D = _mm_unpackhi_ps(_L3, _L4); - } - else - { - A = _mm_movelh_ps(_L1, _L2); - B = _mm_movehl_ps(_L2, _L1); - C = _mm_movelh_ps(_L3, _L4); - D = _mm_movehl_ps(_L4, _L3); - } - - __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices - DC, AB; - __m128 dA, dB, dC, dD; // determinant of the sub-matrices - __m128 det, d, d1, d2; - __m128 rd; // reciprocal of the determinant - - // AB = A# * B - AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B); - AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E))); - // DC = D# * C - DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C); - DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E))); - - // dA = |A| - dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A); - dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA)); - // dB = |B| - dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B); - dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB)); - - // dC = |C| - dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C); - dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC)); - // dD = |D| - dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D); - dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD)); - - // d = trace(AB*DC) = trace(A#*B*D#*C) - d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB); - - // iD = C*A#*B - iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB)); - iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB))); - // iA = B*D#*C - iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC)); - iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC))); - - // d = trace(AB*DC) = trace(A#*B*D#*C) [continue] - d = _mm_add_ps(d, _mm_movehl_ps(d, d)); - d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1)); - d1 = _mm_mul_ss(dA,dD); - d2 = _mm_mul_ss(dB,dC); - - // iD = D*|A| - C*A#*B - iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD); - - // iA = A*|D| - B*D#*C; - iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_sub_ss(_mm_add_ss(d1,d2),d); - rd = _mm_div_ss(_mm_set_ss(1.0f), det); - -// #ifdef ZERO_SINGULAR -// rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd); -// #endif - - // iB = D * (A#B)# = D*B#*A - iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33)); - iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66))); - // iC = A * (D#C)# = A*C#*D - iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33)); - iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); - - rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, p4f_sign_PNNP); - - // iB = C*|B| - D*B#*A - iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); - - // iC = B*|C| - A*C#*D; - iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC); - - // iX = iX / det - iA = _mm_mul_ps(rd,iA); - iB = _mm_mul_ps(rd,iB); - iC = _mm_mul_ps(rd,iC); - iD = _mm_mul_ps(rd,iD); - - Index res_stride = result.outerStride(); - float* res = result.data(); - pstoret(res+0, _mm_shuffle_ps(iA,iB,0x77)); - pstoret(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); - pstoret(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); - pstoret(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); - } - -}; - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register of the matrix holds two elements, the smaller matrices are - // consisted of two registers. Hence we get a better locality of the - // calculations. - - // the four sub-matrices - __m128d A1, A2, B1, B2, C1, C2, D1, D2; - - if(StorageOrdersMatch) - { - A1 = matrix.template packet( 0); B1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); B2 = matrix.template packet( 6); - C1 = matrix.template packet( 8); D1 = matrix.template packet(10); - C2 = matrix.template packet(12); D2 = matrix.template packet(14); - } - else - { - __m128d tmp; - A1 = matrix.template packet( 0); C1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); C2 = matrix.template packet( 6); - tmp = A1; - A1 = _mm_unpacklo_pd(A1,A2); - A2 = _mm_unpackhi_pd(tmp,A2); - tmp = C1; - C1 = _mm_unpacklo_pd(C1,C2); - C2 = _mm_unpackhi_pd(tmp,C2); - - B1 = matrix.template packet( 8); D1 = matrix.template packet(10); - B2 = matrix.template packet(12); D2 = matrix.template packet(14); - tmp = B1; - B1 = _mm_unpacklo_pd(B1,B2); - B2 = _mm_unpackhi_pd(tmp,B2); - tmp = D1; - D1 = _mm_unpacklo_pd(D1,D2); - D2 = _mm_unpackhi_pd(tmp,D2); - } - - __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices - DC1, DC2, AB1, AB2; - __m128d dA, dB, dC, dD; // determinant of the sub-matrices - __m128d det, d1, d2, rd; - - // dA = |A| - dA = _mm_shuffle_pd(A2, A2, 1); - dA = _mm_mul_pd(A1, dA); - dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3)); - // dB = |B| - dB = _mm_shuffle_pd(B2, B2, 1); - dB = _mm_mul_pd(B1, dB); - dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3)); - - // AB = A# * B - AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3)); - AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0)); - AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3))); - AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0))); - - // dC = |C| - dC = _mm_shuffle_pd(C2, C2, 1); - dC = _mm_mul_pd(C1, dC); - dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3)); - // dD = |D| - dD = _mm_shuffle_pd(D2, D2, 1); - dD = _mm_mul_pd(D1, dD); - dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3)); - - // DC = D# * C - DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3)); - DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0)); - DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3))); - DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0))); - - // rd = trace(AB*DC) = trace(A#*B*D#*C) - d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0)); - d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3)); - rd = _mm_add_pd(d1, d2); - rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3)); - - // iD = C*A#*B - iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0)); - iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0)); - iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3))); - iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3))); - - // iA = B*D#*C - iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0)); - iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0)); - iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3))); - iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3))); - - // iD = D*|A| - C*A#*B - dA = _mm_shuffle_pd(dA,dA,0); - iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1); - iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2); - - // iA = A*|D| - B*D#*C; - dD = _mm_shuffle_pd(dD,dD,0); - iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1); - iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2); - - d1 = _mm_mul_sd(dA, dD); - d2 = _mm_mul_sd(dB, dC); - - // iB = D * (A#B)# = D*B#*A - iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1)); - iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1)); - iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2))); - iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2))); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_add_sd(d1, d2); - det = _mm_sub_sd(det, rd); - - // iC = A * (D#C)# = A*C#*D - iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1)); - iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1)); - iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2))); - iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2))); - - rd = _mm_div_sd(_mm_set_sd(1.0), det); -// #ifdef ZERO_SINGULAR -// rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd); -// #endif - rd = _mm_shuffle_pd(rd,rd,0); - - // iB = C*|B| - D*B#*A - dB = _mm_shuffle_pd(dB,dB,0); - iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1); - iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2); - - d1 = _mm_xor_pd(rd, _Sign_PN); - d2 = _mm_xor_pd(rd, _Sign_NP); - - // iC = B*|C| - A*C#*D; - dC = _mm_shuffle_pd(dC,dC,0); - iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); - iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); - - Index res_stride = result.outerStride(); - double* res = result.data(); - pstoret(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); - pstoret(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); - pstoret(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); - pstoret(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); - pstoret(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); - pstoret(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); - pstoret(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); - pstoret(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_INVERSE_SSE_H diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/QR/ColPivHouseholderQR.h b/wpimath/src/main/native/eigeninclude/Eigen/src/QR/ColPivHouseholderQR.h index a7b47d55dc..9b677e9bf4 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/QR/ColPivHouseholderQR.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/QR/ColPivHouseholderQR.h @@ -17,6 +17,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -46,20 +49,19 @@ template struct traits > * \sa MatrixBase::colPivHouseholderQr() */ template class ColPivHouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; typedef typename internal::plain_row_type::type IntRowVectorType; @@ -156,6 +158,7 @@ template class ColPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -172,11 +175,8 @@ template class ColPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif HouseholderSequenceType householderQ() const; HouseholderSequenceType matrixQ() const @@ -402,7 +402,7 @@ template class ColPivHouseholderQR */ RealScalar maxPivot() const { return m_maxpivot; } - /** \brief Reports whether the QR factorization was succesful. + /** \brief Reports whether the QR factorization was successful. * * \note This function always returns \c Success. It is provided for compatibility * with other factorization routines. @@ -416,8 +416,10 @@ template class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -584,8 +586,6 @@ template template void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); - const Index nonzero_pivots = nonzeroPivots(); if(nonzero_pivots == 0) @@ -596,11 +596,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs) - .setLength(nonzero_pivots) - .transpose() - ); + c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() ); m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) .template triangularView() @@ -609,6 +605,31 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i); for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero(); } + +template +template +void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index nonzero_pivots = nonzeroPivots(); + + if(nonzero_pivots == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs); + + m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(nonzero_pivots)); + + dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots); + dst.bottomRows(rows()-nonzero_pivots).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf() ); +} #endif namespace internal { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/wpimath/src/main/native/eigeninclude/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 34c637b70a..486d3373af 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -16,6 +16,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -44,19 +47,21 @@ struct traits > * * \sa MatrixBase::completeOrthogonalDecomposition() */ -template -class CompleteOrthogonalDecomposition { +template class CompleteOrthogonalDecomposition + : public SolverBase > +{ public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + + template + friend struct internal::solve_assertion; + + EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; @@ -131,9 +136,9 @@ class CompleteOrthogonalDecomposition { m_temp(matrix.cols()) { computeInPlace(); - } - + } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method computes the minimum-norm solution X to a least squares * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of * which \c *this is the complete orthogonal decomposition. @@ -145,11 +150,8 @@ class CompleteOrthogonalDecomposition { */ template inline const Solve solve( - const MatrixBase& b) const { - eigen_assert(m_cpqr.m_isInitialized && - "CompleteOrthogonalDecomposition is not initialized."); - return Solve(*this, b.derived()); - } + const MatrixBase& b) const; + #endif HouseholderSequenceType householderQ(void) const; HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); } @@ -158,8 +160,8 @@ class CompleteOrthogonalDecomposition { */ MatrixType matrixZ() const { MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols()); - applyZAdjointOnTheLeftInPlace(Z); - return Z.adjoint(); + applyZOnTheLeftInPlace(Z); + return Z; } /** \returns a reference to the matrix where the complete orthogonal @@ -275,6 +277,7 @@ class CompleteOrthogonalDecomposition { */ inline const Inverse pseudoInverse() const { + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); return Inverse(*this); } @@ -353,7 +356,7 @@ class CompleteOrthogonalDecomposition { inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); } /** \brief Reports whether the complete orthogonal decomposition was - * succesful. + * successful. * * \note This function always returns \c Success. It is provided for * compatibility @@ -367,7 +370,10 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; + void _solve_impl(const RhsType& rhs, DstType& dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -375,8 +381,22 @@ class CompleteOrthogonalDecomposition { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b"); + } + void computeInPlace(); + /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or + * \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate + * is set to \c true. + */ + template + void applyZOnTheLeftInPlace(Rhs& rhs) const; + /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. */ template @@ -452,7 +472,7 @@ void CompleteOrthogonalDecomposition::computeInPlace() // Apply Z(k) to the first k rows of X_k m_cpqr.m_qr.topRightCorner(k, cols - rank + 1) .applyHouseholderOnTheRight( - m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k), + m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k), &m_temp(0)); } if (k != rank - 1) { @@ -464,6 +484,28 @@ void CompleteOrthogonalDecomposition::computeInPlace() } } +template +template +void CompleteOrthogonalDecomposition::applyZOnTheLeftInPlace( + Rhs& rhs) const { + const Index cols = this->cols(); + const Index nrhs = rhs.cols(); + const Index rank = this->rank(); + Matrix temp((std::max)(cols, nrhs)); + for (Index k = rank-1; k >= 0; --k) { + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + rhs.middleRows(rank - 1, cols - rank + 1) + .applyHouseholderOnTheLeft( + matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf(), zCoeffs().template conjugateIf()(k), + &temp(0)); + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + } +} + template template void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( @@ -471,7 +513,7 @@ void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( const Index cols = this->cols(); const Index nrhs = rhs.cols(); const Index rank = this->rank(); - Matrix temp((std::max)(cols, nrhs)); + Matrix temp((std::max)(cols, nrhs)); for (Index k = 0; k < rank; ++k) { if (k != rank - 1) { rhs.row(k).swap(rhs.row(rank - 1)); @@ -491,8 +533,6 @@ template template void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( const RhsType& rhs, DstType& dst) const { - eigen_assert(rhs.rows() == this->rows()); - const Index rank = this->rank(); if (rank == 0) { dst.setZero(); @@ -500,11 +540,8 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( } // Compute c = Q^* * rhs - // Note that the matrix Q = H_0^* H_1^*... so its inverse is - // Q^* = (H_0 H_1 ...)^T typename RhsType::PlainObject c(rhs); - c.applyOnTheLeft( - householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose()); + c.applyOnTheLeft(matrixQ().setLength(rank).adjoint()); // Solve T z = c(1:rank, :) dst.topRows(rank) = matrixT() @@ -523,10 +560,45 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( // Undo permutation to get x = P^{-1} * y. dst = colsPermutation() * dst; } + +template +template +void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = this->rank(); + + if (rank == 0) { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(colsPermutation().transpose()*rhs); + + if (rank < cols()) { + applyZOnTheLeftInPlace(c); + } + + matrixT().topLeftCorner(rank, rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); +} #endif namespace internal { +template +struct traits > > + : traits::PlainObject> +{ + enum { Flags = 0 }; +}; + template struct Assignment >, internal::assign_op::Scalar>, Dense2Dense> { @@ -534,7 +606,8 @@ struct Assignment SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { - dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows())); + typedef Matrix IdentityMatrixType; + dst = src.nestedExpression().solve(IdentityMatrixType::Identity(src.cols(), src.cols())); } }; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/QR/FullPivHouseholderQR.h b/wpimath/src/main/native/eigeninclude/Eigen/src/QR/FullPivHouseholderQR.h index e489bddc2d..d0664a1d8c 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/QR/FullPivHouseholderQR.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/QR/FullPivHouseholderQR.h @@ -18,6 +18,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -55,20 +58,19 @@ struct traits > * \sa MatrixBase::fullPivHouseholderQr() */ template class FullPivHouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef internal::FullPivHouseholderQRMatrixQReturnType MatrixQReturnType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef Matrix class FullPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * \c *this is the QR decomposition. * @@ -173,11 +176,8 @@ template class FullPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns Expression object representing the matrix Q */ @@ -392,22 +392,24 @@ template class FullPivHouseholderQR * diagonal coefficient of U. */ RealScalar maxPivot() const { return m_maxpivot; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; @@ -499,15 +501,15 @@ void FullPivHouseholderQR::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; i++) { - m_rows_transpositions.coeffRef(i) = i; - m_cols_transpositions.coeffRef(i) = i; + m_rows_transpositions.coeffRef(i) = internal::convert_index(i); + m_cols_transpositions.coeffRef(i) = internal::convert_index(i); m_hCoeffs.coeffRef(i) = Scalar(0); } break; } - m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner; - m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner; + m_rows_transpositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); + m_cols_transpositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k)); ++number_of_transpositions; @@ -541,7 +543,6 @@ template template void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); const Index l_rank = rank(); // FIXME introduce nonzeroPivots() and use it here. and more generally, @@ -554,7 +555,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType typename RhsType::PlainObject c(rhs); - Matrix temp(rhs.cols()); + Matrix temp(rhs.cols()); for (Index k = 0; k < l_rank; ++k) { Index remainingSize = rows()-k; @@ -571,6 +572,42 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i); for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero(); } + +template +template +void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index l_rank = rank(); + + if(l_rank == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs); + + m_qr.topLeftCorner(l_rank, l_rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(l_rank)); + + dst.topRows(l_rank) = c.topRows(l_rank); + dst.bottomRows(rows()-l_rank).setZero(); + + Matrix temp(dst.cols()); + const Index size = (std::min)(rows(), cols()); + for (Index k = size-1; k >= 0; --k) + { + Index remainingSize = rows()-k; + + dst.bottomRightCorner(remainingSize, dst.cols()) + .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf(), + m_hCoeffs.template conjugateIf().coeff(k), &temp.coeffRef(0)); + + dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k))); + } +} #endif namespace internal { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/QR/HouseholderQR.h b/wpimath/src/main/native/eigeninclude/Eigen/src/QR/HouseholderQR.h index 3513d995cb..801739fbd8 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/QR/HouseholderQR.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/QR/HouseholderQR.h @@ -14,6 +14,18 @@ namespace Eigen { +namespace internal { +template struct traits > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; + +} // end namespace internal + /** \ingroup QR_Module * * @@ -42,20 +54,19 @@ namespace Eigen { * \sa MatrixBase::householderQr() */ template class HouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix MatrixQType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef typename internal::plain_row_type::type RowVectorType; @@ -121,6 +132,7 @@ template class HouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -137,11 +149,8 @@ template class HouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations. * @@ -204,28 +213,30 @@ template class HouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. * * For advanced uses only. */ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; @@ -292,7 +303,7 @@ template struct householder_qr_inplace_blocked { - // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h + // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32, typename MatrixQR::Scalar* tempData = 0) { @@ -350,15 +361,10 @@ template void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index rank = (std::min)(rows(), cols()); - eigen_assert(rhs.rows() == rows()); typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence( - m_qr.leftCols(rank), - m_hCoeffs.head(rank)).transpose() - ); + c.applyOnTheLeft(householderQ().setLength(rank).adjoint() ); m_qr.topLeftCorner(rank, rank) .template triangularView() @@ -367,6 +373,25 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c dst.topRows(rank) = c.topRows(rank); dst.bottomRows(cols()-rank).setZero(); } + +template +template +void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = (std::min)(rows(), cols()); + + typename RhsType::PlainObject c(rhs); + + m_qr.topLeftCorner(rank, rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); +} #endif /** Performs the QR factorization of the given matrix \a matrix. The result of diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/BDCSVD.h b/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/BDCSVD.h index a5b73f8f21..17f8e44364 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/BDCSVD.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/BDCSVD.h @@ -22,6 +22,11 @@ // #define EIGEN_BDCSVD_DEBUG_VERBOSE // #define EIGEN_BDCSVD_SANITY_CHECKS +#ifdef EIGEN_BDCSVD_SANITY_CHECKS +#undef eigen_internal_assert +#define eigen_internal_assert(X) assert(X); +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -34,6 +39,7 @@ namespace internal { template struct traits > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -57,7 +63,7 @@ struct traits > * recommended and can several order of magnitude faster. * * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations. - * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless + * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will * significantly degrade the accuracy. * @@ -105,7 +111,7 @@ public: * The default constructor is useful in cases in which the user intends to * perform decompositions via BDCSVD::compute(const MatrixType&). */ - BDCSVD() : m_algoswap(16), m_numIters(0) + BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0) {} @@ -202,6 +208,7 @@ protected: using Base::m_computeThinV; using Base::m_matrixU; using Base::m_matrixV; + using Base::m_info; using Base::m_isInitialized; using Base::m_nonzeroSingularValues; @@ -212,7 +219,7 @@ public: // Method to allocate and initialize matrix and attributes template -void BDCSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void BDCSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { m_isTranspose = (cols > rows); @@ -250,16 +257,25 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign { // FIXME this line involves temporaries JacobiSVD jsvd(matrix,computationOptions); - if(computeU()) m_matrixU = jsvd.matrixU(); - if(computeV()) m_matrixV = jsvd.matrixV(); - m_singularValues = jsvd.singularValues(); - m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); m_isInitialized = true; + m_info = jsvd.info(); + if (m_info == Success || m_info == NoConvergence) { + if(computeU()) m_matrixU = jsvd.matrixU(); + if(computeV()) m_matrixV = jsvd.matrixV(); + m_singularValues = jsvd.singularValues(); + m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); + } return *this; } //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows - RealScalar scale = matrix.cwiseAbs().maxCoeff(); + RealScalar scale = matrix.cwiseAbs().template maxCoeff(); + if (!(numext::isfinite)(scale)) { + m_isInitialized = true; + m_info = InvalidInput; + return *this; + } + if(scale==Literal(0)) scale = Literal(1); MatrixX copy; if (m_isTranspose) copy = matrix.adjoint()/scale; @@ -276,7 +292,11 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose(); m_computed.template bottomRows<1>().setZero(); divide(0, m_diagSize - 1, 0, 0, 0); - + if (m_info != Success && m_info != NoConvergence) { + m_isInitialized = true; + return *this; + } + //**** step 3 - Copy singular values and vectors for (int i=0; i::structured_update(Block A, co //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template -void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { // requires rows = cols + 1; using std::pow; @@ -408,6 +428,8 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, { // FIXME this line involves temporaries JacobiSVD b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0)); + m_info = b.info(); + if (m_info != Success && m_info != NoConvergence) return; if (m_compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU(); else @@ -427,7 +449,9 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the // right submatrix before the left one. divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift); + if (m_info != Success && m_info != NoConvergence) return; divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1); + if (m_info != Success && m_info != NoConvergence) return; if (m_compU) { @@ -568,7 +592,7 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, // handling of round-off errors, be consistent in ordering // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf template -void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) +void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits::min)(); using std::abs; @@ -591,7 +615,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // but others are interleaved and we must ignore them at this stage. // To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==Literal(0)) --actual_n; + while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); } Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) @@ -618,13 +642,11 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec std::cout << " shift: " << shifts.transpose() << "\n"; { - Index actual_n = n; - while(actual_n>1 && abs(col0(actual_n-1))= 0).all()); std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n"; - std::cout << " check3 (>0) : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n"; - std::cout << " check4 (>0) : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n"; + assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all()); } #endif @@ -652,13 +674,13 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec #endif #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(U.allFinite()); - assert(V.allFinite()); - assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n); - assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n); assert(m_naiveU.allFinite()); assert(m_naiveV.allFinite()); assert(m_computed.allFinite()); + assert(U.allFinite()); + assert(V.allFinite()); +// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits::epsilon() * n); +// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits::epsilon() * n); #endif // Because of deflation, the singular values might not be completely sorted. @@ -673,6 +695,15 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec if(m_compV) V.col(i).swap(V.col(i+1)); } } + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + { + bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all(); + if(!singular_values_sorted) + std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n"; + assert(singular_values_sorted); + } +#endif // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end @@ -749,19 +780,22 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar mid = left + (right-left) / Literal(2); RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0)); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << right-left << "\n"; - std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right) << "\n"; - std::cout << " = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.2*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.3*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.4*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.49*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.5*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.51*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.6*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.7*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.8*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n"; + std::cout << "right-left = " << right-left << "\n"; +// std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left) +// << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right) << "\n"; + std::cout << " = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.1) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.2) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.3) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.4) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.49) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.5) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.51) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.6) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.7) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.8) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.9) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n"; #endif RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right; @@ -819,13 +853,16 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // And find mu such that f(mu)==0: RealScalar muZero = -a/b; RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift); + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((numext::isfinite)(fZero)); +#endif muPrev = muCur; fPrev = fCur; muCur = muZero; fCur = fZero; - if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; @@ -858,20 +895,33 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d else rightShifted = -(std::numeric_limits::min)(); } - + RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); eigen_internal_assert(fLeft " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; + std::cout << "f(leftShifted) using leftShifted=" << leftShifted << " ; diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; " + << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n"; + std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " + << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift + << " , f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift) + << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n"; } #endif eigen_internal_assert(fLeft * fRight < Literal(0)); @@ -912,6 +962,15 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d shifts[k] = shift; mus[k] = muCur; +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE + if(k+1=singVals[k-1]); + assert(singVals[k]>=diag(k)); +#endif + // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - @@ -935,7 +994,7 @@ void BDCSVD::perturbCol0 zhat.setZero(); return; } - Index last = perm(m-1); + Index lastIdx = perm(m-1); // The offset permits to skip deflated entries while computing zhat for (Index k = 0; k < n; ++k) { @@ -945,15 +1004,43 @@ void BDCSVD::perturbCol0 { // see equation (3.6) RealScalar dk = diag(k); - RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk)); + RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk)); +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + if(prod<0) { + std::cout << "k = " << k << " ; z(k)=" << col0(k) << ", diag(k)=" << dk << "\n"; + std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n"; + std::cout << " = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n"; + } + assert(prod>=0); +#endif for(Index l = 0; l=k && (l==0 || l-1>=m)) + { + std::cout << "Error in perturbCol0\n"; + std::cout << " " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " " << "\n"; + std::cout << " " <=0); +#endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk)) @@ -962,9 +1049,12 @@ void BDCSVD::perturbCol0 } } #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; + std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((numext::isfinite)(tmp)); +#endif zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp); } } @@ -1018,7 +1108,7 @@ void BDCSVD::computeSingVecs // i >= 1, di almost null and zi non null. // We use a rotation to zero out zi applied to the left of M template -void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index size) +void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1047,7 +1137,7 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index // We apply two rotations to have zj = 0; // TODO deflation44 is still broken and not properly tested template -void BDCSVD::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size) +void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1074,7 +1164,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi } c/=r; s/=r; - m_computed(firstColm + i, firstColm) = r; + m_computed(firstColm + i, firstColm) = r; m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i); m_computed(firstColm + j, firstColm) = Literal(0); @@ -1087,7 +1177,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive] template -void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { using std::sqrt; using std::abs; @@ -1148,6 +1238,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "to be sorted: " << diag.transpose() << "\n\n"; + std::cout << " : " << col0.transpose() << "\n\n"; #endif { // Check for total deflation @@ -1238,7 +1329,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index if( (diag(i) - diag(i-1)) < NumTraits::epsilon()*maxDiag ) { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*diag(i) << "\n"; + std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*/*diag(i)*/maxDiag << "\n"; #endif eigen_internal_assert(abs(diag(i) - diag(i-1))::deflation(Index firstCol, Index lastCol, Index k, Index #endif }//end deflation -#ifndef __CUDACC__ /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm @@ -1270,7 +1360,6 @@ MatrixBase::bdcSvd(unsigned int computationOptions) const { return BDCSVD(*this, computationOptions); } -#endif } // end namespace Eigen diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/JacobiSVD.h b/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/JacobiSVD.h index 43488b1e0c..9d95acdf67 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/JacobiSVD.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/JacobiSVD.h @@ -112,12 +112,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) - : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) - : MatrixType::Options + Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -202,13 +202,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) - : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) - : MatrixType::Options + Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -303,8 +302,9 @@ public: Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -425,6 +425,7 @@ struct svd_precondition_2x2_block_to_be_real template struct traits > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -584,6 +585,7 @@ template class JacobiSVD using Base::m_matrixU; using Base::m_matrixV; using Base::m_singularValues; + using Base::m_info; using Base::m_isInitialized; using Base::m_isAllocated; using Base::m_usePrescribedThreshold; @@ -610,7 +612,7 @@ template class JacobiSVD }; template -void JacobiSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void JacobiSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { eigen_assert(rows >= 0 && cols >= 0); @@ -624,6 +626,7 @@ void JacobiSVD::allocate(Index rows, Index cols, u m_rows = rows; m_cols = cols; + m_info = Success; m_isInitialized = false; m_isAllocated = true; m_computationOptions = computationOptions; @@ -673,7 +676,12 @@ JacobiSVD::compute(const MatrixType& matrix, unsig const RealScalar considerAsZero = (std::numeric_limits::min)(); // Scaling factor to reduce over/under-flows - RealScalar scale = matrix.cwiseAbs().maxCoeff(); + RealScalar scale = matrix.cwiseAbs().template maxCoeff(); + if (!(numext::isfinite)(scale)) { + m_isInitialized = true; + m_info = InvalidInput; + return *this; + } if(scale==RealScalar(0)) scale = RealScalar(1); /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/SVDBase.h b/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/SVDBase.h index 53da284888..bc7ab88b4c 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/SVDBase.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/SVDBase.h @@ -17,6 +17,18 @@ #define EIGEN_SVDBASE_H namespace Eigen { + +namespace internal { +template struct traits > + : traits +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; +} + /** \ingroup SVD_Module * * @@ -39,20 +51,26 @@ namespace Eigen { * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix, * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving. + * + * The status of the computation can be retrived using the \a info() method. Unless \a info() returns \a Success, the results should be not + * considered well defined. * - * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to + * If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will return \a InvalidInput, but the computation is guaranteed to * terminate in finite (and reasonable) time. * \sa class BDCSVD, class JacobiSVD */ -template -class SVDBase +template class SVDBase + : public SolverBase > { +public: + + template + friend struct internal::solve_assertion; -public: typedef typename internal::traits::MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; + typedef typename Eigen::internal::traits::StorageIndex StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -82,7 +100,7 @@ public: */ const MatrixUType& matrixU() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?"); return m_matrixU; } @@ -98,7 +116,7 @@ public: */ const MatrixVType& matrixV() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); eigen_assert(computeV() && "This SVD decomposition didn't compute V. Did you ask for it?"); return m_matrixV; } @@ -110,14 +128,14 @@ public: */ const SingularValuesType& singularValues() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); return m_singularValues; } /** \returns the number of singular values that are not exactly 0 */ Index nonzeroSingularValues() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); return m_nonzeroSingularValues; } @@ -130,7 +148,7 @@ public: inline Index rank() const { using std::abs; - eigen_assert(m_isInitialized && "JacobiSVD is not initialized."); + _check_compute_assertions(); if(m_singularValues.size()==0) return 0; RealScalar premultiplied_threshold = numext::maxi(m_singularValues.coeff(0) * threshold(), (std::numeric_limits::min)()); Index i = m_nonzeroSingularValues-1; @@ -194,6 +212,7 @@ public: inline Index rows() const { return m_rows; } inline Index cols() const { return m_cols; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A. * * \param b the right-hand-side of the equation to solve. @@ -205,32 +224,55 @@ public: */ template inline const Solve - solve(const MatrixBase& b) const + solve(const MatrixBase& b) const; + #endif + + + /** \brief Reports whether previous computation was successful. + * + * \returns \c Success if computation was successful. + */ + EIGEN_DEVICE_FUNC + ComputationInfo info() const { eigen_assert(m_isInitialized && "SVD is not initialized."); - eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice)."); - return Solve(derived(), b.derived()); + return m_info; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + + void _check_compute_assertions() const { + eigen_assert(m_isInitialized && "SVD is not initialized."); + } + + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + _check_compute_assertions(); + eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice)."); + eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b"); + } + // return true if already allocated bool allocate(Index rows, Index cols, unsigned int computationOptions) ; MatrixUType m_matrixU; MatrixVType m_matrixV; SingularValuesType m_singularValues; + ComputationInfo m_info; bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold; bool m_computeFullU, m_computeThinU; bool m_computeFullV, m_computeThinV; @@ -243,9 +285,14 @@ protected: * Default constructor of SVDBase */ SVDBase() - : m_isInitialized(false), + : m_info(Success), + m_isInitialized(false), m_isAllocated(false), m_usePrescribedThreshold(false), + m_computeFullU(false), + m_computeThinU(false), + m_computeFullV(false), + m_computeThinV(false), m_computationOptions(0), m_rows(-1), m_cols(-1), m_diagSize(0) { @@ -260,17 +307,30 @@ template template void SVDBase::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); - // A = U S V^* // So A^{-1} = V S^{-1} U^* - Matrix tmp; + Matrix tmp; Index l_rank = rank(); tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs; tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; dst = m_matrixV.leftCols(l_rank) * tmp; } + +template +template +void SVDBase::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + // A = U S V^* + // So A^{-*} = U S^{-1} V^* + // And A^{-T} = U_conj S^{-1} V^T + Matrix tmp; + Index l_rank = rank(); + + tmp.noalias() = m_matrixV.leftCols(l_rank).transpose().template conjugateIf() * rhs; + tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; + dst = m_matrixU.template conjugateIf().leftCols(l_rank) * tmp; +} #endif template @@ -288,6 +348,7 @@ bool SVDBase::allocate(Index rows, Index cols, unsigned int computat m_rows = rows; m_cols = cols; + m_info = Success; m_isInitialized = false; m_isAllocated = true; m_computationOptions = computationOptions; diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/UpperBidiagonalization.h b/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/UpperBidiagonalization.h index 11ac847e1d..997defc474 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/UpperBidiagonalization.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/SVD/UpperBidiagonalization.h @@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat, .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]); // apply householder transform to remaining part of mat on the left mat.bottomRightCorner(remainingRows-1, remainingCols) - .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData); + .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData); } } @@ -202,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType y_k( Y.col(k).tail(remainingCols) ); - // let's use the begining of column k of Y as a temporary vector + // let's use the beginning of column k of Y as a temporary vector SubColumnType tmp( Y.col(k).head(k) ); y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck tmp.noalias() = V_k1.adjoint() * v_k; @@ -231,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType x_k ( X.col(k).tail(remainingRows-1) ); - // let's use the begining of column k of X as a temporary vectors + // let's use the beginning of column k of X as a temporary vectors // note that tmp0 and tmp1 overlaps SubColumnType tmp0 ( X.col(k).head(k) ), tmp1 ( X.col(k).head(k+1) ); diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdDeque.h b/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdDeque.h index af158f425d..6d47e75722 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdDeque.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdDeque.h @@ -36,7 +36,7 @@ namespace std \ deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start, iterator end) : deque_base(start, end) {} \ + deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start, iterator end) : deque_base(start, end) {} \ + deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -98,19 +98,7 @@ namespace std { { return deque_base::insert(position,x); } void insert(const_iterator position, size_type new_size, const value_type& x) { deque_base::insert(position, new_size, x); } -#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1) - // workaround GCC std::deque implementation - // GCC 10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to - // fall-back to the default case - void resize(size_type new_size, const value_type& x) - { - if (new_size < deque_base::size()) - deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size); - else - deque_base::insert(deque_base::end(), new_size - deque_base::size(), x); - } #else - // either non-GCC or GCC between 4.1 and 10.1 // default implementation which should always work. void resize(size_type new_size, const value_type& x) { diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdList.h b/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdList.h index e1eba49859..8ba3fada0a 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdList.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdList.h @@ -35,7 +35,7 @@ namespace std \ list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start, iterator end) : list_base(start, end) {} \ + list(iterator start_, iterator end_) : list_base(start_, end_) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start, iterator end) : list_base(start, end) {} \ + list(iterator start_, iterator end_) : list_base(start_, end_) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdVector.h b/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdVector.h index ec22821d26..9fcf19bce8 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdVector.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/StlSupport/StdVector.h @@ -36,7 +36,7 @@ namespace std \ vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start, iterator end) : vector_base(start, end) {} \ + vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start, iterator end) : vector_base(start, end) {} \ + vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 05a7449bc9..0e5d5445b1 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -75,6 +75,32 @@ max return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); } +/** \returns an expression of the coefficient-wise absdiff of \c *this and \a other + * + * Example: \include Cwise_absolute_difference.cpp + * Output: \verbinclude Cwise_absolute_difference.out + * + * \sa absolute_difference() + */ +EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference,absolute_difference) + +/** \returns an expression of the coefficient-wise absolute_difference of \c *this and scalar \a other + * + * \sa absolute_difference() + */ +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +absolute_difference +#else +(absolute_difference) +#endif +(const Scalar &other) const +{ + return (absolute_difference)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + /** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents. * * This function computes the coefficient-wise power. @@ -314,9 +340,9 @@ polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS &n) const * * It returns the Riemann zeta function of two arguments \c *this and \a q: * - * \param *this is the exposent, it must be > 1 * \param q is the shift, it must be > 0 * + * \note *this is the exponent, it must be > 1. * \note This function supports only float and double scalar types. To support other scalar types, the user has * to provide implementations of zeta(T,T) for any scalar type T to be supported. * diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ArrayCwiseUnaryOps.h index ebaa3f192b..13c55f4b11 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -10,9 +10,11 @@ typedef CwiseUnaryOp, const Derived> Inverse typedef CwiseUnaryOp, const Derived> BooleanNotReturnType; typedef CwiseUnaryOp, const Derived> ExpReturnType; +typedef CwiseUnaryOp, const Derived> Expm1ReturnType; typedef CwiseUnaryOp, const Derived> LogReturnType; typedef CwiseUnaryOp, const Derived> Log1pReturnType; typedef CwiseUnaryOp, const Derived> Log10ReturnType; +typedef CwiseUnaryOp, const Derived> Log2ReturnType; typedef CwiseUnaryOp, const Derived> CosReturnType; typedef CwiseUnaryOp, const Derived> SinReturnType; typedef CwiseUnaryOp, const Derived> TanReturnType; @@ -20,11 +22,18 @@ typedef CwiseUnaryOp, const Derived> AcosReturn typedef CwiseUnaryOp, const Derived> AsinReturnType; typedef CwiseUnaryOp, const Derived> AtanReturnType; typedef CwiseUnaryOp, const Derived> TanhReturnType; +typedef CwiseUnaryOp, const Derived> LogisticReturnType; typedef CwiseUnaryOp, const Derived> SinhReturnType; +#if EIGEN_HAS_CXX11_MATH +typedef CwiseUnaryOp, const Derived> AtanhReturnType; +typedef CwiseUnaryOp, const Derived> AsinhReturnType; +typedef CwiseUnaryOp, const Derived> AcoshReturnType; +#endif typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> SquareReturnType; typedef CwiseUnaryOp, const Derived> CubeReturnType; typedef CwiseUnaryOp, const Derived> RoundReturnType; +typedef CwiseUnaryOp, const Derived> RintReturnType; typedef CwiseUnaryOp, const Derived> FloorReturnType; typedef CwiseUnaryOp, const Derived> CeilReturnType; typedef CwiseUnaryOp, const Derived> IsNaNReturnType; @@ -90,6 +99,20 @@ exp() const return ExpReturnType(derived()); } +/** \returns an expression of the coefficient-wise exponential of *this minus 1. + * + * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1, + * however, with finite precision, this function is much more accurate when \c x is close to zero. + * + * \sa Math functions, exp() + */ +EIGEN_DEVICE_FUNC +inline const Expm1ReturnType +expm1() const +{ + return Expm1ReturnType(derived()); +} + /** \returns an expression of the coefficient-wise logarithm of *this. * * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the @@ -98,7 +121,7 @@ exp() const * Example: \include Cwise_log.cpp * Output: \verbinclude Cwise_log.out * - * \sa Math functions, exp() + * \sa Math functions, log() */ EIGEN_DEVICE_FUNC inline const LogReturnType @@ -137,6 +160,18 @@ log10() const return Log10ReturnType(derived()); } +/** \returns an expression of the coefficient-wise base-2 logarithm of *this. + * + * This function computes the coefficient-wise base-2 logarithm. + * + */ +EIGEN_DEVICE_FUNC +inline const Log2ReturnType +log2() const +{ + return Log2ReturnType(derived()); +} + /** \returns an expression of the coefficient-wise square root of *this. * * This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the @@ -311,7 +346,7 @@ sinh() const * Example: \include Cwise_cosh.cpp * Output: \verbinclude Cwise_cosh.out * - * \sa Math functions, tan(), sinh(), cosh() + * \sa Math functions, tanh(), sinh(), cosh() */ EIGEN_DEVICE_FUNC inline const CoshReturnType @@ -320,6 +355,50 @@ cosh() const return CoshReturnType(derived()); } +#if EIGEN_HAS_CXX11_MATH +/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AtanhReturnType +atanh() const +{ + return AtanhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AsinhReturnType +asinh() const +{ + return AsinhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AcoshReturnType +acosh() const +{ + return AcoshReturnType(derived()); +} +#endif + +/** \returns an expression of the coefficient-wise logistic of *this. + */ +EIGEN_DEVICE_FUNC +inline const LogisticReturnType +logistic() const +{ + return LogisticReturnType(derived()); +} + /** \returns an expression of the coefficient-wise inverse of *this. * * Example: \include Cwise_inverse.cpp @@ -362,6 +441,20 @@ cube() const return CubeReturnType(derived()); } +/** \returns an expression of the coefficient-wise rint of *this. + * + * Example: \include Cwise_rint.cpp + * Output: \verbinclude Cwise_rint.out + * + * \sa Math functions, ceil(), floor() + */ +EIGEN_DEVICE_FUNC +inline const RintReturnType +rint() const +{ + return RintReturnType(derived()); +} + /** \returns an expression of the coefficient-wise round of *this. * * Example: \include Cwise_round.cpp @@ -404,6 +497,45 @@ ceil() const return CeilReturnType(derived()); } +template struct ShiftRightXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type arithmetically + * shifted right by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftLeft() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftRightXpr::Type +shiftRight() const +{ + return typename ShiftRightXpr::Type(derived()); +} + + +template struct ShiftLeftXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type logically + * shifted left by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftRight() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftLeftXpr::Type +shiftLeft() const +{ + return typename ShiftLeftXpr::Type(derived()); +} + /** \returns an expression of the coefficient-wise isnan of *this. * * Example: \include Cwise_isNaN.cpp @@ -471,14 +603,12 @@ typedef CwiseUnaryOp, const Derived> LgammaRe typedef CwiseUnaryOp, const Derived> DigammaReturnType; typedef CwiseUnaryOp, const Derived> ErfReturnType; typedef CwiseUnaryOp, const Derived> ErfcReturnType; +typedef CwiseUnaryOp, const Derived> NdtriReturnType; /** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|). * * \specialfunctions_module * - * Example: \include Cwise_lgamma.cpp - * Output: \verbinclude Cwise_lgamma.out - * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar * type T to be supported. @@ -514,9 +644,6 @@ digamma() const * * \specialfunctions_module * - * Example: \include Cwise_erf.cpp - * Output: \verbinclude Cwise_erf.out - * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar * type T to be supported. @@ -535,9 +662,6 @@ erf() const * * \specialfunctions_module * - * Example: \include Cwise_erfc.cpp - * Output: \verbinclude Cwise_erfc.out - * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar * type T to be supported. @@ -550,3 +674,23 @@ erfc() const { return ErfcReturnType(derived()); } + +/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function + * function of *this. + * + * \specialfunctions_module + * + * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the + * Gaussian probability density function (integrated from minus infinity to x) is equal to y. + * + * \note This function supports only float and double scalar types. To support other scalar types, + * the user has to provide implementations of ndtri(T) for any scalar type T to be supported. + * + * \sa Math functions + */ +EIGEN_DEVICE_FUNC +inline const NdtriReturnType +ndtri() const +{ + return NdtriReturnType(derived()); +} diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/BlockMethods.h b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/BlockMethods.h index ac35a0086c..63a52a6ffa 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/BlockMethods.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/BlockMethods.h @@ -40,68 +40,126 @@ typedef const VectorBlock ConstSegmentReturnType; template struct FixedSegmentReturnType { typedef VectorBlock Type; }; template struct ConstFixedSegmentReturnType { typedef const VectorBlock Type; }; +/// \internal inner-vector +typedef Block InnerVectorReturnType; +typedef Block ConstInnerVectorReturnType; + +/// \internal set of inner-vectors +typedef Block InnerVectorsReturnType; +typedef Block ConstInnerVectorsReturnType; + #endif // not EIGEN_PARSED_BY_DOXYGEN -/// \returns a dynamic-size expression of a block in *this. +/// \returns an expression of a block in \c *this with either dynamic or fixed sizes. /// -/// \param startRow the first row in the block -/// \param startCol the first column in the block -/// \param blockRows the number of rows in the block -/// \param blockCols the number of columns in the block +/// \param startRow the first row in the block +/// \param startCol the first column in the block +/// \param blockRows number of rows in the block, specified at either run-time or compile-time +/// \param blockCols number of columns in the block, specified at either run-time or compile-time +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// -/// Example: \include MatrixBase_block_int_int_int_int.cpp +/// Example using runtime (aka dynamic) sizes: \include MatrixBase_block_int_int_int_int.cpp /// Output: \verbinclude MatrixBase_block_int_int_int_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// \newin{3.4}: +/// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic. +/// Here is an example with a fixed number of rows \c NRows and dynamic number of columns \c cols: +/// \code +/// mat.block(i,j,fix,cols) +/// \endcode +/// +/// This function thus fully covers the features offered by the following overloads block(Index, Index), +/// and block(Index, Index, Index, Index) that are thus obsolete. Indeed, this generic version avoids +/// redundancy, it preserves the argument order, and prevents the need to rely on the template keyword in templated code. +/// +/// but with less redundancy and more consistency as it does not modify the argument order +/// and seamlessly enable hybrid fixed/dynamic sizes. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size matrix, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index) +/// \sa class Block, fix, fix(int) /// -EIGEN_DEVICE_FUNC -inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) { - return BlockXpr(derived(), startRow, startCol, blockRows, blockCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type( + derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols)); } -/// This is the const version of block(Index,Index,Index,Index). */ -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const +/// This is the const version of block(Index,Index,NRowsType,NColsType) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const { - return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type( + derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols)); } - -/// \returns a dynamic-size expression of a top-right corner of *this. +/// \returns a expression of a top-right corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// -/// Example: \include MatrixBase_topRightCorner_int_int.cpp +/// Example with dynamic sizes: \include MatrixBase_topRightCorner_int_int.cpp /// Output: \verbinclude MatrixBase_topRightCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr topRightCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +topRightCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of topRightCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const +/// This is the const version of topRightCorner(NRowsType, NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +topRightCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size top-right corner of *this. +/// \returns an expression of a fixed-size top-right corner of \c *this. /// /// \tparam CRows the number of rows in the corner /// \tparam CCols the number of columns in the corner @@ -114,21 +172,21 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block, block(Index,Index) /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topRightCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topRightCorner() { return typename FixedBlockXpr::Type(derived(), 0, cols() - CCols); } /// This is the const version of topRightCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topRightCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - CCols); } -/// \returns an expression of a top-right corner of *this. +/// \returns an expression of a top-right corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -148,46 +206,67 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } /// This is the const version of topRightCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } -/// \returns a dynamic-size expression of a top-left corner of *this. +/// \returns an expression of a top-left corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_topLeftCorner_int_int.cpp /// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr topLeftCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +topLeftCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), 0, 0, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } /// This is the const version of topLeftCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +topLeftCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), 0, 0, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size top-left corner of *this. +/// \returns an expression of a fixed-size top-left corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. /// @@ -196,24 +275,24 @@ inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topLeftCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topLeftCorner() { return typename FixedBlockXpr::Type(derived(), 0, 0); } /// This is the const version of topLeftCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topLeftCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0); } -/// \returns an expression of a top-left corner of *this. +/// \returns an expression of a top-left corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -233,46 +312,69 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } /// This is the const version of topLeftCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } -/// \returns a dynamic-size expression of a bottom-right corner of *this. +/// \returns an expression of a bottom-right corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_bottomRightCorner_int_int.cpp /// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr bottomRightCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +bottomRightCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols), + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of bottomRightCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const +/// This is the const version of bottomRightCorner(NRowsType, NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +bottomRightCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols), + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size bottom-right corner of *this. +/// \returns an expression of a fixed-size bottom-right corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. /// @@ -281,24 +383,24 @@ inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomRightCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomRightCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } /// This is the const version of bottomRightCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomRightCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } -/// \returns an expression of a bottom-right corner of *this. +/// \returns an expression of a bottom-right corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -318,46 +420,69 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } /// This is the const version of bottomRightCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } -/// \returns a dynamic-size expression of a bottom-left corner of *this. +/// \returns an expression of a bottom-left corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp /// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr bottomLeftCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +bottomLeftCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), 0, + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of bottomLeftCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const +/// This is the const version of bottomLeftCorner(NRowsType, NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename ConstFixedBlockXpr<...,...>::Type +#endif +bottomLeftCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), 0, + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size bottom-left corner of *this. +/// \returns an expression of a fixed-size bottom-left corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. /// @@ -366,24 +491,24 @@ inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomLeftCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomLeftCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, 0); } /// This is the const version of bottomLeftCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, 0); } -/// \returns an expression of a bottom-left corner of *this. +/// \returns an expression of a bottom-left corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -403,45 +528,66 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) +EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } /// This is the const version of bottomLeftCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const +EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } -/// \returns a block consisting of the top rows of *this. +/// \returns a block consisting of the top rows of \c *this. /// /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include MatrixBase_topRows_int.cpp /// Output: \verbinclude MatrixBase_topRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr topRows(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +topRows(NRowsType n) { - return RowsBlockXpr(derived(), 0, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), 0, 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of topRows(Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr topRows(Index n) const +/// This is the const version of topRows(NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +topRows(NRowsType n) const { - return ConstRowsBlockXpr(derived(), 0, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), 0, 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of the top rows of *this. +/// \returns a block consisting of the top rows of \c *this. /// /// \tparam N the number of rows in the block as specified at compile-time /// \param n the number of rows in the block as specified at run-time @@ -454,50 +600,69 @@ inline ConstRowsBlockXpr topRows(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type topRows(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type topRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } /// This is the const version of topRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type topRows(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type topRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } -/// \returns a block consisting of the bottom rows of *this. +/// \returns a block consisting of the bottom rows of \c *this. /// /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include MatrixBase_bottomRows_int.cpp /// Output: \verbinclude MatrixBase_bottomRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr bottomRows(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +bottomRows(NRowsType n) { - return RowsBlockXpr(derived(), rows() - n, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of bottomRows(Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr bottomRows(Index n) const +/// This is the const version of bottomRows(NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +bottomRows(NRowsType n) const { - return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of the bottom rows of *this. +/// \returns a block consisting of the bottom rows of \c *this. /// /// \tparam N the number of rows in the block as specified at compile-time /// \param n the number of rows in the block as specified at run-time @@ -510,51 +675,70 @@ inline ConstRowsBlockXpr bottomRows(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type bottomRows(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type bottomRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } /// This is the const version of bottomRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } -/// \returns a block consisting of a range of rows of *this. +/// \returns a block consisting of a range of rows of \c *this. /// /// \param startRow the index of the first row in the block /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include DenseBase_middleRows_int.cpp /// Output: \verbinclude DenseBase_middleRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr middleRows(Index startRow, Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +middleRows(Index startRow, NRowsType n) { - return RowsBlockXpr(derived(), startRow, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), startRow, 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of middleRows(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const +/// This is the const version of middleRows(Index,NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +middleRows(Index startRow, NRowsType n) const { - return ConstRowsBlockXpr(derived(), startRow, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), startRow, 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of a range of rows of *this. +/// \returns a block consisting of a range of rows of \c *this. /// /// \tparam N the number of rows in the block as specified at compile-time /// \param startRow the index of the first row in the block @@ -568,50 +752,69 @@ inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) { return typename NRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } /// This is the const version of middleRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } -/// \returns a block consisting of the left columns of *this. +/// \returns a block consisting of the left columns of \c *this. /// /// \param n the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_leftCols_int.cpp /// Output: \verbinclude MatrixBase_leftCols_int.out /// +/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr leftCols(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +leftCols(NColsType n) { - return ColsBlockXpr(derived(), 0, 0, rows(), n); + return typename NColsBlockXpr::value>::Type + (derived(), 0, 0, rows(), internal::get_runtime_value(n)); } -/// This is the const version of leftCols(Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr leftCols(Index n) const +/// This is the const version of leftCols(NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +leftCols(NColsType n) const { - return ConstColsBlockXpr(derived(), 0, 0, rows(), n); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, 0, rows(), internal::get_runtime_value(n)); } -/// \returns a block consisting of the left columns of *this. +/// \returns a block consisting of the left columns of \c *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param n the number of columns in the block as specified at run-time @@ -624,50 +827,69 @@ inline ConstColsBlockXpr leftCols(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type leftCols(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type leftCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, 0, rows(), n); } /// This is the const version of leftCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type leftCols(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type leftCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, 0, rows(), n); } -/// \returns a block consisting of the right columns of *this. +/// \returns a block consisting of the right columns of \c *this. /// /// \param n the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_rightCols_int.cpp /// Output: \verbinclude MatrixBase_rightCols_int.out /// +/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr rightCols(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +rightCols(NColsType n) { - return ColsBlockXpr(derived(), 0, cols() - n, rows(), n); + return typename NColsBlockXpr::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n)); } -/// This is the const version of rightCols(Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr rightCols(Index n) const +/// This is the const version of rightCols(NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +rightCols(NColsType n) const { - return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n)); } -/// \returns a block consisting of the right columns of *this. +/// \returns a block consisting of the right columns of \c *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param n the number of columns in the block as specified at run-time @@ -680,51 +902,70 @@ inline ConstColsBlockXpr rightCols(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type rightCols(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type rightCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } /// This is the const version of rightCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type rightCols(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type rightCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } -/// \returns a block consisting of a range of columns of *this. +/// \returns a block consisting of a range of columns of \c *this. /// /// \param startCol the index of the first column in the block /// \param numCols the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include DenseBase_middleCols_int.cpp /// Output: \verbinclude DenseBase_middleCols_int.out /// +/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr middleCols(Index startCol, Index numCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +middleCols(Index startCol, NColsType numCols) { - return ColsBlockXpr(derived(), 0, startCol, rows(), numCols); + return typename NColsBlockXpr::value>::Type + (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols)); } -/// This is the const version of middleCols(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const +/// This is the const version of middleCols(Index,NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +middleCols(Index startCol, NColsType numCols) const { - return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols)); } -/// \returns a block consisting of a range of columns of *this. +/// \returns a block consisting of a range of columns of \c *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param startCol the index of the first column in the block @@ -738,26 +979,26 @@ inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } /// This is the const version of middleCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } -/// \returns a fixed-size expression of a block in *this. +/// \returns a fixed-size expression of a block of \c *this. /// /// The template parameters \a NRows and \a NCols are the number of /// rows and columns in the block. @@ -768,29 +1009,35 @@ inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = /// Example: \include MatrixBase_block_int_int.cpp /// Output: \verbinclude MatrixBase_block_int_int.out /// +/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic +/// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence: +/// \code +/// mat.template block(i,j) <--> mat.block(i,j,fix,fix) +/// \endcode +/// /// \note since block is a templated member, the keyword template has to be used /// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type block(Index startRow, Index startCol) { return typename FixedBlockXpr::Type(derived(), startRow, startCol); } /// This is the const version of block<>(Index, Index). */ template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol); } -/// \returns an expression of a block in *this. +/// \returns an expression of a block of \c *this. /// /// \tparam NRows number of rows in block as specified at compile-time /// \tparam NCols number of columns in block as specified at compile-time @@ -805,14 +1052,25 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow /// \a NRows is \a Dynamic, and the same for the number of columns. /// /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp +/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out +/// +/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic +/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence: +/// \code +/// mat.template block(i,j,rows,cols) <--> mat.block(i,j,fix(rows),fix(cols)) +/// \endcode +/// If we known that, e.g., NRows==Dynamic and NCols!=Dynamic, then the equivalence becomes: +/// \code +/// mat.template block(i,j,rows,NCols) <--> mat.block(i,j,rows,fix) +/// \endcode /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { return typename FixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); @@ -820,13 +1078,14 @@ inline typename FixedBlockXpr::Type block(Index startRow, Index sta /// This is the const version of block<>(Index, Index, Index, Index). template -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); } -/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0. +/// \returns an expression of the \a i-th column of \c *this. Note that the numbering starts at 0. /// /// Example: \include MatrixBase_col.cpp /// Output: \verbinclude MatrixBase_col.out @@ -834,20 +1093,20 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /** * \sa row(), class Block */ -EIGEN_DEVICE_FUNC -inline ColXpr col(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ColXpr col(Index i) { return ColXpr(derived(), i); } /// This is the const version of col(). -EIGEN_DEVICE_FUNC -inline ConstColXpr col(Index i) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ConstColXpr col(Index i) const { return ConstColXpr(derived(), i); } -/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0. +/// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0. /// /// Example: \include MatrixBase_row.cpp /// Output: \verbinclude MatrixBase_row.out @@ -855,109 +1114,166 @@ inline ConstColXpr col(Index i) const EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /** * \sa col(), class Block */ -EIGEN_DEVICE_FUNC -inline RowXpr row(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +RowXpr row(Index i) { return RowXpr(derived(), i); } /// This is the const version of row(). */ -EIGEN_DEVICE_FUNC -inline ConstRowXpr row(Index i) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ConstRowXpr row(Index i) const { return ConstRowXpr(derived(), i); } -/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this. +/// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param start the first coefficient in the segment /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_segment_int_int.cpp /// Output: \verbinclude MatrixBase_segment_int_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// -/// \sa class Block, segment(Index) +/// \sa block(Index,Index,NRowsType,NColsType), fix, fix(int), class Block /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType segment(Index start, Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +segment(Index start, NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), start, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), start, internal::get_runtime_value(n)); } -/// This is the const version of segment(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType segment(Index start, Index n) const +/// This is the const version of segment(Index,NType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +segment(Index start, NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), start, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), start, internal::get_runtime_value(n)); } -/// \returns a dynamic-size expression of the first coefficients of *this. +/// \returns an expression of the first coefficients of \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_start_int.cpp /// Output: \verbinclude MatrixBase_start_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType head(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +head(NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), 0, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), 0, internal::get_runtime_value(n)); } -/// This is the const version of head(Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType head(Index n) const +/// This is the const version of head(NType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +head(NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), 0, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), 0, internal::get_runtime_value(n)); } -/// \returns a dynamic-size expression of the last coefficients of *this. +/// \returns an expression of a last coefficients of \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_end_int.cpp /// Output: \verbinclude MatrixBase_end_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType tail(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +tail(NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), this->size() - n, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n)); } /// This is the const version of tail(Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType tail(Index n) const +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +tail(NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), this->size() - n, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n)); } /// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this @@ -974,11 +1290,11 @@ inline ConstSegmentReturnType tail(Index n) const /// Example: \include MatrixBase_template_int_segment.cpp /// Output: \verbinclude MatrixBase_template_int_segment.out /// -/// \sa class Block +/// \sa segment(Index,NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type segment(Index start, Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), start, n); @@ -986,14 +1302,14 @@ inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N /// This is the const version of segment(Index). template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), start, n); } -/// \returns a fixed-size expression of the first coefficients of *this. +/// \returns a fixed-size expression of the first coefficients of \c *this. /// /// \only_for_vectors /// @@ -1006,11 +1322,11 @@ inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index /// Example: \include MatrixBase_template_int_start.cpp /// Output: \verbinclude MatrixBase_template_int_start.out /// -/// \sa class Block +/// \sa head(NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type head(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type head(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), 0, n); @@ -1018,14 +1334,14 @@ inline typename FixedSegmentReturnType::Type head(Index n = N) /// This is the const version of head(). template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type head(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), 0, n); } -/// \returns a fixed-size expression of the last coefficients of *this. +/// \returns a fixed-size expression of the last coefficients of \c *this. /// /// \only_for_vectors /// @@ -1038,11 +1354,11 @@ inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const /// Example: \include MatrixBase_template_int_end.cpp /// Output: \verbinclude MatrixBase_template_int_end.out /// -/// \sa class Block +/// \sa tail(NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type tail(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type tail(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), size() - n); @@ -1050,9 +1366,77 @@ inline typename FixedSegmentReturnType::Type tail(Index n = N) /// This is the const version of tail. template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type tail(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +InnerVectorReturnType innerVector(Index outer) +{ return InnerVectorReturnType(derived(), outer); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). Read-only. +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const ConstInnerVectorReturnType innerVector(Index outer) const +{ return ConstInnerVectorReturnType(derived(), outer); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +InnerVectorsReturnType +innerVectors(Index outerStart, Index outerSize) +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); + +} + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). Read-only. +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const ConstInnerVectorsReturnType +innerVectors(Index outerStart, Index outerSize) const +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); + +} + +/** \returns the i-th subvector (column or vector) according to the \c Direction + * \sa subVectors() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename internal::conditional::type +subVector(Index i) +{ + return typename internal::conditional::type(derived(),i); +} + +/** This is the const version of subVector(Index) */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename internal::conditional::type +subVector(Index i) const +{ + return typename internal::conditional::type(derived(),i); +} + +/** \returns the number of subvectors (rows or columns) in the direction \c Direction + * \sa subVector(Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR +Index subVectors() const +{ return (Direction==Vertical)?cols():rows(); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/CommonCwiseUnaryOps.h b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/CommonCwiseUnaryOps.h index 89f4faaac6..5418dc4154 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -76,6 +76,20 @@ conjugate() const return ConjugateReturnType(derived()); } +/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise. +/// +EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) +/// +/// \sa conjugate() +template +EIGEN_DEVICE_FUNC +inline typename internal::conditional::type +conjugateIf() const +{ + typedef typename internal::conditional::type ReturnType; + return ReturnType(derived()); +} + /// \returns a read-only expression of the real part of \c *this. /// EIGEN_DOC_UNARY_ADDONS(real,real part function) diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/IndexedViewMethods.h b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/IndexedViewMethods.h new file mode 100644 index 0000000000..5bfb19ac6c --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/IndexedViewMethods.h @@ -0,0 +1,262 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if !defined(EIGEN_PARSED_BY_DOXYGEN) + +// This file is automatically included twice to generate const and non-const versions + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#define EIGEN_INDEXED_VIEW_METHOD_CONST const +#define EIGEN_INDEXED_VIEW_METHOD_TYPE ConstIndexedViewType +#else +#define EIGEN_INDEXED_VIEW_METHOD_CONST +#define EIGEN_INDEXED_VIEW_METHOD_TYPE IndexedViewType +#endif + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +protected: + +// define some aliases to ease readability + +template +struct IvcRowType : public internal::IndexedViewCompatibleType {}; + +template +struct IvcColType : public internal::IndexedViewCompatibleType {}; + +template +struct IvcType : public internal::IndexedViewCompatibleType {}; + +typedef typename internal::IndexedViewCompatibleType::type IvcIndex; + +template +typename IvcRowType::type +ivcRow(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().rows()),Specialized); +} + +template +typename IvcColType::type +ivcCol(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().cols()),Specialized); +} + +template +typename IvcColType::type +ivcSize(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().size()),Specialized); +} + +public: + +#endif + +template +struct EIGEN_INDEXED_VIEW_METHOD_TYPE { + typedef IndexedView::type, + typename IvcColType::type> type; +}; + +// This is the generic version + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsIndexedView, + typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type >::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type + (derived(), ivcRow(rowIndices), ivcCol(colIndices)); +} + +// The following overload returns a Block<> object + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsBlock, + typename internal::traits::type>::BlockType>::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + typedef typename internal::traits::type>::BlockType BlockType; + typename IvcRowType::type actualRowIndices = ivcRow(rowIndices); + typename IvcColType::type actualColIndices = ivcCol(colIndices); + return BlockType(derived(), + internal::first(actualRowIndices), + internal::first(actualColIndices), + internal::size(actualRowIndices), + internal::size(actualColIndices)); +} + +// The following overload returns a Scalar + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsScalar, + CoeffReturnType >::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols())); +} + +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +// The following three overloads are needed to handle raw Index[N] arrays. + +template +IndexedView::type> +operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView::type> + (derived(), rowIndices, ivcCol(colIndices)); +} + +template +IndexedView::type, const ColIndicesT (&)[ColIndicesN]> +operator()(const RowIndices& rowIndices, const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView::type,const ColIndicesT (&)[ColIndicesN]> + (derived(), ivcRow(rowIndices), colIndices); +} + +template +IndexedView +operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView + (derived(), rowIndices, colIndices); +} + +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +// Overloads for 1D vectors/arrays + +template +typename internal::enable_if< + IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), + IndexedView::type> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView::type> + (derived(), IvcIndex(0), ivcCol(indices)); +} + +template +typename internal::enable_if< + (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), + IndexedView::type,IvcIndex> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView::type,IvcIndex> + (derived(), ivcRow(indices), IvcIndex(0)); +} + +template +typename internal::enable_if< + (internal::get_compile_time_incr::type>::value==1) && (!internal::is_valid_index_type::value) && (!symbolic::is_symbolic::value), + VectorBlock::value> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + typename IvcType::type actualIndices = ivcSize(indices); + return VectorBlock::value> + (derived(), internal::first(actualIndices), internal::size(actualIndices)); +} + +template +typename internal::enable_if::value, CoeffReturnType >::type +operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return Base::operator()(internal::eval_expr_given_size(id,size())); +} + +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +template +typename internal::enable_if >::type +operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView + (derived(), IvcIndex(0), indices); +} + +template +typename internal::enable_if >::type +operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView + (derived(), indices, IvcIndex(0)); +} + +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +#undef EIGEN_INDEXED_VIEW_METHOD_CONST +#undef EIGEN_INDEXED_VIEW_METHOD_TYPE + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#define EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#include "IndexedViewMethods.h" +#undef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#endif + +#else // EIGEN_PARSED_BY_DOXYGEN + +/** + * \returns a generic submatrix view defined by the rows and columns indexed \a rowIndices and \a colIndices respectively. + * + * Each parameter must either be: + * - An integer indexing a single row or column + * - Eigen::all indexing the full set of respective rows or columns in increasing order + * - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions + * - Any %Eigen's vector/array of integers or expressions + * - Plain C arrays: \c int[N] + * - And more generally any type exposing the following two member functions: + * \code + * operator[]() const; + * size() const; + * \endcode + * where \c stands for any integer type compatible with Eigen::Index (i.e. \c std::ptrdiff_t). + * + * The last statement implies compatibility with \c std::vector, \c std::valarray, \c std::array, many of the Range-v3's ranges, etc. + * + * If the submatrix can be represented using a starting position \c (i,j) and positive sizes \c (rows,columns), then this + * method will returns a Block object after extraction of the relevant information from the passed arguments. This is the case + * when all arguments are either: + * - An integer + * - Eigen::all + * - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and Eigen::seqN(a,N). + * + * Otherwise a more general IndexedView object will be returned, after conversion of the inputs + * to more suitable types \c RowIndices' and \c ColIndices'. + * + * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter. + * + * See also this question and its answer for an example of how to duplicate coefficients. + * + * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index) + */ +template +IndexedView_or_Block +operator()(const RowIndices& rowIndices, const ColIndices& colIndices); + +/** This is an overload of operator()(const RowIndices&, const ColIndices&) for 1D vectors or arrays + * + * \only_for_vectors + */ +template +IndexedView_or_VectorBlock +operator()(const Indices& indices); + +#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/MatrixCwiseBinaryOps.h index f1084abefb..a0feef8716 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -39,10 +39,10 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise != operator of *this and \a other @@ -59,10 +59,10 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise min of *this and \a other diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/MatrixCwiseUnaryOps.h index b1be3d566c..0514d8f78b 100644 --- a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/MatrixCwiseUnaryOps.h +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/MatrixCwiseUnaryOps.h @@ -14,6 +14,7 @@ typedef CwiseUnaryOp, const Derived> CwiseAbsReturnType; typedef CwiseUnaryOp, const Derived> CwiseAbs2ReturnType; +typedef CwiseUnaryOp, const Derived> CwiseArgReturnType; typedef CwiseUnaryOp, const Derived> CwiseSqrtReturnType; typedef CwiseUnaryOp, const Derived> CwiseSignReturnType; typedef CwiseUnaryOp, const Derived> CwiseInverseReturnType; @@ -82,4 +83,13 @@ EIGEN_DEVICE_FUNC inline const CwiseInverseReturnType cwiseInverse() const { return CwiseInverseReturnType(derived()); } +/// \returns an expression of the coefficient-wise phase angle of \c *this +/// +/// Example: \include MatrixBase_cwiseArg.cpp +/// Output: \verbinclude MatrixBase_cwiseArg.out +/// +EIGEN_DOC_UNARY_ADDONS(cwiseArg,arg) +EIGEN_DEVICE_FUNC +inline const CwiseArgReturnType +cwiseArg() const { return CwiseArgReturnType(derived()); } diff --git a/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ReshapedMethods.h b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ReshapedMethods.h new file mode 100644 index 0000000000..482a6b045e --- /dev/null +++ b/wpimath/src/main/native/eigeninclude/Eigen/src/plugins/ReshapedMethods.h @@ -0,0 +1,149 @@ + +#ifdef EIGEN_PARSED_BY_DOXYGEN + +/// \returns an expression of \c *this with reshaped sizes. +/// +/// \param nRows the number of rows in the reshaped expression, specified at either run-time or compile-time, or AutoSize +/// \param nCols the number of columns in the reshaped expression, specified at either run-time or compile-time, or AutoSize +/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor), +/// or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor. +/// \tparam NRowsType the type of the value handling the number of rows, typically Index. +/// \tparam NColsType the type of the value handling the number of columns, typically Index. +/// +/// Dynamic size example: \include MatrixBase_reshaped_int_int.cpp +/// Output: \verbinclude MatrixBase_reshaped_int_int.out +/// +/// The number of rows \a nRows and columns \a nCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic. +/// Here is an example with a fixed number of rows and columns: +/// \include MatrixBase_reshaped_fixed.cpp +/// Output: \verbinclude MatrixBase_reshaped_fixed.out +/// +/// Finally, one of the sizes parameter can be automatically deduced from the other one by passing AutoSize as in the following example: +/// \include MatrixBase_reshaped_auto.cpp +/// Output: \verbinclude MatrixBase_reshaped_auto.out +/// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and +/// that the other size is passed at compile-time using Eigen::fix as above. +/// +/// \sa class Reshaped, fix, fix(int) +/// +template +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped(NRowsType nRows, NColsType nCols); + +/// This is the const version of reshaped(NRowsType,NColsType). +template +EIGEN_DEVICE_FUNC +inline const Reshaped +reshaped(NRowsType nRows, NColsType nCols) const; + +/// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector +/// +/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor), +/// or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor. +/// +/// This overloads is essentially a shortcut for `A.reshaped(AutoSize,fix<1>)`. +/// +/// - If `Order==ColMajor` (the default), then it returns a column-vector from the stacked columns of \c *this. +/// - If `Order==RowMajor`, then it returns a column-vector from the stacked rows of \c *this. +/// - If `Order==AutoOrder`, then it returns a column-vector with elements stacked following the storage order of \c *this. +/// This mode is the recommended one when the particular ordering of the element is not relevant. +/// +/// Example: +/// \include MatrixBase_reshaped_to_vector.cpp +/// Output: \verbinclude MatrixBase_reshaped_to_vector.out +/// +/// If you want more control, you can still fall back to reshaped(NRowsType,NColsType). +/// +/// \sa reshaped(NRowsType,NColsType), class Reshaped +/// +template +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped(); + +/// This is the const version of reshaped(). +template +EIGEN_DEVICE_FUNC +inline const Reshaped +reshaped() const; + +#else + +// This file is automatically included twice to generate const and non-const versions + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS +#define EIGEN_RESHAPED_METHOD_CONST const +#else +#define EIGEN_RESHAPED_METHOD_CONST +#endif + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS + +// This part is included once + +#endif + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value, + internal::get_compiletime_reshape_size::value> +reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped::value, + internal::get_compiletime_reshape_size::value> + (derived(), + internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), + internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); +} + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value, + internal::get_compiletime_reshape_size::value, + internal::get_compiletime_reshape_order::value> +reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped::value, + internal::get_compiletime_reshape_size::value, + internal::get_compiletime_reshape_order::value> + (derived(), + internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), + internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); +} + +// Views as linear vectors + +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped() EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped(derived(),size(),1); +} + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value> +reshaped() EIGEN_RESHAPED_METHOD_CONST +{ + EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER); + return Reshaped::value> + (derived(), size(), 1); +} + +#undef EIGEN_RESHAPED_METHOD_CONST + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS +#define EIGEN_RESHAPED_METHOD_2ND_PASS +#include "ReshapedMethods.h" +#undef EIGEN_RESHAPED_METHOD_2ND_PASS +#endif + +#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/AutoDiff b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/AutoDiff index abf5b7d678..7a4ff460ce 100644 --- a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/AutoDiff +++ b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/AutoDiff @@ -28,11 +28,17 @@ namespace Eigen { //@{ } +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + #include "src/AutoDiff/AutoDiffScalar.h" // #include "src/AutoDiff/AutoDiffVector.h" #include "src/AutoDiff/AutoDiffJacobian.h" +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + + + namespace Eigen { //@} } diff --git a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/MatrixFunctions b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/MatrixFunctions index 60dc0a69b3..20c23d1c5f 100644 --- a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/MatrixFunctions +++ b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/MatrixFunctions @@ -14,9 +14,9 @@ #include #include -#include -#include -#include +#include "../../Eigen/Core" +#include "../../Eigen/LU" +#include "../../Eigen/Eigenvalues" /** * \defgroup MatrixFunctions_Module Matrix functions module @@ -53,12 +53,16 @@ * */ +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + #include "src/MatrixFunctions/MatrixExponential.h" #include "src/MatrixFunctions/MatrixFunction.h" #include "src/MatrixFunctions/MatrixSquareRoot.h" #include "src/MatrixFunctions/MatrixLogarithm.h" #include "src/MatrixFunctions/MatrixPower.h" +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + /** \page matrixbaseextra_page diff --git a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 58f3f3319d..0f166e35f0 100644 --- a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -26,11 +26,11 @@ void make_coherent(const A& a, const B&b) make_coherent_impl::run(a.const_cast_derived(), b.const_cast_derived()); } -template struct auto_diff_special_op; +template struct auto_diff_special_op; } // end namespace internal -template class AutoDiffScalar; +template class AutoDiffScalar; template inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) { @@ -38,16 +38,16 @@ inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType:: } /** \class AutoDiffScalar - * \brief A scalar type replacement with automatic differentation capability + * \brief A scalar type replacement with automatic differentiation capability * - * \param _DerType the vector type used to store/represent the derivatives. The base scalar type + * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type * as well as the number of derivatives to compute are determined from this type. * Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf * if the number of derivatives is not known at compile time, and/or, the number * of derivatives is large. - * Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a + * Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a * existing vector into an AutoDiffScalar. - * Finally, _DerType can also be any Eigen compatible expression. + * Finally, DerivativeType can also be any Eigen compatible expression. * * This class represents a scalar value while tracking its respective derivatives using Eigen's expression * template mechanism. @@ -63,17 +63,17 @@ inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType:: * */ -template +template class AutoDiffScalar : public internal::auto_diff_special_op - <_DerType, !internal::is_same::type>::Scalar, - typename NumTraits::type>::Scalar>::Real>::value> + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> { public: typedef internal::auto_diff_special_op - <_DerType, !internal::is_same::type>::Scalar, - typename NumTraits::type>::Scalar>::Real>::value> Base; - typedef typename internal::remove_all<_DerType>::type DerType; + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> Base; + typedef typename internal::remove_all::type DerType; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real Real; @@ -382,16 +382,16 @@ class AutoDiffScalar namespace internal { -template -struct auto_diff_special_op<_DerType, true> -// : auto_diff_scalar_op<_DerType, typename NumTraits::Real, +template +struct auto_diff_special_op +// : auto_diff_scalar_op::Real, // is_same::Real>::value> { - typedef typename remove_all<_DerType>::type DerType; + typedef typename remove_all::type DerType; typedef typename traits::Scalar Scalar; typedef typename NumTraits::Real Real; -// typedef auto_diff_scalar_op<_DerType, typename NumTraits::Real, +// typedef auto_diff_scalar_op::Real, // is_same::Real>::value> Base; // using Base::operator+; @@ -401,8 +401,8 @@ struct auto_diff_special_op<_DerType, true> // using Base::operator*; // using Base::operator*=; - const AutoDiffScalar<_DerType>& derived() const { return *static_cast*>(this); } - AutoDiffScalar<_DerType>& derived() { return *static_cast*>(this); } + const AutoDiffScalar& derived() const { return *static_cast*>(this); } + AutoDiffScalar& derived() { return *static_cast*>(this); } inline const AutoDiffScalar operator+(const Real& other) const @@ -410,12 +410,12 @@ struct auto_diff_special_op<_DerType, true> return AutoDiffScalar(derived().value() + other, derived().derivatives()); } - friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar<_DerType>& b) + friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar& b) { return AutoDiffScalar(a + b.value(), b.derivatives()); } - inline AutoDiffScalar<_DerType>& operator+=(const Real& other) + inline AutoDiffScalar& operator+=(const Real& other) { derived().value() += other; return derived(); @@ -431,22 +431,22 @@ struct auto_diff_special_op<_DerType, true> } friend inline const AutoDiffScalar >, DerType>::Type > - operator*(const Real& other, const AutoDiffScalar<_DerType>& a) + operator*(const Real& other, const AutoDiffScalar& a) { return AutoDiffScalar >, DerType>::Type >( a.value() * other, a.derivatives() * other); } - inline AutoDiffScalar<_DerType>& operator*=(const Scalar& other) + inline AutoDiffScalar& operator*=(const Scalar& other) { *this = *this * other; return derived(); } }; -template -struct auto_diff_special_op<_DerType, false> +template +struct auto_diff_special_op { void operator*() const; void operator-() const; @@ -565,6 +565,11 @@ struct ScalarBinaryOpTraits, Bi CODE; \ } +template +struct CleanedUpDerType { + typedef AutoDiffScalar::type::PlainObject> type; +}; + template inline const AutoDiffScalar& conj(const AutoDiffScalar& x) { return x; } template @@ -572,31 +577,31 @@ inline const AutoDiffScalar& real(const AutoDiffScalar& x) { template inline typename DerType::Scalar imag(const AutoDiffScalar&) { return 0.; } template -inline AutoDiffScalar::type::PlainObject> (min)(const AutoDiffScalar& x, const T& y) { - typedef AutoDiffScalar::type::PlainObject> ADS; +inline typename CleanedUpDerType::type (min)(const AutoDiffScalar& x, const T& y) { + typedef typename CleanedUpDerType::type ADS; return (x <= y ? ADS(x) : ADS(y)); } template -inline AutoDiffScalar::type::PlainObject> (max)(const AutoDiffScalar& x, const T& y) { - typedef AutoDiffScalar::type::PlainObject> ADS; +inline typename CleanedUpDerType::type (max)(const AutoDiffScalar& x, const T& y) { + typedef typename CleanedUpDerType::type ADS; return (x >= y ? ADS(x) : ADS(y)); } template -inline AutoDiffScalar::type::PlainObject> (min)(const T& x, const AutoDiffScalar& y) { - typedef AutoDiffScalar::type::PlainObject> ADS; +inline typename CleanedUpDerType::type (min)(const T& x, const AutoDiffScalar& y) { + typedef typename CleanedUpDerType::type ADS; return (x < y ? ADS(x) : ADS(y)); } template -inline AutoDiffScalar::type::PlainObject> (max)(const T& x, const AutoDiffScalar& y) { - typedef AutoDiffScalar::type::PlainObject> ADS; +inline typename CleanedUpDerType::type (max)(const T& x, const AutoDiffScalar& y) { + typedef typename CleanedUpDerType::type ADS; return (x > y ? ADS(x) : ADS(y)); } template -inline AutoDiffScalar::type::PlainObject> (min)(const AutoDiffScalar& x, const AutoDiffScalar& y) { +inline typename CleanedUpDerType::type (min)(const AutoDiffScalar& x, const AutoDiffScalar& y) { return (x.value() < y.value() ? x : y); } template -inline AutoDiffScalar::type::PlainObject> (max)(const AutoDiffScalar& x, const AutoDiffScalar& y) { +inline typename CleanedUpDerType::type (max)(const AutoDiffScalar& x, const AutoDiffScalar& y) { return (x.value() >= y.value() ? x : y); } @@ -711,10 +716,15 @@ template struct NumTraits > } namespace std { + template class numeric_limits > : public numeric_limits {}; +template +class numeric_limits > + : public numeric_limits {}; + } // namespace std #endif // EIGEN_AUTODIFF_SCALAR_H diff --git a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index 0b0ee6546a..02284b0ddf 100644 --- a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -314,7 +314,7 @@ struct matrix_exp_computeUV matrix_exp_pade17(A, U, V); } -#elif LDBL_MANT_DIG <= 112 // quadruple precison +#elif LDBL_MANT_DIG <= 113 // quadruple precision if (l1norm < 1.639394610288918690547467954466970e-005L) { matrix_exp_pade3(arg, U, V); @@ -347,7 +347,7 @@ struct matrix_exp_computeUV template struct is_exp_known_type : false_type {}; template<> struct is_exp_known_type : true_type {}; template<> struct is_exp_known_type : true_type {}; -#if LDBL_MANT_DIG <= 112 +#if LDBL_MANT_DIG <= 113 template<> struct is_exp_known_type : true_type {}; #endif @@ -396,7 +396,6 @@ void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // d template struct MatrixExponentialReturnValue : public ReturnByValue > { - typedef typename Derived::Index Index; public: /** \brief Constructor. * diff --git a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index 3df82394cb..cc12ab62ba 100644 --- a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -53,7 +53,7 @@ template typename NumTraits::Real matrix_function_compute_mu(const MatrixType& A) { typedef typename plain_col_type::type VectorType; - typename MatrixType::Index rows = A.rows(); + Index rows = A.rows(); const MatrixType N = MatrixType::Identity(rows, rows) - A; VectorType e = VectorType::Ones(rows); N.template triangularView().solveInPlace(e); @@ -65,7 +65,6 @@ MatrixType MatrixFunctionAtomic::compute(const MatrixType& A) { // TODO: Use that A is upper triangular typedef typename NumTraits::Real RealScalar; - typedef typename MatrixType::Index Index; Index rows = A.rows(); Scalar avgEival = A.trace() / Scalar(RealScalar(rows)); MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows); @@ -73,10 +72,10 @@ MatrixType MatrixFunctionAtomic::compute(const MatrixType& A) MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows); MatrixType P = Ashifted; MatrixType Fincr; - for (Index s = 1; s < 1.1 * rows + 10; s++) { // upper limit is fairly arbitrary + for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) { // upper limit is fairly arbitrary Fincr = m_f(avgEival, static_cast(s)) * P; F += Fincr; - P = Scalar(RealScalar(1.0/(s + 1))) * P * Ashifted; + P = Scalar(RealScalar(1)/RealScalar(s + 1)) * P * Ashifted; // test whether Taylor series converged const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff(); @@ -131,7 +130,6 @@ typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOf template void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list& clusters) { - typedef typename EivalsType::Index Index; typedef typename EivalsType::RealScalar RealScalar; for (Index i=0; i void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster) { - typedef typename EivalsType::Index Index; eivalToCluster.resize(eivals.rows()); Index clusterIndex = 0; for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) { @@ -205,7 +202,6 @@ void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& template void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation) { - typedef typename VectorType::Index Index; DynVectorType indexNextEntry = blockStart; permutation.resize(eivalToCluster.rows()); for (Index i = 0; i < eivalToCluster.rows(); i++) { @@ -219,7 +215,6 @@ void matrix_function_compute_permutation(const DynVectorType& blockStart, const template void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T) { - typedef typename VectorType::Index Index; for (Index i = 0; i < permutation.rows() - 1; i++) { Index j; for (j = i; j < permutation.rows(); j++) { @@ -247,7 +242,7 @@ template void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT) { fT.setZero(T.rows(), T.cols()); - for (typename VectorType::Index i = 0; i < clusterSize.rows(); ++i) { + for (Index i = 0; i < clusterSize.rows(); ++i) { fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)) = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))); } @@ -285,7 +280,6 @@ MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const eigen_assert(C.rows() == A.rows()); eigen_assert(C.cols() == B.rows()); - typedef typename MatrixType::Index Index; typedef typename MatrixType::Scalar Scalar; Index m = A.rows(); @@ -330,11 +324,8 @@ void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorTyp { typedef internal::traits Traits; typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::Index Index; - static const int RowsAtCompileTime = Traits::RowsAtCompileTime; - static const int ColsAtCompileTime = Traits::ColsAtCompileTime; static const int Options = MatrixType::Options; - typedef Matrix DynMatrixType; + typedef Matrix DynMatrixType; for (Index k = 1; k < clusterSize.rows(); k++) { for (Index i = 0; i < clusterSize.rows() - k; i++) { @@ -428,7 +419,8 @@ struct matrix_function_compute typedef internal::traits Traits; // compute Schur decomposition of A - const ComplexSchur schurOfA(A); + const ComplexSchur schurOfA(A); + eigen_assert(schurOfA.info()==Success); MatrixType T = schurOfA.matrixT(); MatrixType U = schurOfA.matrixU(); @@ -480,7 +472,6 @@ template class MatrixFunctionReturnValue { public: typedef typename Derived::Scalar Scalar; - typedef typename Derived::Index Index; typedef typename internal::stem_function::type StemFunction; protected: @@ -505,10 +496,8 @@ template class MatrixFunctionReturnValue typedef typename internal::nested_eval::type NestedEvalType; typedef typename internal::remove_all::type NestedEvalTypeClean; typedef internal::traits Traits; - static const int RowsAtCompileTime = Traits::RowsAtCompileTime; - static const int ColsAtCompileTime = Traits::ColsAtCompileTime; typedef std::complex::Real> ComplexScalar; - typedef Matrix DynMatrixType; + typedef Matrix DynMatrixType; typedef internal::MatrixFunctionAtomic AtomicType; AtomicType atomic(m_f); diff --git a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index cf5fffad30..e917013e0e 100644 --- a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -62,8 +62,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) else { // computation in previous branch is inaccurate if A(1,1) \approx A(0,0) - int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI))); - result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y; + RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); + result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,RealScalar(2*EIGEN_PI)*unwindingNumber)) / y; } } @@ -135,7 +135,8 @@ void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree const int minPadeDegree = 3; const int maxPadeDegree = 11; assert(degree >= minPadeDegree && degree <= maxPadeDegree); - + // FIXME this creates float-conversion-warnings if these are enabled. + // Either manually convert each value, or disable the warning locally const RealScalar nodes[][maxPadeDegree] = { { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L, // degree 3 0.8872983346207416885179265399782400L }, @@ -232,12 +233,13 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result) int degree; MatrixType T = A, sqrtT; - int maxPadeDegree = matrix_log_max_pade_degree::value; - const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1L: // single precision + const int maxPadeDegree = matrix_log_max_pade_degree::value; + const RealScalar maxNormForPade = RealScalar( + maxPadeDegree<= 5? 5.3149729967117310e-1L: // single precision maxPadeDegree<= 7? 2.6429608311114350e-1L: // double precision maxPadeDegree<= 8? 2.32777776523703892094e-1L: // extended precision maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L: // double-double - 1.1880960220216759245467951592883642e-1L; // quadruple precision + 1.1880960220216759245467951592883642e-1L); // quadruple precision while (true) { RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff(); @@ -254,7 +256,7 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result) } matrix_log_compute_pade(result, T, degree); - result *= pow(RealScalar(2), numberOfSquareRoots); + result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots)); // TODO replace by bitshift if possible } /** \ingroup MatrixFunctions_Module @@ -332,10 +334,8 @@ public: typedef typename internal::nested_eval::type DerivedEvalType; typedef typename internal::remove_all::type DerivedEvalTypeClean; typedef internal::traits Traits; - static const int RowsAtCompileTime = Traits::RowsAtCompileTime; - static const int ColsAtCompileTime = Traits::ColsAtCompileTime; typedef std::complex::Real> ComplexScalar; - typedef Matrix DynMatrixType; + typedef Matrix DynMatrixType; typedef internal::MatrixLogarithmAtomic AtomicType; AtomicType atomic; diff --git a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index a3273da4e5..d7672d7c97 100644 --- a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -40,7 +40,6 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen { public: typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::Index Index; /** * \brief Constructor. @@ -81,7 +80,7 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen * * \note Currently this class is only used by MatrixPower. One may * insist that this be nested into MatrixPower. This class is here to - * faciliate future development of triangular matrix functions. + * facilitate future development of triangular matrix functions. */ template class MatrixPowerAtomic : internal::noncopyable @@ -94,7 +93,6 @@ class MatrixPowerAtomic : internal::noncopyable typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef std::complex ComplexScalar; - typedef typename MatrixType::Index Index; typedef Block ResultType; const MatrixType& m_A; @@ -162,11 +160,11 @@ template void MatrixPowerAtomic::computePade(int degree, const MatrixType& IminusT, ResultType& res) const { int i = 2*degree; - res = (m_p-degree) / (2*i-2) * IminusT; + res = (m_p-RealScalar(degree)) / RealScalar(2*i-2) * IminusT; for (--i; i; --i) { res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView() - .solve((i==1 ? -m_p : i&1 ? (-m_p-i/2)/(2*i) : (m_p-i/2)/(2*i-2)) * IminusT).eval(); + .solve((i==1 ? -m_p : i&1 ? (-m_p-RealScalar(i/2))/RealScalar(2*i) : (m_p-RealScalar(i/2))/RealScalar(2*i-2)) * IminusT).eval(); } res += MatrixType::Identity(IminusT.rows(), IminusT.cols()); } @@ -196,11 +194,12 @@ void MatrixPowerAtomic::computeBig(ResultType& res) const { using std::ldexp; const int digits = std::numeric_limits::digits; - const RealScalar maxNormForPade = digits <= 24? 4.3386528e-1L // single precision + const RealScalar maxNormForPade = RealScalar( + digits <= 24? 4.3386528e-1L // single precision : digits <= 53? 2.789358995219730e-1L // double precision : digits <= 64? 2.4471944416607995472e-1L // extended precision : digits <= 106? 1.1016843812851143391275867258512e-1L // double-double - : 9.134603732914548552537150753385375e-2L; // quadruple precision + : 9.134603732914548552537150753385375e-2L); // quadruple precision MatrixType IminusT, sqrtT, T = m_A.template triangularView(); RealScalar normIminusT; int degree, degree2, numberOfSquareRoots = 0; @@ -298,8 +297,8 @@ MatrixPowerAtomic::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar logCurr = log(curr); ComplexScalar logPrev = log(prev); - int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); - ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber); + RealScalar unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); + ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI)*unwindingNumber); return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev); } @@ -340,7 +339,6 @@ class MatrixPower : internal::noncopyable private: typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::Index Index; public: /** @@ -600,7 +598,6 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue ComplexScalar; - typedef typename Derived::Index Index; /** * \brief Constructor. diff --git a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h index 9de0c3574e..e363e779de 100644 --- a/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +++ b/wpimath/src/main/native/eigeninclude/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h @@ -17,7 +17,7 @@ namespace internal { // pre: T.block(i,i,2,2) has complex conjugate eigenvalues // post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2) template -void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typename MatrixType::Index i, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT) { // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere // in EigenSolver. If we expose it, we could call it directly from here. @@ -32,7 +32,7 @@ void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typena // all blocks of sqrtT to left of and below (i,j) are correct // post: sqrtT(i,j) has the correct value template -void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) { typedef typename traits::Scalar Scalar; Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value(); @@ -41,7 +41,7 @@ void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, ty // similar to compute1x1offDiagonalBlock() template -void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) { typedef typename traits::Scalar Scalar; Matrix rhs = T.template block<1,2>(i,j); @@ -54,7 +54,7 @@ void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, ty // similar to compute1x1offDiagonalBlock() template -void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) { typedef typename traits::Scalar Scalar; Matrix rhs = T.template block<2,1>(i,j); @@ -101,7 +101,7 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const // similar to compute1x1offDiagonalBlock() template -void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) { typedef typename traits::Scalar Scalar; Matrix A = sqrtT.template block<2,2>(i,i); @@ -204,7 +204,7 @@ template void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result) { using std::sqrt; - typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Scalar Scalar; eigen_assert(arg.rows() == arg.cols());