hal/src/main/native/athena/AddressableLEDSimd.h

// Copyright (c) FIRST and other WPILib contributors.
// Open Source Software; you can modify and/or share it under the terms of
// the WPILib BSD license file in the root directory of this project.

#pragma once
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>

#include "hal/AddressableLEDTypes.h"
#include "simd/simd.h"

// Timing info
// https://developer.arm.com/documentation/ddi0409/i/instruction-timing/instruction-specific-scheduling/advanced-simd-load-store-instructions?lang=en

namespace hal::detail {

using namespace Simd::Neon;

// Pointer to a conversion routine that takes one argument of type T and
// returns nothing. Used below as a non-type template parameter so the
// conversion to apply is chosen at compile time.
template <typename T>
using ConvertFunc = void (*)(T);

/*
 * Conversion functions perform in-place conversion by swapping elements.
 * The names of the functions indicate the wire output (default GRB),
 * but the FPGA takes sequences of BGR_.
 */

/**
 * Reorders a channel sequence in place for RGB output by exchanging
 * elements 1 and 2 (the G and R channels). All other elements are untouched.
 * @tparam T an indexable handle (e.g. a pointer) to the channel elements
 * @param val the channel sequence to reorder; needs at least 3 elements
 */
template <typename T>
void ToRGB(T val) {
  std::swap(val[2], val[1]);  // exchange R and G
}

/**
 * Reorders a channel sequence in place for RBG output. The net effect is a
 * rotation of the first three elements: element 0 receives the old element 1,
 * element 1 receives the old element 2, and element 2 receives the old
 * element 0. This is the same result as swapping elements 1 and 2 and then
 * elements 0 and 2.
 * @tparam T an indexable handle (e.g. a pointer) to the channel elements
 * @param val the channel sequence to reorder; needs at least 3 elements
 */
template <typename T>
void ToRBG(T val) {
  auto first = val[0];  // keep the old element 0 while the others shift down
  val[0] = val[1];
  val[1] = val[2];
  val[2] = first;
}

/**
 * Reorders a channel sequence in place for BGR output. The net effect is a
 * rotation of the first three elements: element 0 receives the old element 2,
 * element 1 receives the old element 0, and element 2 receives the old
 * element 1. This is the same result as swapping elements 0 and 1 and then
 * elements 0 and 2.
 * @tparam T an indexable handle (e.g. a pointer) to the channel elements
 * @param val the channel sequence to reorder; needs at least 3 elements
 */
template <typename T>
void ToBGR(T val) {
  auto last = val[2];  // keep the old element 2 while the others shift up
  val[2] = val[1];
  val[1] = val[0];
  val[0] = last;
}

/**
 * Reorders a channel sequence in place for BRG output by exchanging
 * elements 0 and 1 (the B and G channels). All other elements are untouched.
 * @tparam T an indexable handle (e.g. a pointer) to the channel elements
 * @param val the channel sequence to reorder; needs at least 2 elements
 */
template <typename T>
void ToBRG(T val) {
  std::swap(val[1], val[0]);  // exchange G and B
}

/**
 * Reorders a channel sequence in place for GBR output by exchanging
 * elements 0 and 2 (the B and R channels). All other elements are untouched.
 * @tparam T an indexable handle (e.g. a pointer) to the channel elements
 * @param val the channel sequence to reorder; needs at least 3 elements
 */
template <typename T>
void ToGBR(T val) {
  std::swap(val[2], val[0]);  // exchange R and B
}

/**
 * Copies 16 pixels from src to dst using NEON instructions, converting using
 * the provided conversion function. Optimizes based on alignment of input and
 * output arrays specified by srcAlign and dstAlign
 * @tparam srcAlign whether src is aligned to the size of a NEON register (16
 * bytes)
 * @tparam dstAlign whether dst is aligned to the size of a NEON register (16
 * bytes)
 * @tparam the conversion function
 * @param[in] src The source array
 * @param[out] dst the destination array
 * @pre src and dst must contain at least 64 bytes (16 pixels)
 * @pre if srcAlign is true, src must be 16 byte aligned
 * @pre if dstAlign is true, src muts be 16 byte aligned
 */
template <bool srcAlign, bool dstAlign, ConvertFunc<uint8x16_t*> Convert>
void ConvertNEON_16(const uint8_t* src, uint8_t* dst) {
  uint8x16x4_t pixels = Load4<srcAlign>(src);
  Convert(pixels.val);
  Store4<dstAlign>(dst, pixels);
}

/**
 * Copies 8 pixels from src to dst using NEON instructions, converting using
 * the provided conversion function. Optimizes based on alignment of input and
 * output arrays specified by srcAlign and dstAlign
 * @tparam srcAlign whether src is aligned to the size of a NEON register (16
 * bytes)
 * @tparam dstAlign whether dst is aligned to the size of a NEON register (16
 * bytes)
 * @tparam the conversion function
 * @param[in] src The source array
 * @param[out] dst the destination array
 * @pre src and dst must contain at least 32 bytes (8 pixels)
 * @pre if srcAlign is true, src must be 16 byte aligned
 * @pre if dstAlign is true, src muts be 16 byte aligned
 */
template <bool srcAlign, bool dstAlign, ConvertFunc<uint8x8_t*> Convert>
void ConvertNEON_8(const uint8_t* src, uint8_t* dst) {
  uint8x8x4_t pixels = LoadHalf4<srcAlign>(src);
  Convert(pixels.val);
  Store4<dstAlign>(dst, pixels);
}

/**
 * Copies 16 pixels from src to dst, converting from GRB (wire order) to
 * order by forwarding to ConvertNEON_16 with the matching conversion
 * function. The order is a template argument, so the choice is made at
 * compile time.
 *
 * Only RGB, RBG, BGR, BRG and GBR are handled. For any other order
 * (including HAL_ALED_GRB) this function does nothing and dst is left
 * unwritten.
 * @tparam order the color order to convert to
 * @tparam srcAlign whether src is aligned to the size of a NEON register (16
 * bytes)
 * @tparam dstAlign whether dst is aligned to the size of a NEON register (16
 * bytes)
 * @param[in] src the source array
 * @param[out] dst the destination array
 * @pre src and dst must each contain at least 64 bytes (16 pixels)
 * @pre if srcAlign is true, src must be 16 byte aligned
 * @pre if dstAlign is true, dst must be 16 byte aligned
 */
template <HAL_AddressableLEDColorOrder order, bool srcAlign, bool dstAlign>
void Convert16Pixels(const uint8_t* src, uint8_t* dst) {
  if constexpr (order == HAL_ALED_RGB) {
    ConvertNEON_16<srcAlign, dstAlign, ToRGB>(src, dst);
  } else if constexpr (order == HAL_ALED_RBG) {
    ConvertNEON_16<srcAlign, dstAlign, ToRBG>(src, dst);
  } else if constexpr (order == HAL_ALED_BGR) {
    ConvertNEON_16<srcAlign, dstAlign, ToBGR>(src, dst);
  } else if constexpr (order == HAL_ALED_BRG) {
    ConvertNEON_16<srcAlign, dstAlign, ToBRG>(src, dst);
  } else if constexpr (order == HAL_ALED_GBR) {
    ConvertNEON_16<srcAlign, dstAlign, ToGBR>(src, dst);
  }
}

/**
 * Copies 8 pixels from src to dst, converting from GRB (wire order) to order
 * by forwarding to ConvertNEON_8 with the matching conversion function. The
 * order is a template argument, so the choice is made at compile time.
 *
 * Only RGB, RBG, BGR, BRG and GBR are handled. For any other order
 * (including HAL_ALED_GRB) this function does nothing and dst is left
 * unwritten.
 * @tparam order the color order to convert to
 * @tparam srcAlign whether src is aligned to the size of a NEON register (16
 * bytes)
 * @tparam dstAlign whether dst is aligned to the size of a NEON register (16
 * bytes)
 * @param[in] src the source array
 * @param[out] dst the destination array
 * @pre src and dst must each contain at least 32 bytes (8 pixels)
 * @pre if srcAlign is true, src must be 16 byte aligned
 * @pre if dstAlign is true, dst must be 16 byte aligned
 */
template <HAL_AddressableLEDColorOrder order, bool srcAlign, bool dstAlign>
void Convert8Pixels(const uint8_t* src, uint8_t* dst) {
  if constexpr (order == HAL_ALED_RGB) {
    ConvertNEON_8<srcAlign, dstAlign, ToRGB>(src, dst);
  } else if constexpr (order == HAL_ALED_RBG) {
    ConvertNEON_8<srcAlign, dstAlign, ToRBG>(src, dst);
  } else if constexpr (order == HAL_ALED_BGR) {
    ConvertNEON_8<srcAlign, dstAlign, ToBGR>(src, dst);
  } else if constexpr (order == HAL_ALED_BRG) {
    ConvertNEON_8<srcAlign, dstAlign, ToBRG>(src, dst);
  } else if constexpr (order == HAL_ALED_GBR) {
    ConvertNEON_8<srcAlign, dstAlign, ToGBR>(src, dst);
  }
}

/**
 * Copies 1 pixel (4 bytes) from src to dst, reordering its first three bytes
 * for the requested color order. HAL_ALED_GRB is copied through unchanged.
 *
 * Declared inline because this is a non-template function defined in a
 * header: without it, including this header from more than one translation
 * unit yields multiple definitions of the same symbol.
 *
 * @param[in] order the color order to convert to
 * @param[in] src the source array
 * @param[out] dst the destination array
 * @pre src and dst must each contain at least 1 pixel (4 bytes).
 */
inline void Convert1Pixel(HAL_AddressableLEDColorOrder order,
                          const uint8_t* src, uint8_t* dst) {
  // Work on a local copy so src is never modified and dst is written once.
  uint8_t tmp[4];
  std::memcpy(tmp, src, 4);  // Load 4 bytes
  // convert based on order
  switch (order) {
    case HAL_ALED_RGB:
      ToRGB(tmp);
      break;
    case HAL_ALED_RBG:
      ToRBG(tmp);
      break;
    case HAL_ALED_BGR:
      ToBGR(tmp);
      break;
    case HAL_ALED_BRG:
      ToBRG(tmp);
      break;
    case HAL_ALED_GBR:
      ToGBR(tmp);
      break;
    case HAL_ALED_GRB:
      break;  // this shouldn't ever get hit but compiler
              // wants this to be exhaustive
  }
  std::memcpy(dst, tmp, 4);  // Store 4 bytes
}
/**
 * Copies len pixels from src to dst, converting from GRB (wire order) to order.
 * Optimizes based on alignment of input and output arrays specified by srcAlign
 * and dstAlign.
 *
 * Three strategies are used depending on len:
 *  - len >= 16: whole 16-pixel groups, then one extra unaligned 16-pixel
 *    conversion ending exactly at the last pixel if a remainder is left.
 *  - 8 <= len < 16: one 8-pixel conversion at the start, then (if len > 8) one
 *    unaligned 8-pixel conversion ending exactly at the last pixel.
 *  - len < 8: one pixel at a time.
 *
 * The trailing conversions in the first two cases re-read source bytes that
 * an earlier conversion already covered. NOTE(review): this looks unsafe for
 * src == dst, since those pixels would be converted twice -- confirm callers
 * always pass distinct buffers.
 *
 * @tparam order the color order to convert to
 * @tparam srcAlign whether src is aligned to the size of a NEON register (16
 * bytes)
 * @tparam dstAlign whether dst is aligned to the size of a NEON register (16
 * bytes)
 * @param[in] src the source array
 * @param[out] dst the destination array
 * @param[in] len the size (in pixels, len = (size in bytes) / 4)
 * @pre src and dst must have at least len*4 capacity in bytes
 * @pre if srcAlign is true, src must be 16 byte aligned
 * @pre if dstAlign is true, dst must be 16 byte aligned
 */
template <HAL_AddressableLEDColorOrder order, bool srcAlign, bool dstAlign>
void ConvertPixels(const uint8_t* src, uint8_t* dst, size_t len) {
  if (len >= 16) {
    constexpr size_t A4 =
        A * 4;  // Stride of 1 16-pixel conversion operation. (4 NEON registers)
    size_t size = len * 4;
    size_t aligned = Simd::AlignLo(
        size, A4);  // number of bytes we can copy with whole 16-pixel strides
    for (size_t i = 0; i < aligned; i += A4) {
      Convert16Pixels<order, srcAlign, dstAlign>(src + i, dst + i);
    }
    if (aligned < size) {
      // The tail start is not generally 16-byte aligned, so force the
      // unaligned load/store variants regardless of srcAlign/dstAlign.
      Convert16Pixels<order, false, false>(
          src + size - A4,
          dst + size - A4);  // copy last 16 pixels, possibly recopying.
    }
  } else if (len >= 8) {
    // If len between 8 and 16, we can do 1 or 2 8-pixel copies
    Convert8Pixels<order, srcAlign, dstAlign>(src, dst);
    if (len > 8) {
      size_t recopyOffset = (len * 4) - (HA * 4);
      Convert8Pixels<order, false, false>(
          src + recopyOffset,
          dst + recopyOffset);  // copy last 8 pixels, possibly recopying
    }
  } else {
    // Just copy pixel-by-pixel for <8. len counts pixels, so every pixel index
    // must be visited and scaled by 4 to obtain its byte offset.
    for (size_t pixel = 0; pixel < len; ++pixel) {
      size_t offset = pixel * 4;
      Convert1Pixel(order, src + offset, dst + offset);
    }
  }
}

/**
 * Copies pixelCount pixels from src to dst, converting to the specified
 * order. Checks the alignment of both pointers at run time with Aligned() and
 * forwards to the ConvertPixels overload whose srcAlign/dstAlign template
 * flags match.
 * @tparam order the color order to convert to
 * @param src the source array
 * @param dst the destination array
 * @param pixelCount the number of pixels to convert and copy
 */
template <HAL_AddressableLEDColorOrder order>
void ConvertPixels(const uint8_t* src, uint8_t* dst, size_t pixelCount) {
  const bool srcIsAligned = Aligned(src);
  const bool dstIsAligned = Aligned(dst);

  if (srcIsAligned) {
    if (dstIsAligned) {
      ConvertPixels<order, true, true>(src, dst, pixelCount);
    } else {
      ConvertPixels<order, true, false>(src, dst, pixelCount);
    }
  } else {
    if (dstIsAligned) {
      ConvertPixels<order, false, true>(src, dst, pixelCount);
    } else {
      ConvertPixels<order, false, false>(src, dst, pixelCount);
    }
  }
}
}  // namespace hal::detail
[hal, wpilib] AddressableLED: add support for other color orders (#7102) Many LED strips use different color order (GRB in particular is common). This makes the change at the HAL level. This solves 2 problems; first, no code needs to change in the high level drivers, which was challenging for C++, and second, simulation will behave properly as no conversion is needed. The HAL will accept an array of data objects in the same order no matter what the selected output order is, and will convert before sending it to the FPGA for output. To accomplish this, NEON bulk load/interleave instructions are utilized. The low level implementation (load, store, and alignment functions) come from the Simd Library. The high level implementations are inspired by the image conversion functions in the simd library, but have diverged significantly. Much of the implementation uses templates and inlined functions rather than runtime parameters; This is a trade off between the size of the generated code and the amount of function calls done at runtime. Currently, the entire conversion operation is inlined. 2025-02-07 15:36:41 -05:00			`// Copyright (c) FIRST and other WPILib contributors.`
			`// Open Source Software; you can modify and/or share it under the terms of`
			`// the WPILib BSD license file in the root directory of this project.`

			`#pragma once`
			`#include <utility>`

			`#include "hal/AddressableLEDTypes.h"`
			`#include "simd/simd.h"`

			`// Timing info`
			`// https://developer.arm.com/documentation/ddi0409/i/instruction-timing/instruction-specific-scheduling/advanced-simd-load-store-instructions?lang=en`

			`namespace hal::detail {`

			`using namespace Simd::Neon;`

			`template <typename T>`
			`using ConvertFunc = void (*)(T);`

			`/*`
			`* Conversion funtions perform in-place conversion by swapping elements.`
			`* The names of the functions indicate the wire output (default GRB),`
			`* but the FPGA takes sequences of BGR_.`
			`*/`

			`template <typename T>`
			`void ToRGB(T val) {`
			`std::swap(val[1], val[2]); // swap G and R`
			`}`

			`template <typename T>`
			`void ToRBG(T val) {`
			`std::swap(val[1], val[2]); // swap G and R`
			`std::swap(val[0], val[2]); // swap B and G`
			`}`

			`template <typename T>`
			`void ToBGR(T val) {`
			`std::swap(val[0], val[1]); // swap B and G`
			`std::swap(val[0], val[2]); // swap G and R`
			`}`

			`template <typename T>`
			`void ToBRG(T val) {`
			`std::swap(val[0], val[1]); // swap B and G`
			`}`

			`template <typename T>`
			`void ToGBR(T val) {`
			`std::swap(val[0], val[2]); // swap B and R`
			`}`

			`/**`
			`* Copies 16 pixels from src to dst using NEON instructions, converting using`
			`* the provided conversion function. Optimizes based on alignment of input and`
			`* output arrays specified by srcAlign and dstAlign`
			`* @tparam srcAlign whether src is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @tparam dstAlign whether dst is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @tparam the conversion function`
			`* @param[in] src The source array`
			`* @param[out] dst the destination array`
			`* @pre src and dst must contain at least 64 bytes (16 pixels)`
			`* @pre if srcAlign is true, src must be 16 byte aligned`
			`* @pre if dstAlign is true, src muts be 16 byte aligned`
			`*/`
			`template <bool srcAlign, bool dstAlign, ConvertFunc<uint8x16_t*> Convert>`
			`void ConvertNEON_16(const uint8_t* src, uint8_t* dst) {`
			`uint8x16x4_t pixels = Load4<srcAlign>(src);`
			`Convert(pixels.val);`
			`Store4<dstAlign>(dst, pixels);`
			`}`

			`/**`
			`* Copies 8 pixels from src to dst using NEON instructions, converting using`
			`* the provided conversion function. Optimizes based on alignment of input and`
			`* output arrays specified by srcAlign and dstAlign`
			`* @tparam srcAlign whether src is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @tparam dstAlign whether dst is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @tparam the conversion function`
			`* @param[in] src The source array`
			`* @param[out] dst the destination array`
			`* @pre src and dst must contain at least 32 bytes (8 pixels)`
			`* @pre if srcAlign is true, src must be 16 byte aligned`
			`* @pre if dstAlign is true, src muts be 16 byte aligned`
			`*/`
			`template <bool srcAlign, bool dstAlign, ConvertFunc<uint8x8_t*> Convert>`
			`void ConvertNEON_8(const uint8_t* src, uint8_t* dst) {`
			`uint8x8x4_t pixels = LoadHalf4<srcAlign>(src);`
			`Convert(pixels.val);`
			`Store4<dstAlign>(dst, pixels);`
			`}`

			`/**`
			`* Copies 16 pixels from src to dst, converting from GRB (wire order) to order.`
			`* Optimizes based on alignment of input and output arrays specified by srcAlign`
			`* and dstAlign`
			`* @tparam order the color order to convert to`
			`* @tparam srcAlign whether src is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @tparam dstAlign whether dst is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @param[in] src The source array`
			`* @param[out] dst the destination array`
			`* @pre src and dst must contain at least 64 bytes (16 pixels)`
			`* @pre if srcAlign is true, src must be 16 byte aligned`
			`* @pre if dstAlign is true, src muts be 16 byte aligned`
			`*/`
			`template <HAL_AddressableLEDColorOrder order, bool srcAlign, bool dstAlign>`
			`void Convert16Pixels(const uint8_t* src, uint8_t* dst) {`
			`switch (order) {`
			`case HAL_ALED_RGB:`
			`ConvertNEON_16<srcAlign, dstAlign, ToRGB>(src, dst);`
			`break;`
			`case HAL_ALED_RBG:`
			`ConvertNEON_16<srcAlign, dstAlign, ToRBG>(src, dst);`
			`break;`
			`case HAL_ALED_BGR:`
			`ConvertNEON_16<srcAlign, dstAlign, ToBGR>(src, dst);`
			`break;`
			`case HAL_ALED_BRG:`
			`ConvertNEON_16<srcAlign, dstAlign, ToBRG>(src, dst);`
			`break;`
			`case HAL_ALED_GBR:`
			`ConvertNEON_16<srcAlign, dstAlign, ToGBR>(src, dst);`
			`break;`
			`}`
			`}`

			`/**`
			`* Copies 8 pixels from src to dst, converting from GRB (wire order) to order.`
			`* Optimizes based on alignment of input and output arrays specified by srcAlign`
			`* and dstAlign`
			`* @tparam order the color order to convert to`
			`* @tparam srcAlign whether src is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @tparam dstAlign whether dst is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @param[in] src The source array`
			`* @param[out] dst the destination array`
			`* @pre src and dst must contain at least 32 bytes (8 pixels)`
			`* @pre if srcAlign is true, src must be 16 byte aligned`
			`* @pre if dstAlign is true, src muts be 16 byte aligned`
			`*/`
			`template <HAL_AddressableLEDColorOrder order, bool srcAlign, bool dstAlign>`
			`void Convert8Pixels(const uint8_t* src, uint8_t* dst) {`
			`switch (order) {`
			`case HAL_ALED_RGB:`
			`ConvertNEON_8<srcAlign, dstAlign, ToRGB>(src, dst);`
			`break;`
			`case HAL_ALED_RBG:`
			`ConvertNEON_8<srcAlign, dstAlign, ToRBG>(src, dst);`
			`break;`
			`case HAL_ALED_BGR:`
			`ConvertNEON_8<srcAlign, dstAlign, ToBGR>(src, dst);`
			`break;`
			`case HAL_ALED_BRG:`
			`ConvertNEON_8<srcAlign, dstAlign, ToBRG>(src, dst);`
			`break;`
			`case HAL_ALED_GBR:`
			`ConvertNEON_8<srcAlign, dstAlign, ToGBR>(src, dst);`
			`break;`
			`}`
			`}`

			`/**`
			`* Copies 1 pixel from src to dst, converting from RGB to the specified order.`
			`* @param[in] order the color order to convert to`
			`* @param[in] in the source array`
			`* @param[out] the destination array`
			`* @pre in and out must contain at least 1 pixel (4 bytes).`
			`*/`
			`void Convert1Pixel(HAL_AddressableLEDColorOrder order, const uint8_t* src,`
			`uint8_t* dst) {`
			`uint8_t tmp[4];`
			`std::memcpy(tmp, src, 4); // Load 4 bytes`
			`// convert based on order`
			`switch (order) {`
			`case HAL_ALED_RGB:`
			`ToRGB(tmp);`
			`break;`
			`case HAL_ALED_RBG:`
			`ToRBG(tmp);`
			`break;`
			`case HAL_ALED_BGR:`
			`ToBGR(tmp);`
			`break;`
			`case HAL_ALED_BRG:`
			`ToBRG(tmp);`
			`break;`
			`case HAL_ALED_GBR:`
			`ToGBR(tmp);`
			`break;`
			`case HAL_ALED_GRB:`
			`break; // this shouldn't ever get hit but compiler`
			`// wants this to be exhaustive`
			`}`
			`std::memcpy(dst, tmp, 4); // Store 4 bytes`
			`}`
			`/**`
			`* Copies len pixels from src to dst, converting from GRB (wire order) to order.`
			`* Optimizes based on alignment of input and output arrays specified by srcAlign`
			`* and dstAlign`
			`* @tparam order the color order to convert to`
			`* @tparam srcAlign whether src is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @tparam dstAlign whether dst is aligned to the size of a NEON register (16`
			`* bytes)`
			`* @param[in] src The source array`
			`* @param[out] dst the destination array`
			`* @param[in] len the size (in pixels, len = (size in bytes) / 4)`
			`* @pre src and dst must have at least len*4 capacity in bytes`
			`* @pre if srcAlign is true, src must be 16 byte aligned`
			`* @pre if dstAlign is true, src muts be 16 byte aligned`
			`*/`
			`template <HAL_AddressableLEDColorOrder order, bool srcAlign, bool dstAlign>`
			`void ConvertPixels(const uint8_t* src, uint8_t* dst, size_t len) {`
			`if (len >= 16) {`
			`constexpr size_t A4 =`
			`A * 4; // Stride of 1 16-pixel conversion operation. (4 NEON registers)`
			`size_t size = len * 4;`
			`size_t aligned = Simd::AlignLo(`
			`size, A4); // number of bytes we can copy with whole 16-pixel strides`
			`for (size_t i = 0; i < aligned; i += A4) {`
			`Convert16Pixels<order, srcAlign, dstAlign>(src + i, dst + i);`
			`}`
			`if (aligned < size) {`
			`Convert16Pixels<order, false, false>(`
			`src + size - A4,`
			`dst + size - A4); // copy last 16 pixels, possibly recopying.`
			`}`
			`} else if (len >= 8) {`
			`// If len between 8 and 16, we can do 1 or 2 8-pixel copies`
			`Convert8Pixels<order, srcAlign, dstAlign>(src, dst);`
			`if (len > 8) {`
			`size_t recopyOffset = (len * 4) - (HA * 4);`
			`Convert8Pixels<order, false, false>(`
			`src + recopyOffset,`
			`dst + recopyOffset); // copy last 8 pixels, possibly recopying`
			`}`
			`} else {`
			`// Just copy pixel-by-pixel for <8`
			`for (size_t i = 0; i < len; i += 4) {`
			`Convert1Pixel(order, src + i, dst + i);`
			`}`
			`}`
			`}`

			`/**`
			`* Copies pixelCount pixels from src to dst, converting from RGB to the`
			`* specified order`
			`* @tparam order the color order to convert to`
			`* @param src the source array`
			`* @param dst the destination array`
			`* @param pixelCount the number of pixels to convert and copy`
			`*/`
			`template <HAL_AddressableLEDColorOrder order>`
			`void ConvertPixels(const uint8_t* src, uint8_t* dst, size_t pixelCount) {`
			`if (Aligned(src) && Aligned(dst)) {`
			`ConvertPixels<order, true, true>(src, dst, pixelCount);`
			`} else if (Aligned(src)) {`
			`ConvertPixels<order, true, false>(src, dst, pixelCount);`
			`} else if (Aligned(dst)) {`
			`ConvertPixels<order, false, true>(src, dst, pixelCount);`
			`} else {`
			`ConvertPixels<order, false, false>(src, dst, pixelCount);`
			`}`
			`}`
			`} // namespace hal::detail`