"Saxpy" — single-precision a*x + y — is the canonical kernel for measuring fused-multiply-add throughput.
Below are minimal SIMD implementations on x86 (AVX/FMA) and aarch64 (NEON).
#include <immintrin.h>
/*
 * saxpy_avx: out[i] = a * x[i] + y[i] for i in [0, n), using AVX + FMA.
 *
 * Processes 8 floats per iteration with a 256-bit fused multiply-add
 * (single rounding per lane), then finishes the remaining n % 8 elements
 * with a scalar loop — the original vector-only loop silently dropped
 * that tail, leaving out[n - n%8 .. n-1] unwritten.
 *
 * Unaligned loads/stores are used, so no alignment requirement on the
 * pointers. out == y (classic in-place saxpy) is safe because each
 * iteration loads before it stores the same 8-element span.
 */
void saxpy_avx(float a, const float *x, const float *y, float *out, size_t n) {
    __m256 va = _mm256_set1_ps(a);
    size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        __m256 vx = _mm256_loadu_ps(x + i);
        __m256 vy = _mm256_loadu_ps(y + i);
        __m256 vr = _mm256_fmadd_ps(va, vx, vy);
        _mm256_storeu_ps(out + i, vr);
    }
    /* Scalar epilogue for the final n % 8 elements. */
    for (; i < n; i++) {
        out[i] = a * x[i] + y[i];
    }
}
The hot intrinsic is _mm256_fmadd_ps: per-lane a*b + c with a single rounding step,
beating _mm256_add_ps(_mm256_mul_ps(a, b), c) in both throughput and accuracy.
#include <arm_neon.h>
/*
 * saxpy_neon: out[i] = a * x[i] + y[i] for i in [0, n), using NEON.
 *
 * Processes 4 floats per iteration with vfmaq_f32 (fused accumulate:
 * first operand + second * third), then finishes the remaining n % 4
 * elements with a scalar loop — the original vector-only loop silently
 * dropped that tail, leaving out[n - n%4 .. n-1] unwritten.
 *
 * out == y (in-place saxpy) is safe: each iteration loads before it
 * stores the same 4-element span.
 */
void saxpy_neon(float a, const float *x, const float *y, float *out, size_t n) {
    float32x4_t va = vdupq_n_f32(a);
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        float32x4_t vx = vld1q_f32(x + i);
        float32x4_t vy = vld1q_f32(y + i);
        /* vfmaq_f32(acc, b, c) = acc + b*c per lane, i.e. y + x*a. */
        float32x4_t vr = vfmaq_f32(vy, vx, va);
        vst1q_f32(out + i, vr);
    }
    /* Scalar epilogue for the final n % 4 elements. */
    for (; i < n; i++) {
        out[i] = a * x[i] + y[i];
    }
}
NEON's vfmaq_f32(a, b, c) computes a + b * c per lane (note the argument order is
different from Intel's _mm256_fmadd_ps(a, b, c) which is a * b + c).
#include <arm_sve.h>
/*
 * saxpy_sve: out[i] = a * x[i] + y[i] for i in [0, n), using SVE.
 *
 * The vector width is not fixed at compile time: svcntw() reports how many
 * 32-bit lanes the hardware provides, and the svwhilelt predicate disables
 * lanes past the end of the arrays, so the loop tail needs no scalar
 * epilogue — the final partial vector is handled by masking alone.
 */
void saxpy_sve(float a, const float *x, const float *y, float *out, size_t n) {
    const svfloat32_t coeff = svdup_n_f32(a);
    size_t idx = 0;
    while (idx < n) {
        /* Lane l is active iff idx + l < n. */
        svbool_t active = svwhilelt_b32(idx, n);
        svfloat32_t xs = svld1_f32(active, x + idx);
        svfloat32_t ys = svld1_f32(active, y + idx);
        /* Merging MLA: active lanes become ys + xs*coeff; inactive lanes
           keep ys, but the predicated store never writes them anyway. */
        svfloat32_t res = svmla_f32_m(active, ys, xs, coeff);
        svst1_f32(active, out + idx, res);
        idx += svcntw();
    }
}
Notice the use of types svfloat32_t (a hardware-length-dependent vector) and svbool_t
(the predicate that handles the loop tail without needing a scalar epilogue). The
svwhilelt_b32 intrinsic generates a per-lane mask in which lane l is true exactly when i + l < n,
so the final partial vector is processed under predication rather than in a separate tail loop.