pub fn axpy_f32(a: &mut [f32], alpha: f32, b: &[f32])
Computes a[i] = a[i] + alpha * b[i] for each index i, updating a in place (the AXPY operation).