pub fn subtract_f32(out: &mut [f32], a: &[f32], b: &[f32])
Subtracts two arrays element-wise, writing the result into `out`: `out[i] = a[i] - b[i]` for each index `i`.