// Use the AVX implementation only when the compiler is allowed to emit AVX code.
// GCC/Clang define __AVX__ under -mavx (or an -march that implies it); MSVC
// accepts the intrinsics on x86/x64 regardless of the /arch setting.
#if defined(__AVX__) || \
    (defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)))
#define SUM_OF_16_FLOATS_USE_AVX 1
#else
#define SUM_OF_16_FLOATS_USE_AVX 0
#endif

// Diagnostic output is on by default so the demo prints every reduction stage.
// Build with -DSUM_OF_16_FLOATS_TRACE=0 to get the bare reduction without any
// printing or intermediate stores (no hand-editing of the function required).
#ifndef SUM_OF_16_FLOATS_TRACE
#define SUM_OF_16_FLOATS_TRACE 1
#endif

#if SUM_OF_16_FLOATS_TRACE
// Prints `label` followed by the eight lanes separated by single spaces.
static void sum16TraceLanes(const char* label, const float lanes[8]) {
    std::cout << label << lanes[0];
    for (int i = 1; i < 8; ++i) {
        std::cout << " " << lanes[i];
    }
    std::cout << '\n';
}
#endif

#if !SUM_OF_16_FLOATS_USE_AVX
// Scalar stand-in for _mm256_hadd_ps(a, b): within each 128-bit half (4 floats)
// the result is [a0+a1, a2+a3, b0+b1, b2+b3]. `out` must not alias `a` or `b`.
static void sum16HaddScalar(const float a[8], const float b[8], float out[8]) {
    for (int half = 0; half < 8; half += 4) {
        out[half + 0] = a[half + 0] + a[half + 1];
        out[half + 1] = a[half + 2] + a[half + 3];
        out[half + 2] = b[half + 0] + b[half + 1];
        out[half + 3] = b[half + 2] + b[half + 3];
    }
}
#endif

// Returns the sum of the 16 floats starting at `indata`.
//
// The reduction is three rounds of horizontal adds:
//   round 1: 16 values -> 8 pair sums
//   round 2:  8 values -> 4 quad sums (each duplicated)
//   round 3:  4 values -> 2 sums of eight, one per 128-bit half (lanes 0 and 4)
// and a final scalar add of lanes 0 and 4. Horizontal adds never cross the
// 128-bit boundary, which is why two partial results remain after round 3.
//
// When AVX code generation is not enabled, a scalar emulation performs the
// same additions in the same order, so both paths round identically.
//
// With tracing enabled, each stage, the final result and a plain left-to-right
// reference sum are written to std::cout. In a non-AVX build the
// "Sum intrinsics" line reports the emulated result.
//
// indata: pointer to at least 16 readable floats; no alignment requirement
//         (unaligned loads are used).
float sumOf16Floats(float indata[16]) {
#if SUM_OF_16_FLOATS_USE_AVX
    const __m256 lower = _mm256_loadu_ps(indata);
    const __m256 upper = _mm256_loadu_ps(indata + 8);
    // see https://www.officedaytime.com/simd512e/simdimg/si.php?f=haddps
    const __m256 pairSums = _mm256_hadd_ps(lower, upper);
    const __m256 quadSums = _mm256_hadd_ps(pairSums, pairSums);
    const __m256 octetSums = _mm256_hadd_ps(quadSums, quadSums);

    float octets[8];
    _mm256_storeu_ps(octets, octetSums);
#if SUM_OF_16_FLOATS_TRACE
    // Intermediate stages are only spilled to memory when they will be printed.
    float pairs[8];
    float quads[8];
    _mm256_storeu_ps(pairs, pairSums);
    _mm256_storeu_ps(quads, quadSums);
#endif
#else
    float pairs[8];
    float quads[8];
    float octets[8];
    sum16HaddScalar(indata, indata + 8, pairs);
    sum16HaddScalar(pairs, pairs, quads);
    sum16HaddScalar(quads, quads, octets);
#endif

    // Last two sub results: one per 128-bit half.
    const float sum_v = octets[0] + octets[4];

#if SUM_OF_16_FLOATS_TRACE
    sum16TraceLanes("Out vector : ", pairs);
    sum16TraceLanes("Out vector2: ", quads);
    sum16TraceLanes("Out vector3: ", octets);
    std::cout << "Sum intrinsics = " << sum_v << '\n';

    // Straightforward sequential sum, printed for comparison only.
    float reference = 0;
    for (int i = 0; i < 16; ++i) {
        reference += indata[i];
    }
    std::cout << "Sum pure = " << reference << '\n';
#endif

    return sum_v;
}
outdata[2] << " " << outdata[3] << endl; double sum_d = outdata[0] + outdata[2]; // last two sub results cout << "sum vectorized: " << sum_d << endl; //ymm3 = _mm256_permute2f128_pd(ymm2, ymm2, 0x01); // swap again //ymm4 = _mm256_add_pd(ymm2, ymm3); //_mm256_storeu_pd(outdata, ymm4); //cout << "Out vector: " << outdata[0] << " " << outdata[1] << " " <