#include #include #include #include #include #include #include #include #include #include #include // for vector instructions #include // for multithreading #include //openmp #include "versions_o2_nointrinsics.h" #include "versions_od_intrinsics.h" #include "versions_od_nointrinsics.h" #include // for: LARGE_INTEGER, QueryPerformanceFrequency, SYSTEM_INFO #pragma comment(lib, "user32.lib") // for retrieving CPU information using namespace std; using namespace std::chrono; #define N 1024 * 1024 #define NBR_EXPERIMENTS 10 #define NBR_ALGORITHM_VERSIONS 100 // take big enough // the different sequential versions _int64 sequential_version(const int n, float* arr, float* arr2, float* result); _int64 version_parametric_size(int n, float* arr, float* arr2, float* result); _int64 version_withpointers(const int n, float* arr, float* arr2, float* result); _int64 version_loop_unrolling(const int n, float* arr, float* arr2, float* result); _int64 version_c_xFunction_inlineO2(int N1, float* arr, float* arr2, float* result); // parallelized version _int64 version_parallel_hint(int n, float* arr, float* arr2, float* result); _int64 version_openmp(const int n, float* arr, float* arr2, float* result, int nbrThreads); _int64 version_m256(const int n, float* arr, float* arr2, float* result); _int64 version_multithreaded(const int n, float* arr, float* arr2, float* result, const int NBR_THREADS); // utility functions void checkIfResultsAreTheSame(string name, float* arr3, float* arr4, int n); _int64 minimalValue(_int64* values, int NBR); _int64 meanValue(_int64* values, int NBR); _int64 stddev(_int64* values, int NBR); void analyzePerformance(vector names, _int64 runtimes[NBR_ALGORITHM_VERSIONS][NBR_EXPERIMENTS], _int64 totalNbrOfOperations, _int64 totalNbrOfBytes, _int64 ticksPerMilliSecond, _int64 referenceTime, _int64 reference2Time); bool PRINT_DATA = false; // set this to true to print input & output data. Use this to debug int main(int argc, char *argv[]) { _int64 totalNbrOfOperations = N; _int64 totalNbrOfBytes = N * 12; LARGE_INTEGER lpFrequency; bool ok = QueryPerformanceFrequency(&lpFrequency); _int64 ticksPerMilliSecond = lpFrequency.QuadPart; cout << "** Comparison of element-wise product of 2 vectors ** " << sizeof(long)<< endl; cout << "Vector size N = " << N << " Nbr Experiments = " << NBR_EXPERIMENTS << " PRINT_DATA = "<< PRINT_DATA<< " => " << totalNbrOfOperations << " operations and "< names; for (int t = 0; t < NBR_EXPERIMENTS; t++) { // run multiple times because of fluctuations int version = 0; // ********* THE FIRST VERSION - THE REFERENCE ********* runtimes[version++][t] = version_c_Od_NoIntrinsics(N, arr, arr2, arr3); // returns time in microseconds if (t==0) names.push_back( "C Od NoIntrinsics"); // ********* THE SECOND VERSION ********* runtimes[version++][t] = sequential_version(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with optimizations"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_c_Od_NoIntrinsics_nopragma(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C Od Intrinsics no pragma"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_parametric_size(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C parametric size"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_c_ManualLoopUnrolling(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with loop unrolling (Od)"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_c_PragmaUnroll(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with pragma unroll (Od)"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_c_PragmaUnrollInner(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with pragma unroll inner (Od)"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_c_xFunction(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with function (Od)"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_c_xFunction_inline(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with inline function (Od)"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_c_xFunction_inlineO2(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with inline function (O2)"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_loop_unrolling(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with loop unrolling (O2)"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_c_O2_NoIntrinsics(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C O2 NoIntrinsics"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_parallel_hint(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with parallel hint"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_openmp(N, arr, arr2, arr4, 4); // returns time in microseconds if (t == 0) { names.push_back("OpenMP 4 threads"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* YET ANOTHER VERSION ********* runtimes[version++][t] = version_m256(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with m256 vectors"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* ANOTHER VERSION ********* runtimes[version++][t] = version_m256_Od(N, arr, arr2, arr4); // returns time in microseconds if (t == 0) { names.push_back("C with m256 and Od"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* YET ANOTHER VERSION ********* runtimes[version++][t] = version_multithreaded(N, arr, arr2, arr4, 2); if (t == 0) { names.push_back("C with 2 threads"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* YET ANOTHER VERSION ********* runtimes[version++][t] = version_multithreaded(N, arr, arr2, arr4, 4); if (t == 0) { names.push_back("C with 4 threads"); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } // ********* YET ANOTHER VERSION ********* for (int nbr_threads = 2;nbr_threads < 20; nbr_threads++) { runtimes[version++][t] = version_multithreaded(N, arr, arr2, arr4, nbr_threads); // returns time in microseconds if (t == 0) { std::stringstream stream; stream << "C with " << nbr_threads << " threads"; names.push_back(stream.str()); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } } /*if (nbrCores > 4) { runtimes[version++][t] = version_multithreaded(N, arr, arr2, arr4, nbrCores); if (t == 0) { std::stringstream stream; stream << "C with " << nbrCores << " threads"; names.push_back(stream.str()); checkIfResultsAreTheSame(names.back(), arr3, arr4, N); } }*/ // ********* YET ANOTHER VERSION ********* // ********* ADD VERSIONS HERE ********* // ********* END OF ALL VERSIONS ********* } _int64 referenceTime = minimalValue(runtimes[0], NBR_EXPERIMENTS);// FIRST algorithm (non-optimized) _int64 reference2Time = minimalValue(runtimes[1], NBR_EXPERIMENTS);// SECOND algorithm (optimized) analyzePerformance(names, runtimes, totalNbrOfOperations, totalNbrOfBytes, ticksPerMilliSecond, referenceTime, reference2Time); // Deallocate memory delete[] arr; delete[] arr2; delete[] arr3; delete[] arr4; cout << endl<<"Press ENTER to close window..."; char c = cin.get(); } // ==== ALL VERSIONS ==== _int64 sequential_version(int n, float* arr, float* arr2, float* result) { time_point now = system_clock::now(); for (int i = 0; i < n; ++i) { result[i] = arr[i] * arr2[i]; } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); return us.count(); } void multiplication_function(int t, float* arr, float* arr2, float* result, int n); _int64 version_parametric_size(int n, float* arr, float* arr2, float* result) { time_point now = system_clock::now(); // the same function as the multithreaded version multiplication_function(0, arr, arr2, result, n); //for (int i = 0; i < n; ++i) { // result[i] = arr[i] * arr2[i]; //} time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); return us.count(); } _int64 version_withpointers(const int n, float* arr, float* arr2, float* result) { time_point now = system_clock::now(); #pragma loop(no_vector) for(float* parr = arr, *parr2 = arr2, *presult = result; presult < result + N; ++presult, ++parr, ++parr2 ){ *presult = *parr * *parr2; } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); return us.count(); } _int64 version_loop_unrolling(const int n, float* arr, float* arr2, float* result) { time_point now = system_clock::now(); for (int i = 0; i < N ; i+=4) { result[i ] = arr[i ] * arr2[i ]; result[i + 1] = arr[i + 1] * arr2[i + 1]; result[i + 2] = arr[i + 2] * arr2[i + 2]; result[i + 3] = arr[i + 3] * arr2[i + 3]; } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); return us.count(); } _int64 version_parallel_hint(const int n, float* arr, float* arr2, float* result) { time_point now = system_clock::now(); #pragma loop(ivdep) // indicates that the iterations of the loop are independent. This statement seems to be necessary... #pragma loop(hint_parallel(4)) for (int i = 0; i < N; ++i) { result[i] = arr[i] * arr2[i]; } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); return us.count(); } _int64 version_openmp(const int n, float* arr, float* arr2, float* result, int nbrThreads) { time_point now = system_clock::now(); int nThreads = 4, i=0; #pragma omp parallel num_threads(nbrThreads) default(none) private(i) shared(result, arr, arr2, nThreads) { #pragma omp master nThreads = omp_get_num_threads(); #pragma omp for for (i = 0; i < N; ++i) { result[i] = arr[i] * arr2[i]; } } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); //if (nbrThreads != nThreads) cout << "OpenMP used " << nThreads << " threads instead of the expected " << nbrThreads<< " threads..." < now = system_clock::now(); __m256i mask = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1); for (int i = 0; i < N; i += 8) { __m256 vec1 = _mm256_load_ps(&arr[i]); __m256 vec2 = _mm256_load_ps(&arr2[i]); __m256 vec_result = _mm256_mul_ps(vec1, vec2); _mm256_maskstore_ps(&result[i], mask, vec_result); } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); return us.count(); } void multiplication_function(int t, float* arr, float* arr2, float* result, int n) { for (int i = 0; i < n; ++i) { result[i] = arr[i] * arr2[i]; } } _int64 version_multithreaded(const int n, float* arr, float* arr2, float* result, const int NBR_THREADS) { using namespace std::chrono; time_point now = system_clock::now(); const int ELEMENTS_PER_THREAD = N / NBR_THREADS; vector threads; // vector of pointers to threads // *** STARTING THE THREADS for (int t = 0; t < NBR_THREADS; t++) threads.push_back(new thread(sequential_version, ELEMENTS_PER_THREAD, arr + t * ELEMENTS_PER_THREAD, arr2 + t * ELEMENTS_PER_THREAD, result + t * ELEMENTS_PER_THREAD)); // pass the function to be executed and all the necessary parameters // do the remainder of the array in the main thread // do the remainder of the array int last_idx = NBR_THREADS * ELEMENTS_PER_THREAD; if (last_idx < n) sequential_version(n - last_idx, arr + last_idx, arr2 + last_idx, result + last_idx); // *** waiting for all threads to finish for (int t = 0; t < threads.size(); t++) { threads[t]->join(); delete threads[t]; } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); return us.count(); } inline float product3(float x1, float x2) { return x1 * x2; } _int64 version_c_xFunction_inlineO2(int N1, float* arr, float* arr2, float* result) { time_point now = system_clock::now(); #pragma loop(no_vector) for (int i = 0; i < N1; ++i) { result[i] = product3(arr[i], arr2[i]); } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); return us.count(); } // *** PERFORMANCE ANALYSIS *** void print_table_title() { cout << " ***** Version Name ***** | min(us) | mean(us) | stddev | GOp/s | GB/s | CPI | speedup |speedup opt |"; cout << endl; } void print_row(string name, _int64 time, _int64 time_mean, _int64 time_stddev, _int64 nbrOperations, _int64 nbrBytes, _int64 ticksPerMilliSecond, _int64 referenceTime, _int64 reference2Time) { const char separator = ' '; const int NAME_WIDTH = 32, NUM_WIDTH = 9, PRECISION = 2; if (name.length() > NAME_WIDTH) name = name.substr(0, NAME_WIDTH); cout << left << setw(NAME_WIDTH) << setfill(separator) << name << " | "; cout << right << setw(NUM_WIDTH) << setfill(separator) << time << " | "; cout << right << setw(NUM_WIDTH) << setfill(separator) << time_mean << " | "; cout << right << setw(NUM_WIDTH-4) << setfill(separator) << fixed << setprecision(1) << (time_stddev * 100.0 / time_mean) << "% | "; cout << right << setw(NUM_WIDTH) << setfill(separator) << fixed << setprecision(PRECISION) << (float)nbrOperations / time / 1000 << " | "; cout << right << setw(NUM_WIDTH) << setfill(separator) << fixed << setprecision(PRECISION) << (float)nbrBytes / time / 1000 << " | "; float total_nbr_cycles = (float)(time * (ticksPerMilliSecond / 1000)); cout << right << setw(NUM_WIDTH) << setfill(separator) << fixed << setprecision(PRECISION) << ((float)total_nbr_cycles / nbrOperations) << " | "; cout << right << setw(NUM_WIDTH) << setfill(separator) << fixed << setprecision(PRECISION) << (float)referenceTime / time << " | "; cout << right << setw(NUM_WIDTH+1) << setfill(separator) << fixed << setprecision(PRECISION) << (float)reference2Time / time << " | "; cout << endl; } void analyzePerformance(vector names, _int64 runtimes[NBR_ALGORITHM_VERSIONS][NBR_EXPERIMENTS], _int64 totalNbrOfOperations, _int64 totalNbrOfBytes, _int64 ticksPerMilliSecond, _int64 referenceTime, _int64 reference2Time) { print_table_title(); for (int v = 0; v < names.size(); v++) { _int64 time = minimalValue(runtimes[v], NBR_EXPERIMENTS); _int64 time_mean = meanValue(runtimes[v], NBR_EXPERIMENTS); _int64 time_stddev = stddev(runtimes[v], NBR_EXPERIMENTS); print_row(names[v], time, time_mean, time_stddev, totalNbrOfOperations, totalNbrOfBytes, ticksPerMilliSecond, referenceTime, reference2Time); } } // *** utility functions *** // we compare the output of the 2 versions and expect no significant differences void checkIfResultsAreTheSame(string name, float* arr3, float* arr4, int n) { int nbr_diff = 0; for (int i = 0; i < n; i++) { float rel_diff = arr3[i] == arr4[i] ? 0 : (arr3[i] - arr4[i]) / (arr3[i] == 0 ? arr4[i] : arr3[i]); if ( rel_diff > 0.0001) { nbr_diff++; if (PRINT_DATA && i < 10) cout << arr3[i] << "<>" << arr4[i] << " ("<< rel_diff<<"), "; } else { if (PRINT_DATA && i < 10) cout << arr3[i] << "==" << arr4[i] << " (" << rel_diff << "), "; } arr4[i] = 0; // we reset arr4 for further usage } if (PRINT_DATA) cout << endl; if (nbr_diff > 0) cout << "Problem with algorithm "<