/* Example program that shows how to create a multithreaded program in which each thread will treat a part of an array. */ #include #include #include #include #include // cout, ... #include //openmp using namespace std; using namespace std::chrono; #define USE_SIMPLE_OPERATION 0 // element_wise_product versus random generator const int MAX_NBR_THREADS = 8; #define ARRAY_SIZE (1024*2014) #define NBR_ITERATIONS 100 void multiplication_function(int t, float* arr, float* arr2, float* result, int n); long long sequential_version(int n, float* arr, float* arr2, float* result); long long sequential_version_with_function(int n, float* arr, float* arr2, float* result); long long version_multithreaded(const int n, float* arr, float* arr2, float* result, const int NBR_THREADS); long long version_openmp_simple(const int n, float* arr, float* arr2, float* result, int nbrThreads); long long version_openmp_complex(const int n, float* arr, float* arr2, float* result, int nbrThreads); long long version_openmp_function(const int n, float* arr, float* arr2, float* result, int nbrThreads); void checkIfResultsAreTheSame(string name, float* arr3, float* arr4, int n, bool PRINT_DATA); int main() { cout << "Considering arrays of size " << ARRAY_SIZE << endl; #if USE_SIMPLE_OPERATION const int NBR_OPERATIONS_PER_ELEMENT = 1; cout << "With simple operation (" << NBR_OPERATIONS_PER_ELEMENT <<" operation per element)." endl; #else const int NBR_OPERATIONS_PER_ELEMENT = NBR_ITERATIONS * 3; cout << "With complex operation (" << NBR_OPERATIONS_PER_ELEMENT << " operation per element)." << endl; #endif const int NBR_OPERATIONS = ARRAY_SIZE * NBR_OPERATIONS_PER_ELEMENT; // allocation and initialization srand(time(NULL)); // random initial seed float* arr = new float[ARRAY_SIZE], *arr2 = new float[ARRAY_SIZE], *result_seq = new float[ARRAY_SIZE], *result_par = new float[ARRAY_SIZE], *arr4 = new float[ARRAY_SIZE], *arr5 = new float[ARRAY_SIZE], *arr6 = new float[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { arr[i] = rand(); arr2[i] = rand(); result_seq[i] = 0; // actually, initialization is not necessary result_par[i] = 0; arr4[i] = rand(); arr5[i] = rand(); arr6[i] = 0; } // flush caches long long run_time_seq = sequential_version(ARRAY_SIZE, arr4, arr5, arr6); cout << "Runtime for sequential version: " << run_time_seq << "us " << (run_time_seq * 1000 / NBR_OPERATIONS) << "ns/op" << endl; run_time_seq = sequential_version_with_function(ARRAY_SIZE, arr, arr2, result_seq); cout << "Runtime for sequential version (with function): " << run_time_seq << "us " <<(run_time_seq * 1000 / NBR_OPERATIONS) << "ns/op" << endl; // warm up version_multithreaded(ARRAY_SIZE, arr4, arr5, arr6, MAX_NBR_THREADS); for (int nbr_threads = 1; nbr_threads <= MAX_NBR_THREADS; nbr_threads++) { version_multithreaded(ARRAY_SIZE, arr4, arr5, arr6, MAX_NBR_THREADS); // clear the caches long long run_time = version_multithreaded(ARRAY_SIZE, arr, arr2, result_par, nbr_threads); checkIfResultsAreTheSame("Multithreaded version", result_seq, result_par, ARRAY_SIZE, false); cout << "Runtime for " << nbr_threads << " threads: " << run_time << "us " << (run_time * 1000 / NBR_OPERATIONS) << "ns/op - Speedup = "<< ((double)run_time_seq/ run_time) < start = system_clock::now(); multiplication_function(0, arr, arr2, result, n); time_point end = system_clock::now(); microseconds us = duration_cast(end - start); return us.count(); } long long sequential_version_with_function(int n, float* arr, float* arr2, float* result) { time_point start = system_clock::now(); multiplication_function_with_function(0, arr, arr2, result, n); time_point end = system_clock::now(); microseconds us = duration_cast(end - start); return us.count(); } long long version_multithreaded(const int n, float* arr, float* arr2, float* result, const int NBR_THREADS) { time_point start = system_clock::now(); const int ELEMENTS_PER_THREAD = n / NBR_THREADS; vector threads; // vector of pointers to threads // *** STARTING THE THREADS for (int t = 0; t < NBR_THREADS; t++) threads.push_back(new thread(multiplication_function_with_function, t, arr + t * ELEMENTS_PER_THREAD, arr2 + t * ELEMENTS_PER_THREAD, result + t * ELEMENTS_PER_THREAD, ELEMENTS_PER_THREAD)); // pass the function to be executed and all the necessary parameters // do the remainder of the array in the main thread // do the remainder of the array int last_idx = NBR_THREADS * ELEMENTS_PER_THREAD; if (last_idx < n) multiplication_function(0, arr + last_idx, arr2 + last_idx, result + last_idx, n - last_idx); // *** waiting for all threads to finish for (int t = 0; t < threads.size(); t++) { threads[t]->join(); delete threads[t]; } time_point end = system_clock::now(); microseconds us = duration_cast(end - start); return us.count(); } long long version_openmp_simple(const int n, float* arr, float* arr2, float* result, int nbrThreads) { time_point now = system_clock::now(); int nThreads, i = 0; #pragma omp parallel num_threads(nbrThreads) default(none) private(i) shared(result, arr, arr2, nThreads) { #pragma omp master nThreads = omp_get_num_threads(); #pragma omp for for (i = 0; i < ARRAY_SIZE; ++i) { result[i] = arr[i] * arr2[i]; } } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); if (nThreads != nbrThreads) cout << "OpenMP used " << nThreads << " threads (expected " << nbrThreads << " threads) ..." << endl; return us.count(); } long long version_openmp_complex(const int n, float* arr, float* arr2, float* result, int nbrThreads) { time_point now = system_clock::now(); int nThreads, i = 0; #define A 23 #define B 4203 #define C 4748 #pragma omp parallel num_threads(nbrThreads) default(none) private(i, j) shared(result, arr, arr2, nThreads) { #pragma omp master nThreads = omp_get_num_threads(); #pragma omp for for (i = 0; i < ARRAY_SIZE; ++i) { int random = (int)arr[i]; // start value (seed) for (int j = 0; j < NBR_ITERATIONS; j++) { random = (A * random + B) % C; } result[i] = random; } } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); if (nThreads != nbrThreads) cout << "OpenMP used " << nThreads << " threads (expected " << nbrThreads << " threads) ..." << endl; return us.count(); } long long version_openmp_function(const int n, float* arr, float* arr2, float* result, int nbrThreads) { time_point now = system_clock::now(); int nThreads, i = 0; #pragma omp parallel num_threads(nbrThreads) default(none) private(i) shared(result, arr, arr2, nThreads) { #pragma omp master nThreads = omp_get_num_threads(); #pragma omp for for (i = 0; i < ARRAY_SIZE; ++i) { result[i] = elementwise_function(arr[i], arr2[i]); //result[i] = arr[i] * arr2[i]; } } time_point epoch = system_clock::now(); microseconds us = duration_cast(epoch - now); if (nThreads != nbrThreads) cout << "OpenMP used " << nThreads << " threads (expected " << nbrThreads << " threads) ..." << endl; return us.count(); } void checkIfResultsAreTheSame(string name, float* arr3, float* arr4, int n, bool PRINT_DATA) { int nbr_diff = 0; for (int i = 0; i < n; i++) { float rel_diff = arr3[i] == arr4[i] ? 0 : (arr3[i] - arr4[i]) / (arr3[i] == 0 ? arr4[i] : arr3[i]); if (rel_diff > 0.0001) { nbr_diff++; if (PRINT_DATA && i < 10) cout << arr3[i] << "<>" << arr4[i] << " (" << rel_diff << "), "; } else { if (PRINT_DATA && i < 10) cout << arr3[i] << "==" << arr4[i] << " (" << rel_diff << "), "; } arr4[i] = 0; // we reset arr4 for further usage } if (PRINT_DATA) cout << endl; if (nbr_diff > 0) cout << "Problem with algorithm : " << nbr_diff << " differences with Original Algorithm." << endl; } // " << name << "