Performance decreases as the number of threads increases in pthreads
0 votes / April 15, 2020
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>
#include <curand_kernel.h>
#include <pthread.h>

#define TRIALS_PER_THREAD 2048
#define BLOCKS 256
#define THREADS 256
#define PI 3.1415926535  // known value of pi

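// GPU kernel: each CUDA thread produces one independent estimate of pi
// from TRIALS_PER_THREAD random points, using its own curand state.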
__global__ void gpu_monte_carlo(float *estimate, curandState *states) {
    unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int points_in_circle = 0;
    float x, y;

    curand_init(1234, tid, 0, &states[tid]);  //    Initialize CURAND


    for(int i = 0; i < TRIALS_PER_THREAD; i++) {
        x = curand_uniform (&states[tid]);
        y = curand_uniform (&states[tid]);
        points_in_circle += (x*x + y*y <= 1.0f); // count if point (x,y) is in the circle
    }
    estimate[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi
}

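// Single-threaded CPU reference using rand().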
__host__ float host_monte_carlo(long trials) {
    float x, y;
    long points_in_circle=0;
    for(long i = 0; i < trials; i++) {
        x = rand() / (float) RAND_MAX;
        y = rand() / (float) RAND_MAX;
        points_in_circle += (x*x + y*y <= 1.0f);
    }
    return 4.0f * points_in_circle / trials;
}

struct threadInfo
{
    volatile long int *num_hits;  // shared hit counter, protected by *mutex
    int num_threads;              // total number of worker threads
    pthread_mutex_t *mutex;       // guards updates to *num_hits
};

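// Pthread worker: runs its share of the trials with a thread-local rand_r()
// stream, then adds its hit count to the shared total under the mutex.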
__host__ void* hit(void* arg)
{
    struct threadInfo* threadInfoStruct;
    threadInfoStruct = (struct threadInfo*) arg;
    long trials = BLOCKS * THREADS * TRIALS_PER_THREAD;                  
    long end = (trials /(threadInfoStruct->num_threads));    
    long my_hits=0;
    float   x, y;
    long i;
    unsigned int myseed = time(NULL) ^ pthread_self();
    for (i=0; i < end; i++) {
        x = ((float) rand_r(&myseed))/ RAND_MAX;
        y = ((float) rand_r(&myseed))/ RAND_MAX;
        if (x*x + y*y <= 1.0f) 
            my_hits++;
    }
    pthread_mutex_lock (threadInfoStruct->mutex);
    *(threadInfoStruct->num_hits) += my_hits;
    pthread_mutex_unlock (threadInfoStruct->mutex);
    pthread_exit(NULL); 
}

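// Spawns num_threads workers, joins them, and turns the shared hit count
// into a pi estimate.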
__host__ float pthread_monte_carlo(long trials, int num_threads) {
        volatile long numHits = 0;
        pthread_mutex_t sharedLock;
        pthread_t ptr_tid[num_threads];
        struct threadInfo threadInfoPtr[num_threads];

        pthread_mutex_init(&sharedLock, NULL);
        int i;
        for (i = 0; i < num_threads; i++) { 
            threadInfoPtr[i].num_hits = &numHits;
            threadInfoPtr[i].mutex = &sharedLock;
            threadInfoPtr[i].num_threads = num_threads;
            pthread_create(&ptr_tid[i], NULL, hit, &threadInfoPtr[i]); 
        }                                                 
        for (i = 0; i < num_threads; i++)  {                  
            pthread_join(ptr_tid[i], NULL);                   
        }
        pthread_mutex_destroy(&sharedLock);
        return 4.0 * numHits / trials;
}

int main (int argc, char *argv[]) {
    clock_t start, stop;
    float* host = (float*)calloc(BLOCKS * THREADS, sizeof(float));
    float *dev;
    curandState *devStates;

    printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD,
BLOCKS, THREADS);

    start = clock();

    cudaMalloc((void **) &dev, BLOCKS * THREADS * sizeof(float)); // allocate device mem. for counts

    cudaMalloc( (void **)&devStates, THREADS * BLOCKS * sizeof(curandState) );

    gpu_monte_carlo<<<BLOCKS, THREADS>>>(dev, devStates);

    cudaMemcpy(host, dev, BLOCKS * THREADS * sizeof(float), cudaMemcpyDeviceToHost); // return results

    float pi_gpu=0.0;
    for(int i = 0; i < BLOCKS * THREADS; i++) {
        pi_gpu += host[i];
    }

    pi_gpu /= (BLOCKS * THREADS);

    stop = clock();

    printf("GPU pi calculated in %f s.\n", (stop-start)/(float)CLOCKS_PER_SEC);

    start = clock();
    float pi_cpu = host_monte_carlo(BLOCKS * THREADS * TRIALS_PER_THREAD);
    stop = clock();
    printf("CPU pi calculated in %f s.\n", (stop-start)/(float)CLOCKS_PER_SEC);

    if (argc < 2) {
        fprintf(stderr, "usage: %s num_threads\n", argv[0]);
        return 1;
    }
    int num_threads = atoi(argv[1]);
    start = clock();
    float pi_pthread = pthread_monte_carlo(BLOCKS * THREADS * TRIALS_PER_THREAD, num_threads);
    stop = clock();
    printf("Pthread pi calculated in %f s.\n", (stop-start)/(float)CLOCKS_PER_SEC);

    printf("CUDA estimate of PI = %f [error of %f]\n", pi_gpu, pi_gpu - PI);
    printf("CPU estimate of PI = %f [error of %f]\n", pi_cpu, pi_cpu - PI);
    printf("Pthread estimate of PI = %f [error of %f] when num_threads = %d\n", pi_pthread, pi_pthread - PI, num_threads);
    pthread_exit(NULL);

    cudaFree(dev);
    cudaFree(devStates);
    free(host);
    return 0;
}

This is my CUDA program. I added pthreads here to compare the results. I could not observe any performance improvement when increasing the number of threads. Note: the same code works fine as a plain pthread program without CUDA. Also, my program does not exit; I have to force it to quit. Why is that? Please help me.
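For reference, a minimal sketch of how I could time the pthread part with wall-clock time instead (an assumption on my part, using POSIX clock_gettime), since as far as I understand clock() on Linux reports CPU time summed over all threads rather than elapsed time:

#include <time.h>

// Wall-clock timer sketch (assumes POSIX clock_gettime with CLOCK_MONOTONIC).
static double wall_seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec / 1e9;
}

// Possible usage around the pthread run:
//   double t0 = wall_seconds();
//   float pi_pthread = pthread_monte_carlo(BLOCKS * THREADS * TRIALS_PER_THREAD, num_threads);
//   double t1 = wall_seconds();
//   printf("Pthread pi calculated in %f s (wall clock).\n", t1 - t0);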

...