diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7ced09e7a5f6e9553a025486cdd6fb6b4999a35a --- /dev/null +++ b/Makefile @@ -0,0 +1,33 @@ + +GPU=NVIDIA +GPU=AMD + +OBJ = blas.o + + +ifeq ($(GPU), AMD) + LDFLAGS = -L/opt/rocm/lib -lhipblas -lrocblas -fopenmp + CFLAGS = -g -Wall -O3 -fopenmp -I/opt/rocm/include -I/opt/rocm/hip/include -DROC + CC = hipcc +else ifeq ($(GPU), NVIDIA) + CC = nvcc + LDFLAGS = -lcublas -lm -lgomp + INCLUDE= -I/usr/lib/x86_64-linux-gnu/openmpi/include/ + CFLAGS = ${INCLUDE} --compile -O3 -pg -Xcompiler -fopenmp -DCUDA + CUDAFLAGS = --Werror cross-execution-space-call --Wno-deprecated-gpu-targets +else + unknown_HW: +endif + + +all: ${OBJ} + ${CC} -o blas ${OBJ} ${LDFLAGS} ${CUDAFLAGS} + +%.o: %.c ${HEADER} + ${CC} ${CFLAGS} -c $< + +clean: + rm *.o + +unknown_HW: + @echo "hardware not detected" diff --git a/cuda_NVidia.c b/blas.c similarity index 70% rename from cuda_NVidia.c rename to blas.c index 24eb55fccf96455b3c0cbfe719c0f921942c4719..522545de7485b1bd38f6f965f6a5b415a94bb305 100644 --- a/cuda_NVidia.c +++ b/blas.c @@ -6,7 +6,7 @@ * Version: 1.0 * Created: 27.01.2021 12:45:18 * Revision: none - * Compiler: nvcc + * Compiler: hipc * * Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de * Organization: AEI Hannover @@ -15,18 +15,14 @@ * ===================================================================================== */ +#include "hardware_settings.h" #include <stdio.h> #include <stdlib.h> #include <math.h> -#include <assert.h> -#include <cuda_runtime.h> -#include <cuda.h> -#include <cublas_v2.h> #include <time.h> #include <omp.h> -#define __ASSERT(x) (assert((x)==cudaSuccess)) size_t m = 10000; @@ -35,12 +31,20 @@ size_t k = 10000; static unsigned long x=123456789, y=362436069, z=521288629; +#define __MALLOC(P, size) P = malloc(size); \ + if (P == NULL) \ + {\ + fprintf(stderr, "Allocation if failed at line %d in %s\n", __LINE__, __FILE__); \ + exit(EXIT_FAILURE); \ + }\ + unsigned long xorshf96 ( void ) { + // NOT thread save unsigned long t; x ^= x << 16; x ^= x >> 5; @@ -102,22 +106,22 @@ timer_stop void multiplication ( - cublasHandle_t handle, - const cuComplex *A, - const cuComplex *B, - cuComplex *C + __HANDLE__ handle, + const __COMPLEX8__ *A, + const __COMPLEX8__ *B, + __COMPLEX8__ *C ) { - cublasOperation_t transA = CUBLAS_OP_N; - cublasOperation_t transB = CUBLAS_OP_C; - const cuComplex alpha = {.x = 1.f, .y = 0.f}; - const cuComplex beta = {.x = 0.f, .y = 0.f}; + __BLAS_OPERATION__ transA = __NO_TRANSFORM__; + __BLAS_OPERATION__ transB = __CT_TRANSFORM__; + const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f}; + const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f}; int lda = n; int ldb = n; int ldc = k; - cublasCgemm + __CGMEM__ ( handle, transA, @@ -139,8 +143,8 @@ multiplication void prepare_matrices ( - cuComplex * hA, - cuComplex * hB + __COMPLEX8__ * hA, + __COMPLEX8__ * hB ) { float fact = 1.f/(float)n/(float)x/(float)y/20.f; @@ -170,7 +174,7 @@ prepare_matrices void print_result ( - cuComplex * hC + __COMPLEX8__ * hC ) { printf("-------- %zu %zu\n", m, k); @@ -198,45 +202,27 @@ run_test m = dim; n = dim; k = dim; - struct runtime * timer = malloc(sizeof(*timer)); - - cuComplex *A; - cuComplex *B; - cuComplex *C; - __ASSERT(cudaMalloc((void **)&A, sizeof(*A) * (size_t)(m * n))); - if (A == NULL) - { - fprintf(stderr, "A not allocated\n"); - exit(1); - } - __ASSERT(cudaMalloc((void **)&B, sizeof(*B) * (size_t)(n * k))); - if (B == NULL) - { - fprintf(stderr, "B not allocated\n"); - exit(1); - } - __ASSERT(cudaMalloc((void **)&C, sizeof(*C) * (size_t)(m * k))); + struct runtime * timer; + __MALLOC(timer, sizeof(*timer)); + + __COMPLEX8__ *A; + __COMPLEX8__ *B; + __COMPLEX8__ *C; + __ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n))); + __ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k))); + __ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k))); if (C == NULL) { fprintf(stderr, "C not allocated\n"); exit(1); } - - cuComplex *hA = malloc(sizeof(*hA) * (size_t)(m * n)); - if (hA == NULL) - { - fprintf(stderr, "hA not allocated\n"); - exit(1); - } - cuComplex *hB = malloc(sizeof(*hB) * (size_t)(k * n)); - if (hB == NULL) - { - fprintf(stderr, "hB not allocated\n"); - exit(1); - } - - cuComplex *hC = malloc(sizeof(*hC) * (size_t)(m * k)); + __COMPLEX8__ *hA; + __MALLOC( hA, sizeof(*hA) * (size_t)(m * n)); + __COMPLEX8__ *hB; + __MALLOC( hB, sizeof(*hB) * (size_t)(k * n)); + __COMPLEX8__ *hC; + __MALLOC( hC, sizeof(*hC) * (size_t)(m * k)); if (hC == NULL) { fprintf(stderr, "hC not allocated\n"); @@ -248,14 +234,16 @@ run_test // timer_stop(timer); //timer_start(timer, "Memcopy"); - __ASSERT(cudaMemcpy(A, hA, sizeof(*A) * (size_t)(m * n), cudaMemcpyHostToDevice)); - __ASSERT(cudaMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), cudaMemcpyHostToDevice)); + __ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice))); + __ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice))); // timer_stop(timer); - cudaSetDevice(0); - cublasHandle_t handle; + // cudaSetDevice(0); + __HANDLE__ handle; //timer_start(timer, "Create Handle"); - cublasCreate(&handle); + //if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE; + __CREATE_HANDLE(&handle); + //timer_stop(timer); for (unsigned r = 0; r < rep; r++) @@ -294,16 +282,17 @@ run_test + k * m * sizeof(*C) )/1.e+9); - __ASSERT(cudaMemcpy(hC, C, sizeof(*hC) * (size_t)(k * m), cudaMemcpyDeviceToHost)); + __ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost))); //print_result(hC); // timer_start(timer, "Destroy Handle"); - if(cublasDestroy(handle) != cudaSuccess) return EXIT_FAILURE; + //if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE; + if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE; // timer_stop(timer); - cudaFree(A); - cudaFree(B); - cudaFree(C); + __PREFIX(Free)(A); + __PREFIX(Free)(B); + __PREFIX(Free)(C); free(hA); free(hB); free(hC); @@ -320,12 +309,8 @@ main int min_dim = 1; int max_dim = 14; - float * res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep)); - if (res == NULL) - { - fprintf(stderr, "Couldn't allocate res\n"); - exit(1); - } + float * res; + __MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep)); for (int i = min_dim; i < max_dim; i++) { size_t dim = 1 << i; diff --git a/hardware_settings.h b/hardware_settings.h new file mode 100644 index 0000000000000000000000000000000000000000..b311022009800856d4709c84f001df9663760a3e --- /dev/null +++ b/hardware_settings.h @@ -0,0 +1,52 @@ +/* + * ===================================================================================== + * + * Filename: hardware_settings.h + * + * Description: + * + * Version: 1.0 + * Created: 28.01.2021 16:15:56 + * Revision: none + * Compiler: gcc + * + * Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de + * Organization: AEI Hannover + * Copyright: Copyright (c) 2021, Henning Fehrmann + * + * ===================================================================================== + */ + +#ifdef ROC +#define __HIP_PLATFORM_HCC__ +#include <rocblas.h> +#include <hip/hip_runtime.h> +#include <hip/hip_vector_types.h> +#define __ASSERT(x) (assert((x)==hipSuccess)) +#define __HANDLE__ rocblas_handle +#define __COMPLEX8__ rocblas_float_complex +#define __BLAS_OPERATION__ rocblas_operation +#define __NO_TRANSFORM__ rocblas_operation_none +#define __CT_TRANSFORM__ rocblas_operation_conjugate_transpose +#define __CGMEM__ rocblas_cgemm +#define __PREFIX(c) hip##c +#define __CREATE_HANDLE(h) rocblas_create_handle(h) +#define __DESTROY_HANDLE(h) rocblas_destroy_handle(h) +#endif + +#ifdef CUDA +#include <assert.h> +#include <cuda_runtime.h> +#include <cuda.h> +#include <cublas_v2.h> +#define __ASSERT(x) (assert((x)==cudaSuccess)) +#define __HANDLE__ cublasHandle_t +#define __COMPLEX8__ cuComplex +#define __BLAS_OPERATION__ cublasOperation_t +#define __NO_TRANSFORM__ CUBLAS_OP_N +#define __CT_TRANSFORM__ CUBLAS_OP_C +#define __CGMEM__ cublasCgemm +#define __PREFIX(c) cuda##c +#define __CREATE_HANDLE(h) cublasCreate(h) +#define __DESTROY_HANDLE(h) cublasDestroy(h) +#endif diff --git a/rocmblas_AMD.c b/rocmblas_AMD.c deleted file mode 100644 index f8cc59d6256d322ef5a7fe29c80a811302d83e12..0000000000000000000000000000000000000000 --- a/rocmblas_AMD.c +++ /dev/null @@ -1,361 +0,0 @@ -/* - * ===================================================================================== - * - * Description: BLAS Benchmark - * - * Version: 1.0 - * Created: 27.01.2021 12:45:18 - * Revision: none - * Compiler: hipc - * - * Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de - * Organization: AEI Hannover - * License: GNU General Public License v2 - * - * ===================================================================================== - */ - - -#define __HIP_PLATFORM_HCC__ - -#include <stdio.h> -#include <stdlib.h> -#include <math.h> -#include <rocblas.h> -#include <hip/hip_runtime.h> -#include <hip/hip_vector_types.h> -#include <time.h> -#include <omp.h> - -#define __ASSERT(x) (assert((x)==hipSuccess)) - - -size_t m = 10000; -size_t n = 10000; -size_t k = 10000; - -static unsigned long x=123456789, y=362436069, z=521288629; - -unsigned long -xorshf96 -( - void -) -{ - // NOT thread save - unsigned long t; - x ^= x << 16; - x ^= x >> 5; - x ^= x << 1; - - t = x; - x = y; - y = z; - z = t ^ x ^ y; - return z; -} - -struct runtime -{ - struct timespec start; - struct timespec stop; - char tag[128]; -}; - - -void -timer_start -( - struct runtime * timer, - char tag[128] -) -{ - struct timespec start; - sprintf(timer->tag,"%s", tag); - clock_gettime(CLOCK_REALTIME , &start); - timer->start = start; -// printf("--------> start timer: %s\n", timer->tag); -} - -double -timer_stop -( - struct runtime * timer -) -{ - struct timespec stop; - clock_gettime(CLOCK_REALTIME , &stop); - timer->stop = stop; - double res= (double) - ( - (timer->stop).tv_sec - (timer->start).tv_sec - )*1000. - + - (double) - ( - (timer->stop).tv_nsec - (timer->start).tv_nsec - )/1000000. - ; - // printf("--------> stop timer %s: %g ms\n", timer->tag, res ); - return res; -} - - -void -multiplication -( - rocblas_handle handle, - const rocblas_float_complex *A, - const rocblas_float_complex *B, - rocblas_float_complex *C -) -{ - rocblas_operation transA = rocblas_operation_none; - rocblas_operation transB = rocblas_operation_conjugate_transpose; - const rocblas_float_complex alpha = {.x = 1.f, .y = 0.f}; - const rocblas_float_complex beta = {.x = 0.f, .y = 0.f}; - - rocblas_int lda = n; - rocblas_int ldb = n; - rocblas_int ldc = k; - - rocblas_cgemm - ( - handle, - transA, - transB, - m, - n, - k, - &alpha, - A, - lda, - B, - ldb, - &beta, - C, - ldc - ); -} - -void -prepare_matrices -( - rocblas_float_complex * hA, - rocblas_float_complex * hB -) -{ - float fact = 1.f/(float)n/(float)x/(float)y/20.f; -#pragma omp parallel for - for (size_t i = 0; i < n; i++) - { - for (size_t j = 0; j < m; j++) - { - size_t ind = j + m * i; - hA[ind].x = (float)xorshf96()*fact; - hA[ind].y = (float)xorshf96()*fact; - } - } -#pragma omp parallel for - for (size_t i = 0; i < n; i++) - { - for (size_t j = 0; j < k; j++) - { - size_t ind = j + k * i; - hB[ind].x = (float)xorshf96()*fact; - hB[ind].y = (float)xorshf96()*fact; - } - } - -} - -void -print_result -( - rocblas_float_complex * hC -) -{ - printf("-------- %zu %zu\n", m, k); - for (size_t i = 0; i < m; i++) - { - for (size_t j = 0; j < k; j++) - { - size_t ind = j + k * i; - printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y); - } - printf("\n"); - } - printf("--------\n"); - -} - -int -run_test -( - size_t dim, - unsigned rep, - float * res -) -{ - m = dim; - n = dim; - k = dim; - struct runtime * timer = malloc(sizeof(*timer)); - - rocblas_float_complex *A; - rocblas_float_complex *B; - rocblas_float_complex *C; - __ASSERT(hipMalloc((void **)&A, sizeof(*A) * (size_t)(m * n))); - if (A == NULL) - { - fprintf(stderr, "A not allocated\n"); - exit(1); - } - __ASSERT(hipMalloc((void **)&B, sizeof(*B) * (size_t)(n * k))); - if (B == NULL) - { - fprintf(stderr, "B not allocated\n"); - exit(1); - } - __ASSERT(hipMalloc((void **)&C, sizeof(*C) * (size_t)(m * k))); - if (C == NULL) - { - fprintf(stderr, "C not allocated\n"); - exit(1); - } - - rocblas_float_complex *hA = malloc(sizeof(*hA) * (size_t)(m * n)); - if (hA == NULL) - { - fprintf(stderr, "hA not allocated\n"); - exit(1); - } - rocblas_float_complex *hB = malloc(sizeof(*hB) * (size_t)(k * n)); - if (hB == NULL) - { - fprintf(stderr, "hB not allocated\n"); - exit(1); - } - - rocblas_float_complex *hC = malloc(sizeof(*hC) * (size_t)(m * k)); - if (hC == NULL) - { - fprintf(stderr, "hC not allocated\n"); - exit(1); - } - - // timer_start(timer, "Prepare matrices"); - prepare_matrices(hA, hB); - // timer_stop(timer); - - //timer_start(timer, "Memcopy"); - __ASSERT(hipMemcpy(A, hA, sizeof(*A) * (size_t)(m * n), hipMemcpyHostToDevice)); - __ASSERT(hipMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), hipMemcpyHostToDevice)); - // timer_stop(timer); - - rocblas_handle handle; - //timer_start(timer, "Create Handle"); - if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE; - //timer_stop(timer); - - for (unsigned r = 0; r < rep; r++) - { - float res_r = 0.f; - char mes[128]; - sprintf(mes, "dim %zu run %d a" ,dim, r); - timer_start(timer, mes); - multiplication - ( - handle, - A, - B, - C - ); - res_r += timer_stop(timer); - sprintf(mes, "dim %zu run %d b" ,dim, r); - /* - timer_start(timer, mes); - multiplication - ( - handle, - B, - A, - C - ); - res_r += timer_stop(timer); - */ - res[r] = res_r/1.f; - } - printf("dimensions: %zu %zu %zu\t -- ", n, m , k); - printf("required size: %f GB\n", - ( - m * n * sizeof(*A) - + k * n * sizeof(*B) - + k * m * sizeof(*C) - )/1.e+9); - - __ASSERT(hipMemcpy(hC, C, sizeof(*hC) * (size_t)(k * m), hipMemcpyDeviceToHost)); - //print_result(hC); - - // timer_start(timer, "Destroy Handle"); - if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE; - // timer_stop(timer); - - hipFree(A); - hipFree(B); - hipFree(C); - free(hA); - free(hB); - free(hC); - free(timer); - return 0; -} - -int -main -( -) -{ - int rep = 512; - int min_dim = 1; - int max_dim = 14; - - float * res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep)); - if (res == NULL) - { - fprintf(stderr, "Couldn't allocate res\n"); - exit(1); - } - for (int i = min_dim; i < max_dim; i++) - { - size_t dim = 1 << i; - int ind = (i - min_dim) * rep; - run_test(dim, rep, &res[ind]); - } - // store the results - FILE * f; - char name[128]; - sprintf(name, "runtimes"); - f= fopen(name, "w"); - if (f == NULL) - { - fprintf(stderr, "Couldn't open %s\n", name); - } - for (int i = min_dim; i < max_dim; i++) - { - size_t dim = 1 << i; - fprintf(f, "%zu\t", dim); - } - fprintf(f, "\n"); - for (int r = 0; r < rep; r++) - { - for (int i = min_dim; i < max_dim; i++) - { - size_t pos = (i - min_dim) * rep + r; - fprintf(f, "%1.6f\t", res[pos]); - } - fprintf(f, "\n"); - } - fclose(f); - return 0; -}