Commit 0626ce84 authored by Henning Fehrmann's avatar Henning Fehrmann Committed by Henning Fehrmann
Browse files

platform agnostic code

parent 21c8a6d9
GPU=NVIDIA
GPU=AMD
OBJ = blas.o
ifeq ($(GPU), AMD)
LDFLAGS = -L/opt/rocm/lib -lhipblas -lrocblas -fopenmp
CFLAGS = -g -Wall -O3 -fopenmp -I/opt/rocm/include -I/opt/rocm/hip/include -DROC
CC = hipcc
else ifeq ($(GPU), NVIDIA)
CC = nvcc
LDFLAGS = -lcublas -lm -lgomp
INCLUDE= -I/usr/lib/x86_64-linux-gnu/openmpi/include/
CFLAGS = ${INCLUDE} --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
CUDAFLAGS = --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
else
unknown_HW:
endif
all: ${OBJ}
${CC} -o blas ${OBJ} ${LDFLAGS} ${CUDAFLAGS}
%.o: %.c ${HEADER}
${CC} ${CFLAGS} -c $<
clean:
rm *.o
unknown_HW:
@echo "hardware not detected"
......@@ -6,7 +6,7 @@
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: nvcc
* Compiler: hipc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
......@@ -15,18 +15,14 @@
* =====================================================================================
*/
#include "hardware_settings.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <time.h>
#include <omp.h>
#define __ASSERT(x) (assert((x)==cudaSuccess))
size_t m = 10000;
......@@ -35,12 +31,20 @@ size_t k = 10000;
static unsigned long x=123456789, y=362436069, z=521288629;
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation if failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
unsigned long
xorshf96
(
void
)
{
// NOT thread save
unsigned long t;
x ^= x << 16;
x ^= x >> 5;
......@@ -102,22 +106,22 @@ timer_stop
void
multiplication
(
cublasHandle_t handle,
const cuComplex *A,
const cuComplex *B,
cuComplex *C
__HANDLE__ handle,
const __COMPLEX8__ *A,
const __COMPLEX8__ *B,
__COMPLEX8__ *C
)
{
cublasOperation_t transA = CUBLAS_OP_N;
cublasOperation_t transB = CUBLAS_OP_C;
const cuComplex alpha = {.x = 1.f, .y = 0.f};
const cuComplex beta = {.x = 0.f, .y = 0.f};
__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
int lda = n;
int ldb = n;
int ldc = k;
cublasCgemm
__CGMEM__
(
handle,
transA,
......@@ -139,8 +143,8 @@ multiplication
void
prepare_matrices
(
cuComplex * hA,
cuComplex * hB
__COMPLEX8__ * hA,
__COMPLEX8__ * hB
)
{
float fact = 1.f/(float)n/(float)x/(float)y/20.f;
......@@ -170,7 +174,7 @@ prepare_matrices
void
print_result
(
cuComplex * hC
__COMPLEX8__ * hC
)
{
printf("-------- %zu %zu\n", m, k);
......@@ -198,45 +202,27 @@ run_test
m = dim;
n = dim;
k = dim;
struct runtime * timer = malloc(sizeof(*timer));
cuComplex *A;
cuComplex *B;
cuComplex *C;
__ASSERT(cudaMalloc((void **)&A, sizeof(*A) * (size_t)(m * n)));
if (A == NULL)
{
fprintf(stderr, "A not allocated\n");
exit(1);
}
__ASSERT(cudaMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)));
if (B == NULL)
{
fprintf(stderr, "B not allocated\n");
exit(1);
}
__ASSERT(cudaMalloc((void **)&C, sizeof(*C) * (size_t)(m * k)));
struct runtime * timer;
__MALLOC(timer, sizeof(*timer));
__COMPLEX8__ *A;
__COMPLEX8__ *B;
__COMPLEX8__ *C;
__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n)));
__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k)));
if (C == NULL)
{
fprintf(stderr, "C not allocated\n");
exit(1);
}
cuComplex *hA = malloc(sizeof(*hA) * (size_t)(m * n));
if (hA == NULL)
{
fprintf(stderr, "hA not allocated\n");
exit(1);
}
cuComplex *hB = malloc(sizeof(*hB) * (size_t)(k * n));
if (hB == NULL)
{
fprintf(stderr, "hB not allocated\n");
exit(1);
}
cuComplex *hC = malloc(sizeof(*hC) * (size_t)(m * k));
__COMPLEX8__ *hA;
__MALLOC( hA, sizeof(*hA) * (size_t)(m * n));
__COMPLEX8__ *hB;
__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
__COMPLEX8__ *hC;
__MALLOC( hC, sizeof(*hC) * (size_t)(m * k));
if (hC == NULL)
{
fprintf(stderr, "hC not allocated\n");
......@@ -248,14 +234,16 @@ run_test
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT(cudaMemcpy(A, hA, sizeof(*A) * (size_t)(m * n), cudaMemcpyHostToDevice));
__ASSERT(cudaMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), cudaMemcpyHostToDevice));
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
// timer_stop(timer);
cudaSetDevice(0);
cublasHandle_t handle;
// cudaSetDevice(0);
__HANDLE__ handle;
//timer_start(timer, "Create Handle");
cublasCreate(&handle);
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE(&handle);
//timer_stop(timer);
for (unsigned r = 0; r < rep; r++)
......@@ -294,16 +282,17 @@ run_test
+ k * m * sizeof(*C)
)/1.e+9);
__ASSERT(cudaMemcpy(hC, C, sizeof(*hC) * (size_t)(k * m), cudaMemcpyDeviceToHost));
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC);
// timer_start(timer, "Destroy Handle");
if(cublasDestroy(handle) != cudaSuccess) return EXIT_FAILURE;
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
// timer_stop(timer);
cudaFree(A);
cudaFree(B);
cudaFree(C);
__PREFIX(Free)(A);
__PREFIX(Free)(B);
__PREFIX(Free)(C);
free(hA);
free(hB);
free(hC);
......@@ -320,12 +309,8 @@ main
int min_dim = 1;
int max_dim = 14;
float * res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
if (res == NULL)
{
fprintf(stderr, "Couldn't allocate res\n");
exit(1);
}
float * res;
__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
for (int i = min_dim; i < max_dim; i++)
{
size_t dim = 1 << i;
......
/*
* =====================================================================================
*
* Filename: hardware_settings.h
*
* Description:
*
* Version: 1.0
* Created: 28.01.2021 16:15:56
* Revision: none
* Compiler: gcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* Copyright: Copyright (c) 2021, Henning Fehrmann
*
* =====================================================================================
*/
#ifdef ROC
#define __HIP_PLATFORM_HCC__
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_vector_types.h>
#define __ASSERT(x) (assert((x)==hipSuccess))
#define __HANDLE__ rocblas_handle
#define __COMPLEX8__ rocblas_float_complex
#define __BLAS_OPERATION__ rocblas_operation
#define __NO_TRANSFORM__ rocblas_operation_none
#define __CT_TRANSFORM__ rocblas_operation_conjugate_transpose
#define __CGMEM__ rocblas_cgemm
#define __PREFIX(c) hip##c
#define __CREATE_HANDLE(h) rocblas_create_handle(h)
#define __DESTROY_HANDLE(h) rocblas_destroy_handle(h)
#endif
#ifdef CUDA
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#define __ASSERT(x) (assert((x)==cudaSuccess))
#define __HANDLE__ cublasHandle_t
#define __COMPLEX8__ cuComplex
#define __BLAS_OPERATION__ cublasOperation_t
#define __NO_TRANSFORM__ CUBLAS_OP_N
#define __CT_TRANSFORM__ CUBLAS_OP_C
#define __CGMEM__ cublasCgemm
#define __PREFIX(c) cuda##c
#define __CREATE_HANDLE(h) cublasCreate(h)
#define __DESTROY_HANDLE(h) cublasDestroy(h)
#endif
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#define __HIP_PLATFORM_HCC__
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_vector_types.h>
#include <time.h>
#include <omp.h>
#define __ASSERT(x) (assert((x)==hipSuccess))
size_t m = 10000;
size_t n = 10000;
size_t k = 10000;
static unsigned long x=123456789, y=362436069, z=521288629;
unsigned long
xorshf96
(
void
)
{
// NOT thread save
unsigned long t;
x ^= x << 16;
x ^= x >> 5;
x ^= x << 1;
t = x;
x = y;
y = z;
z = t ^ x ^ y;
return z;
}
struct runtime
{
struct timespec start;
struct timespec stop;
char tag[128];
};
void
timer_start
(
struct runtime * timer,
char tag[128]
)
{
struct timespec start;
sprintf(timer->tag,"%s", tag);
clock_gettime(CLOCK_REALTIME , &start);
timer->start = start;
// printf("--------> start timer: %s\n", timer->tag);
}
double
timer_stop
(
struct runtime * timer
)
{
struct timespec stop;
clock_gettime(CLOCK_REALTIME , &stop);
timer->stop = stop;
double res= (double)
(
(timer->stop).tv_sec - (timer->start).tv_sec
)*1000.
+
(double)
(
(timer->stop).tv_nsec - (timer->start).tv_nsec
)/1000000.
;
// printf("--------> stop timer %s: %g ms\n", timer->tag, res );
return res;
}
void
multiplication
(
rocblas_handle handle,
const rocblas_float_complex *A,
const rocblas_float_complex *B,
rocblas_float_complex *C
)
{
rocblas_operation transA = rocblas_operation_none;
rocblas_operation transB = rocblas_operation_conjugate_transpose;
const rocblas_float_complex alpha = {.x = 1.f, .y = 0.f};
const rocblas_float_complex beta = {.x = 0.f, .y = 0.f};
rocblas_int lda = n;
rocblas_int ldb = n;
rocblas_int ldc = k;
rocblas_cgemm
(
handle,
transA,
transB,
m,
n,
k,
&alpha,
A,
lda,
B,
ldb,
&beta,
C,
ldc
);
}
void
prepare_matrices
(
rocblas_float_complex * hA,
rocblas_float_complex * hB
)
{
float fact = 1.f/(float)n/(float)x/(float)y/20.f;
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
{
for (size_t j = 0; j < m; j++)
{
size_t ind = j + m * i;
hA[ind].x = (float)xorshf96()*fact;
hA[ind].y = (float)xorshf96()*fact;
}
}
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
{
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
hB[ind].x = (float)xorshf96()*fact;
hB[ind].y = (float)xorshf96()*fact;
}
}
}
void
print_result
(
rocblas_float_complex * hC
)
{
printf("-------- %zu %zu\n", m, k);
for (size_t i = 0; i < m; i++)
{
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
}
printf("\n");
}
printf("--------\n");
}
int
run_test
(
size_t dim,
unsigned rep,
float * res
)
{
m = dim;
n = dim;
k = dim;
struct runtime * timer = malloc(sizeof(*timer));
rocblas_float_complex *A;
rocblas_float_complex *B;
rocblas_float_complex *C;
__ASSERT(hipMalloc((void **)&A, sizeof(*A) * (size_t)(m * n)));
if (A == NULL)
{
fprintf(stderr, "A not allocated\n");
exit(1);
}
__ASSERT(hipMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)));
if (B == NULL)
{
fprintf(stderr, "B not allocated\n");
exit(1);
}
__ASSERT(hipMalloc((void **)&C, sizeof(*C) * (size_t)(m * k)));
if (C == NULL)
{
fprintf(stderr, "C not allocated\n");
exit(1);
}
rocblas_float_complex *hA = malloc(sizeof(*hA) * (size_t)(m * n));
if (hA == NULL)
{
fprintf(stderr, "hA not allocated\n");
exit(1);
}
rocblas_float_complex *hB = malloc(sizeof(*hB) * (size_t)(k * n));
if (hB == NULL)
{
fprintf(stderr, "hB not allocated\n");
exit(1);
}
rocblas_float_complex *hC = malloc(sizeof(*hC) * (size_t)(m * k));
if (hC == NULL)
{
fprintf(stderr, "hC not allocated\n");
exit(1);
}
// timer_start(timer, "Prepare matrices");
prepare_matrices(hA, hB);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT(hipMemcpy(A, hA, sizeof(*A) * (size_t)(m * n), hipMemcpyHostToDevice));
__ASSERT(hipMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), hipMemcpyHostToDevice));
// timer_stop(timer);
rocblas_handle handle;
//timer_start(timer, "Create Handle");
if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
//timer_stop(timer);
for (unsigned r = 0; r < rep; r++)
{
float res_r = 0.f;
char mes[128];
sprintf(mes, "dim %zu run %d a" ,dim, r);
timer_start(timer, mes);
multiplication
(
handle,
A,
B,
C
);
res_r += timer_stop(timer);
sprintf(mes, "dim %zu run %d b" ,dim, r);
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
);
res_r += timer_stop(timer);
*/
res[r] = res_r/1.f;
}
printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
printf("required size: %f GB\n",
(
m * n * sizeof(*A)
+ k * n * sizeof(*B)
+ k * m * sizeof(*C)
)/1.e+9);
__ASSERT(hipMemcpy(hC, C, sizeof(*hC) * (size_t)(k * m), hipMemcpyDeviceToHost));
//print_result(hC);
// timer_start(timer, "Destroy Handle");
if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
// timer_stop(timer);
hipFree(A);
hipFree(B);
hipFree(C);
free(hA);
free(hB);
free(hC);
free(timer);
return 0;
}
int
main
(
)
{
int rep = 512;
int min_dim = 1;
int max_dim = 14;
float * res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
if (res == NULL)
{
fprintf(stderr, "Couldn't allocate res\n");