Skip to content
Snippets Groups Projects
Commit 0626ce84 authored by Henning Fehrmann's avatar Henning Fehrmann Committed by Henning Fehrmann
Browse files

platform agnostic code

parent 21c8a6d9
No related branches found
No related tags found
No related merge requests found
Makefile 0 → 100644
GPU=NVIDIA
GPU=AMD
OBJ = blas.o
ifeq ($(GPU), AMD)
LDFLAGS = -L/opt/rocm/lib -lhipblas -lrocblas -fopenmp
CFLAGS = -g -Wall -O3 -fopenmp -I/opt/rocm/include -I/opt/rocm/hip/include -DROC
CC = hipcc
else ifeq ($(GPU), NVIDIA)
CC = nvcc
LDFLAGS = -lcublas -lm -lgomp
INCLUDE= -I/usr/lib/x86_64-linux-gnu/openmpi/include/
CFLAGS = ${INCLUDE} --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
CUDAFLAGS = --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
else
unknown_HW:
endif
all: ${OBJ}
${CC} -o blas ${OBJ} ${LDFLAGS} ${CUDAFLAGS}
%.o: %.c ${HEADER}
${CC} ${CFLAGS} -c $<
clean:
rm *.o
unknown_HW:
@echo "hardware not detected"
......@@ -6,7 +6,7 @@
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: nvcc
* Compiler: hipc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
......@@ -15,18 +15,14 @@
* =====================================================================================
*/
#include "hardware_settings.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <time.h>
#include <omp.h>
#define __ASSERT(x) (assert((x)==cudaSuccess))
size_t m = 10000;
......@@ -35,12 +31,20 @@ size_t k = 10000;
static unsigned long x=123456789, y=362436069, z=521288629;
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation if failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
unsigned long
xorshf96
(
void
)
{
// NOT thread save
unsigned long t;
x ^= x << 16;
x ^= x >> 5;
......@@ -102,22 +106,22 @@ timer_stop
void
multiplication
(
cublasHandle_t handle,
const cuComplex *A,
const cuComplex *B,
cuComplex *C
__HANDLE__ handle,
const __COMPLEX8__ *A,
const __COMPLEX8__ *B,
__COMPLEX8__ *C
)
{
cublasOperation_t transA = CUBLAS_OP_N;
cublasOperation_t transB = CUBLAS_OP_C;
const cuComplex alpha = {.x = 1.f, .y = 0.f};
const cuComplex beta = {.x = 0.f, .y = 0.f};
__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
int lda = n;
int ldb = n;
int ldc = k;
cublasCgemm
__CGMEM__
(
handle,
transA,
......@@ -139,8 +143,8 @@ multiplication
void
prepare_matrices
(
cuComplex * hA,
cuComplex * hB
__COMPLEX8__ * hA,
__COMPLEX8__ * hB
)
{
float fact = 1.f/(float)n/(float)x/(float)y/20.f;
......@@ -170,7 +174,7 @@ prepare_matrices
void
print_result
(
cuComplex * hC
__COMPLEX8__ * hC
)
{
printf("-------- %zu %zu\n", m, k);
......@@ -198,45 +202,27 @@ run_test
m = dim;
n = dim;
k = dim;
struct runtime * timer = malloc(sizeof(*timer));
cuComplex *A;
cuComplex *B;
cuComplex *C;
__ASSERT(cudaMalloc((void **)&A, sizeof(*A) * (size_t)(m * n)));
if (A == NULL)
{
fprintf(stderr, "A not allocated\n");
exit(1);
}
__ASSERT(cudaMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)));
if (B == NULL)
{
fprintf(stderr, "B not allocated\n");
exit(1);
}
__ASSERT(cudaMalloc((void **)&C, sizeof(*C) * (size_t)(m * k)));
struct runtime * timer;
__MALLOC(timer, sizeof(*timer));
__COMPLEX8__ *A;
__COMPLEX8__ *B;
__COMPLEX8__ *C;
__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n)));
__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k)));
if (C == NULL)
{
fprintf(stderr, "C not allocated\n");
exit(1);
}
cuComplex *hA = malloc(sizeof(*hA) * (size_t)(m * n));
if (hA == NULL)
{
fprintf(stderr, "hA not allocated\n");
exit(1);
}
cuComplex *hB = malloc(sizeof(*hB) * (size_t)(k * n));
if (hB == NULL)
{
fprintf(stderr, "hB not allocated\n");
exit(1);
}
cuComplex *hC = malloc(sizeof(*hC) * (size_t)(m * k));
__COMPLEX8__ *hA;
__MALLOC( hA, sizeof(*hA) * (size_t)(m * n));
__COMPLEX8__ *hB;
__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
__COMPLEX8__ *hC;
__MALLOC( hC, sizeof(*hC) * (size_t)(m * k));
if (hC == NULL)
{
fprintf(stderr, "hC not allocated\n");
......@@ -248,14 +234,16 @@ run_test
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT(cudaMemcpy(A, hA, sizeof(*A) * (size_t)(m * n), cudaMemcpyHostToDevice));
__ASSERT(cudaMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), cudaMemcpyHostToDevice));
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
// timer_stop(timer);
cudaSetDevice(0);
cublasHandle_t handle;
// cudaSetDevice(0);
__HANDLE__ handle;
//timer_start(timer, "Create Handle");
cublasCreate(&handle);
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE(&handle);
//timer_stop(timer);
for (unsigned r = 0; r < rep; r++)
......@@ -294,16 +282,17 @@ run_test
+ k * m * sizeof(*C)
)/1.e+9);
__ASSERT(cudaMemcpy(hC, C, sizeof(*hC) * (size_t)(k * m), cudaMemcpyDeviceToHost));
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC);
// timer_start(timer, "Destroy Handle");
if(cublasDestroy(handle) != cudaSuccess) return EXIT_FAILURE;
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
// timer_stop(timer);
cudaFree(A);
cudaFree(B);
cudaFree(C);
__PREFIX(Free)(A);
__PREFIX(Free)(B);
__PREFIX(Free)(C);
free(hA);
free(hB);
free(hC);
......@@ -320,12 +309,8 @@ main
int min_dim = 1;
int max_dim = 14;
float * res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
if (res == NULL)
{
fprintf(stderr, "Couldn't allocate res\n");
exit(1);
}
float * res;
__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
for (int i = min_dim; i < max_dim; i++)
{
size_t dim = 1 << i;
......
/*
* =====================================================================================
*
* Filename: hardware_settings.h
*
* Description:
*
* Version: 1.0
* Created: 28.01.2021 16:15:56
* Revision: none
* Compiler: gcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* Copyright: Copyright (c) 2021, Henning Fehrmann
*
* =====================================================================================
*/
#ifdef ROC
#define __HIP_PLATFORM_HCC__
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_vector_types.h>
#define __ASSERT(x) (assert((x)==hipSuccess))
#define __HANDLE__ rocblas_handle
#define __COMPLEX8__ rocblas_float_complex
#define __BLAS_OPERATION__ rocblas_operation
#define __NO_TRANSFORM__ rocblas_operation_none
#define __CT_TRANSFORM__ rocblas_operation_conjugate_transpose
#define __CGMEM__ rocblas_cgemm
#define __PREFIX(c) hip##c
#define __CREATE_HANDLE(h) rocblas_create_handle(h)
#define __DESTROY_HANDLE(h) rocblas_destroy_handle(h)
#endif
#ifdef CUDA
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#define __ASSERT(x) (assert((x)==cudaSuccess))
#define __HANDLE__ cublasHandle_t
#define __COMPLEX8__ cuComplex
#define __BLAS_OPERATION__ cublasOperation_t
#define __NO_TRANSFORM__ CUBLAS_OP_N
#define __CT_TRANSFORM__ CUBLAS_OP_C
#define __CGMEM__ cublasCgemm
#define __PREFIX(c) cuda##c
#define __CREATE_HANDLE(h) cublasCreate(h)
#define __DESTROY_HANDLE(h) cublasDestroy(h)
#endif
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#define __HIP_PLATFORM_HCC__
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_vector_types.h>
#include <time.h>
#include <omp.h>
#define __ASSERT(x) (assert((x)==hipSuccess))
size_t m = 10000;
size_t n = 10000;
size_t k = 10000;
static unsigned long x=123456789, y=362436069, z=521288629;
unsigned long
xorshf96
(
void
)
{
// NOT thread save
unsigned long t;
x ^= x << 16;
x ^= x >> 5;
x ^= x << 1;
t = x;
x = y;
y = z;
z = t ^ x ^ y;
return z;
}
struct runtime
{
struct timespec start;
struct timespec stop;
char tag[128];
};
void
timer_start
(
struct runtime * timer,
char tag[128]
)
{
struct timespec start;
sprintf(timer->tag,"%s", tag);
clock_gettime(CLOCK_REALTIME , &start);
timer->start = start;
// printf("--------> start timer: %s\n", timer->tag);
}
double
timer_stop
(
struct runtime * timer
)
{
struct timespec stop;
clock_gettime(CLOCK_REALTIME , &stop);
timer->stop = stop;
double res= (double)
(
(timer->stop).tv_sec - (timer->start).tv_sec
)*1000.
+
(double)
(
(timer->stop).tv_nsec - (timer->start).tv_nsec
)/1000000.
;
// printf("--------> stop timer %s: %g ms\n", timer->tag, res );
return res;
}
void
multiplication
(
rocblas_handle handle,
const rocblas_float_complex *A,
const rocblas_float_complex *B,
rocblas_float_complex *C
)
{
rocblas_operation transA = rocblas_operation_none;
rocblas_operation transB = rocblas_operation_conjugate_transpose;
const rocblas_float_complex alpha = {.x = 1.f, .y = 0.f};
const rocblas_float_complex beta = {.x = 0.f, .y = 0.f};
rocblas_int lda = n;
rocblas_int ldb = n;
rocblas_int ldc = k;
rocblas_cgemm
(
handle,
transA,
transB,
m,
n,
k,
&alpha,
A,
lda,
B,
ldb,
&beta,
C,
ldc
);
}
void
prepare_matrices
(
rocblas_float_complex * hA,
rocblas_float_complex * hB
)
{
float fact = 1.f/(float)n/(float)x/(float)y/20.f;
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
{
for (size_t j = 0; j < m; j++)
{
size_t ind = j + m * i;
hA[ind].x = (float)xorshf96()*fact;
hA[ind].y = (float)xorshf96()*fact;
}
}
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
{
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
hB[ind].x = (float)xorshf96()*fact;
hB[ind].y = (float)xorshf96()*fact;
}
}
}
void
print_result
(
rocblas_float_complex * hC
)
{
printf("-------- %zu %zu\n", m, k);
for (size_t i = 0; i < m; i++)
{
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
}
printf("\n");
}
printf("--------\n");
}
int
run_test
(
size_t dim,
unsigned rep,
float * res
)
{
m = dim;
n = dim;
k = dim;
struct runtime * timer = malloc(sizeof(*timer));
rocblas_float_complex *A;
rocblas_float_complex *B;
rocblas_float_complex *C;
__ASSERT(hipMalloc((void **)&A, sizeof(*A) * (size_t)(m * n)));
if (A == NULL)
{
fprintf(stderr, "A not allocated\n");
exit(1);
}
__ASSERT(hipMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)));
if (B == NULL)
{
fprintf(stderr, "B not allocated\n");
exit(1);
}
__ASSERT(hipMalloc((void **)&C, sizeof(*C) * (size_t)(m * k)));
if (C == NULL)
{
fprintf(stderr, "C not allocated\n");
exit(1);
}
rocblas_float_complex *hA = malloc(sizeof(*hA) * (size_t)(m * n));
if (hA == NULL)
{
fprintf(stderr, "hA not allocated\n");
exit(1);
}
rocblas_float_complex *hB = malloc(sizeof(*hB) * (size_t)(k * n));
if (hB == NULL)
{
fprintf(stderr, "hB not allocated\n");
exit(1);
}
rocblas_float_complex *hC = malloc(sizeof(*hC) * (size_t)(m * k));
if (hC == NULL)
{
fprintf(stderr, "hC not allocated\n");
exit(1);
}
// timer_start(timer, "Prepare matrices");
prepare_matrices(hA, hB);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT(hipMemcpy(A, hA, sizeof(*A) * (size_t)(m * n), hipMemcpyHostToDevice));
__ASSERT(hipMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), hipMemcpyHostToDevice));
// timer_stop(timer);
rocblas_handle handle;
//timer_start(timer, "Create Handle");
if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
//timer_stop(timer);
for (unsigned r = 0; r < rep; r++)
{
float res_r = 0.f;
char mes[128];
sprintf(mes, "dim %zu run %d a" ,dim, r);
timer_start(timer, mes);
multiplication
(
handle,
A,
B,
C
);
res_r += timer_stop(timer);
sprintf(mes, "dim %zu run %d b" ,dim, r);
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
);
res_r += timer_stop(timer);
*/
res[r] = res_r/1.f;
}
printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
printf("required size: %f GB\n",
(
m * n * sizeof(*A)
+ k * n * sizeof(*B)
+ k * m * sizeof(*C)
)/1.e+9);
__ASSERT(hipMemcpy(hC, C, sizeof(*hC) * (size_t)(k * m), hipMemcpyDeviceToHost));
//print_result(hC);
// timer_start(timer, "Destroy Handle");
if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
// timer_stop(timer);
hipFree(A);
hipFree(B);
hipFree(C);
free(hA);
free(hB);
free(hC);
free(timer);
return 0;
}
int
main
(
)
{
int rep = 512;
int min_dim = 1;
int max_dim = 14;
float * res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
if (res == NULL)
{
fprintf(stderr, "Couldn't allocate res\n");
exit(1);
}
for (int i = min_dim; i < max_dim; i++)
{
size_t dim = 1 << i;
int ind = (i - min_dim) * rep;
run_test(dim, rep, &res[ind]);
}
// store the results
FILE * f;
char name[128];
sprintf(name, "runtimes");
f= fopen(name, "w");
if (f == NULL)
{
fprintf(stderr, "Couldn't open %s\n", name);
}
for (int i = min_dim; i < max_dim; i++)
{
size_t dim = 1 << i;
fprintf(f, "%zu\t", dim);
}
fprintf(f, "\n");
for (int r = 0; r < rep; r++)
{
for (int i = min_dim; i < max_dim; i++)
{
size_t pos = (i - min_dim) * rep + r;
fprintf(f, "%1.6f\t", res[pos]);
}
fprintf(f, "\n");
}
fclose(f);
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment