Commit 214b3996 authored by Henning Fehrmann, committed by Henning Fehrmann

try half precision mm

parent 2a4cf28a
@@ -3,6 +3,7 @@ GPU=AMD
 GPU=NVIDIA
 OBJ_blas = blas.o
+OBJ_blas_hp = blas_hp.o
 OBJ_fftw = fftw.o
 OBJ_tensor_core = tensor_core.o
@@ -25,17 +26,23 @@ else
 unknown_HW:
 endif
-all: blas fftw
+all: blas fftw blas_hp
 blas: ${OBJ_blas}
 	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
+blas_hp: ${OBJ_blas_hp}
+	${CC} -o blas_hp ${OBJ_blas_hp} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
 tensor_core: ${OBJ_tensor_core}
 	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
 fftw: ${OBJ_fftw}
 	${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}
+%.o: %.cu ${HEADER}
+	nvcc ${INCLUDE} -dc $< ${CUDAFLAGS}
 %.o: %.c ${HEADER}
 	${CC} ${CFLAGS} -c $<
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipcc or nvcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
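/*
 * Overview (descriptive note): this benchmark times repeated half-precision
 * matrix multiplications with cublasHgemm.  run_test() allocates host and
 * device buffers for A (m x k), B (k x n) and C (m x n), copies A and B to
 * the device, runs the GEMM `rep` times and records the runtime of each
 * repetition; main() sweeps a grid of power-of-two matrix sizes.
 */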
#include "hardware_settings.h"
#include "profiler.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cufftw.h>
#define __MALLOC(P, size) P = malloc(size); \
	if (P == NULL) \
	{ \
		fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
		exit(EXIT_FAILURE); \
	}
void
check_status
(
	cublasStatus_t status
)
{
	switch (status)
	{
		case CUBLAS_STATUS_SUCCESS:
			break;
		case CUBLAS_STATUS_NOT_INITIALIZED:
			printf("not initialized\n");
			break;
		case CUBLAS_STATUS_ALLOC_FAILED:
			printf("CUBLAS_STATUS_ALLOC_FAILED\n");
			break;
		case CUBLAS_STATUS_INVALID_VALUE:
			printf("CUBLAS_STATUS_INVALID_VALUE\n");
			break;
		case CUBLAS_STATUS_ARCH_MISMATCH:
			printf("CUBLAS_STATUS_ARCH_MISMATCH\n");
			break;
		case CUBLAS_STATUS_MAPPING_ERROR:
			printf("CUBLAS_STATUS_MAPPING_ERROR\n");
			break;
		case CUBLAS_STATUS_EXECUTION_FAILED:
			printf("CUBLAS_STATUS_EXECUTION_FAILED\n");
			break;
		case CUBLAS_STATUS_INTERNAL_ERROR:
			printf("CUBLAS_STATUS_INTERNAL_ERROR\n");
			break;
		case CUBLAS_STATUS_NOT_SUPPORTED:
			printf("CUBLAS_STATUS_NOT_SUPPORTED\n");
			break;
		case CUBLAS_STATUS_LICENSE_ERROR:
			printf("CUBLAS_STATUS_LICENSE_ERROR\n");
			break;
	}
}
void
prepare_matrices
(
	__half * hA,
	__half * hB,
	size_t m,
	size_t n,
	size_t k
)
{
	/* normalisation factor for the random entries */
	float fact = 1.f / (float)m / (float)n / (float)k / 20.f;
#pragma omp parallel for
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			hA[ind] = xorshf96() * fact;
		}
	}
#pragma omp parallel for
	for (size_t i = 0; i < n; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			hB[ind] = xorshf96() * fact;
		}
	}
}
void
print_result
(
	__half * hC,
	size_t m,
	size_t n,
	size_t k
)
{
	printf("-------- %zu %zu\n", m, k);
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			printf("%1.2f\t", __half2float(hC[ind]));
		}
		printf("\n");
	}
	printf("--------\n");
}
int
run_test
(
	size_t m,
	size_t n,
	size_t k,
	unsigned rep,
	float * res,
	cublasHandle_t handle
)
{
	struct runtime * timer;
	timer = (struct runtime *)malloc(sizeof(*timer));
	__half *A;
	__half *B;
	__half *C;
	if (cudaMalloc((void **)&A, sizeof(*A) * (size_t)(m * k)) != cudaSuccess)
	{
		fprintf(stderr, "A not allocated\n");
		exit(1);
	}
	if (cudaMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)) != cudaSuccess)
	{
		fprintf(stderr, "B not allocated\n");
		exit(1);
	}
	if (cudaMalloc((void **)&C, sizeof(*C) * (size_t)(m * n)) != cudaSuccess)
	{
		fprintf(stderr, "C not allocated\n");
		exit(1);
	}
	__half *hA;
	hA = (__half *)malloc(sizeof(*hA) * (size_t)(m * k));
	__half *hB;
	hB = (__half *)malloc(sizeof(*hB) * (size_t)(k * n));
	__half *hC;
	hC = (__half *)malloc(sizeof(*hC) * (size_t)(n * m));
//	timer_start(timer, "Prepare matrices");
//	timer_stop(timer);
//	timer_start(timer, "Memcopy");
//	timer_stop(timer);
//	timer_start(timer, "Create Handle");
//	if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
//	timer_stop(timer);
//	prepare_matrices(hA, hB, m, n, k);
	cublasOperation_t transA = CUBLAS_OP_T;
	cublasOperation_t transB = CUBLAS_OP_N;
	const __half alpha = 1.f;
	const __half beta = 0.f;
	for (unsigned r = 0; r < rep; r++)
	{
		cudaMemcpy(A, hA, sizeof(*A) * (size_t)(m * k), cudaMemcpyHostToDevice);
		cudaMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), cudaMemcpyHostToDevice);
		float res_r = 0.f;
		char mes[128];
		sprintf(mes, "m %zu n %zu k %zu run %u", m, n, k, r);
		timer_start(timer, mes);
		/* A and B follow the layout produced by prepare_matrices():
		 * row-major m x k and n x k, i.e. column-major k x m and k x n,
		 * hence OP_T for A with lda = ldb = k and ldc = m. */
		check_status(
			cublasHgemm
			(
				handle,
				transA,
				transB,
				m,
				n,
				k,
				&alpha,
				A,
				k,
				B,
				k,
				&beta,
				C,
				m
			)
		);
//		cublasIcamax(handle, m * n, C, 1, &result);
		cudaDeviceSynchronize();
		res_r += timer_stop(timer);
		res[r] = res_r;
	}
	printf("dimensions: %zu %zu %zu\t -- ", m, n, k);
	printf("required size: %f GB\n",
		(
			m * k * sizeof(*A)
			+ k * n * sizeof(*B)
			+ m * n * sizeof(*C)
		) / 1.e+9);
	cudaFree(A);
	cudaFree(B);
	cudaFree(C);
	free(hA);
	free(hB);
	free(hC);
	free(timer);
	return 0;
}
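/*
 * Hypothetical helper, not part of the original benchmark: convert a
 * measured GEMM runtime into an effective throughput figure.  It assumes
 * the elapsed time is supplied in seconds; one m x n x k GEMM performs
 * roughly 2*m*n*k floating-point operations.
 */
double
gemm_tflops
(
	size_t m,
	size_t n,
	size_t k,
	double seconds
)
{
	return 2.0 * (double)m * (double)n * (double)k / seconds / 1.0e12;
}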
int
main
(
)
{
	int rep = 10;
	size_t m_min = 9;	// 13
	size_t m_max = 10;	// 16
	size_t n_min = 17;	// 11
	size_t n_max = 18;	// 19
	size_t k_min = 8;	// 7
	size_t k_max = 9;	// 11
	float * res;
//	cudaSetDevice(0);
	cublasHandle_t handle;
	res = (float *)malloc(sizeof(*res) * (size_t)(
		(m_max - m_min + 1) *
		(n_max - n_min + 1) *
		(k_max - k_min + 1) *
		rep));
	cublasCreate(&handle);
	for (size_t em = m_min; em <= m_max; em++)
	{
		for (size_t en = n_min; en <= n_max; en++)
		{
			for (size_t ek = k_min; ek <= k_max; ek++)
			{
				run_test(1 << em, 1 << en, 1 << ek, rep, &res[0], handle);
			}
		}
	}
	cublasDestroy(handle);
	exit(0);
	// store the results
	/*
	FILE * f;
	char name[128];
	sprintf(name, "runtimes");
	f = fopen(name, "w");
	if (f == NULL)
	{
		fprintf(stderr, "Couldn't open %s\n", name);
	}
	for (int i = min_dim; i < max_dim; i++)
	{
		size_t dim = 1 << i;
		fprintf(f, "%zu\t", dim);
	}
	fprintf(f, "\n");
	for (int r = 0; r < rep; r++)
	{
		for (int i = min_dim; i < max_dim; i++)
		{
			size_t pos = (i - min_dim) * rep + r;
			fprintf(f, "%1.6f\t", res[pos]);
		}
		fprintf(f, "\n");
	}
	fclose(f);
	*/
	return 0;
}
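Note: cublasHgemm accumulates in half precision. A variant worth benchmarking alongside it is FP16 input with FP32 accumulation. The following is a minimal sketch, not part of this commit, using cublasGemmEx (cuBLAS 11+); it reuses the buffer layout of run_test(), and the function name and the choice of CUBLAS_COMPUTE_32F are assumptions of the sketch.

/*
 * Sketch (assumption): the same GEMM as in run_test(), but with FP16 inputs
 * and FP32 accumulation.  A, B and C use the run_test() layout, so
 * lda = ldb = k and ldc = m.
 */
int
run_gemm_fp32_acc
(
	cublasHandle_t handle,
	const __half * A,
	const __half * B,
	__half * C,
	size_t m,
	size_t n,
	size_t k
)
{
	const float alpha = 1.f;
	const float beta = 0.f;
	check_status(
		cublasGemmEx
		(
			handle,
			CUBLAS_OP_T,
			CUBLAS_OP_N,
			m,
			n,
			k,
			&alpha,
			A,
			CUDA_R_16F,
			k,
			B,
			CUDA_R_16F,
			k,
			&beta,
			C,
			CUDA_R_16F,
			m,
			CUBLAS_COMPUTE_32F,
			CUBLAS_GEMM_DEFAULT
		)
	);
	return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}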