Commit fd01712c authored by Henning Fehrmann's avatar Henning Fehrmann Committed by Henning Fehrmann
Browse files

working example

parent 214b3996
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipc or nvcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#include "hardware_settings.h"
#include "profiler.h"
#include <unistd.h>
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cufftw.h>
#include "fp16_conversion.h"
#include <time.h>
/*
 * Checked allocation: assigns malloc(size) to P and aborts with a
 * file/line diagnostic on failure.  Wrapped in do { } while (0) so the
 * macro behaves as a single statement (safe after an un-braced if/else);
 * P is parenthesized so pointer expressions expand correctly.
 * NOTE(review): identifiers starting with a double underscore are
 * reserved in C/C++; the name is kept for interface compatibility.
 */
#define __MALLOC(P, size) \
	do { \
		(P) = malloc(size); \
		if ((P) == NULL) \
		{ \
			fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

using namespace std;

#define FP16MM
/*
 * Print a human-readable message for a failing cuBLAS status.
 * Logging only: CUBLAS_STATUS_SUCCESS prints nothing and the function
 * never aborts, so callers may continue after a report.
 */
void
check_status
(
	cublasStatus_t status
)
{
	switch (status)
	{
		case CUBLAS_STATUS_SUCCESS:
			break;
		case CUBLAS_STATUS_NOT_INITIALIZED:
			printf("not initialized\n");
			break;
		case CUBLAS_STATUS_ALLOC_FAILED:
			printf("CUBLAS_STATUS_ALLOC_FAILED\n");
			break;
		case CUBLAS_STATUS_INVALID_VALUE:
			printf("CUBLAS_STATUS_INVALID_VALUE\n");
			break;
		case CUBLAS_STATUS_ARCH_MISMATCH:
			printf("CUBLAS_STATUS_ARCH_MISMATCH\n");
			break;
		case CUBLAS_STATUS_MAPPING_ERROR:
			printf("CUBLAS_STATUS_MAPPING_ERROR\n");
			break;
		case CUBLAS_STATUS_EXECUTION_FAILED:
			printf("CUBLAS_STATUS_EXECUTION_FAILED\n");
			break;
		case CUBLAS_STATUS_INTERNAL_ERROR:
			printf("CUBLAS_STATUS_INTERNAL_ERROR\n");
			break;
		case CUBLAS_STATUS_NOT_SUPPORTED:
			printf("CUBLAS_STATUS_NOT_SUPPORTED\n");
			break;
		case CUBLAS_STATUS_LICENSE_ERROR:
			printf("CUBLAS_STATUS_LICENSE_ERROR\n");
			break;
		default:
			/* Statuses added by newer cuBLAS releases. */
			printf("unknown cublas status\n");
			break;
	}
}

/*
 * Map a cublasStatus_t to its symbolic name.
 * Returns "unknown error" for values not covered below (e.g. statuses
 * introduced by newer cuBLAS releases).
 */
const char* cublasGetErrorString(cublasStatus_t status)
{
	switch(status)
	{
		case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
		case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
		case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
		case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
		case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
		case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
		case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
		case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
		case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
		case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
		default: break;
	}
	return "unknown error";
}
/*
 * Fill the host matrices with scaled pseudo-random values.
 *   hA: m rows x k cols, row-major (element (i,j) at j + k*i)
 *   hB: n rows x k cols, row-major (element (i,j) at j + k*i)
 * xorshf96() is the PRNG provided by the project headers.
 *
 * NOTE(review): the merged source computed `fact` from undefined
 * variables x and y; it is reconstructed here from the matrix extents
 * so that length-k dot products stay in half-precision range — confirm
 * the intended scaling against the original benchmark.
 */
void
prepare_matrices
(
	__half * hA,
	__half * hB,
	size_t m,
	size_t n,
	size_t k
)
{
	const float fact = 1.f/(float)m/(float)n/(float)k/20.f;
#pragma omp parallel for
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			hA[ind] = xorshf96()*fact;
		}
	}
#pragma omp parallel for
	for (size_t i = 0; i < n; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			hB[ind] = xorshf96()*fact;
		}
	}
}

// Convenience function for checking CUDA runtime API results;
// can be wrapped around any runtime API call.  Prints the error string
// and asserts on failure, and passes the status through so the call
// site can also inspect it.
inline
cudaError_t checkCuda(cudaError_t result)
{
	if (result != cudaSuccess) {
		fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
		assert(result == cudaSuccess);
	}
	return result;
}
/*
 * Dump the m x k host matrix hC to stdout, one row per line.
 * The n parameter is unused but kept for signature compatibility with
 * the other matrix helpers.
 */
void
print_result
(
	__half * hC,
	size_t m,
	size_t n,
	size_t k
)
{
	(void)n;  /* unused; retained for a uniform helper signature */
	printf("-------- %zu %zu\n", m, k);
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			/* __half cannot travel through printf varargs; convert
			 * explicitly to float (promoted to double by %f). */
			printf("%1.2f\t", __half2float(hC[ind]));
		}
		printf("\n");
	}
	printf("--------\n");
}

// Convenience function for checking cuBLAS results; prints the
// symbolic status name and asserts on failure, passing the status
// through for inline use.
inline
cublasStatus_t checkCublas(cublasStatus_t result)
{
	if (result != CUBLAS_STATUS_SUCCESS) {
		fprintf(stderr, "CUDA Runtime Error: %s\n", cublasGetErrorString(result));
		assert(result == CUBLAS_STATUS_SUCCESS);
	}
	return result;
}
/*
 * Benchmark one cublasHgemm configuration.
 *
 * Computes C(m x n) = A^T(m x k) * B(k x n) `rep` times, timing each
 * run (host-to-device copy excluded from the timed region) and storing
 * the per-run milliseconds into res[0..rep-1].
 *
 * Layout: prepare_matrices fills hA as m x k row-major (= k x m
 * column-major, ld = k) and hB as n x k row-major (= k x n
 * column-major, ld = k).  Hence with transA = OP_T and transB = OP_N
 * the leading dimensions are lda = k, ldb = k, ldc = m, satisfying the
 * cuBLAS requirements (lda >= k for OP_T, ldb >= k for OP_N).
 *
 * Returns 0 on success; exits the process on allocation failure.
 */
int
run_test
(
	size_t m,
	size_t n,
	size_t k,
	unsigned rep,
	float * res,
	cublasHandle_t handle
)
{
	struct runtime * timer;  /* profiler state, see profiler.h */
	timer = (runtime *)malloc(sizeof(*timer));
	if (timer == NULL) { fprintf(stderr, "timer not allocated\n"); exit(1); }
	__half *A;
	__half *B;
	__half *C;
	/* cudaMalloc leaves the pointer unmodified on failure, so the
	 * returned status must be checked, not the pointer value. */
	if (cudaMalloc((void **)&A, sizeof(*A) * (size_t)(m * k)) != cudaSuccess)
	{ fprintf(stderr, "A not allocated\n"); exit(1); }
	if (cudaMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)) != cudaSuccess)
	{ fprintf(stderr, "B not allocated\n"); exit(1); }
	if (cudaMalloc((void **)&C, sizeof(*C) * (size_t)(m * n)) != cudaSuccess)
	{ fprintf(stderr, "C not allocated\n"); exit(1); }
	__half *hA;
	hA = (__half * )malloc(sizeof(*hA) * (size_t)(m * k));
	if (hA == NULL) { fprintf(stderr, "hA not allocated\n"); exit(1); }
	__half *hB;
	hB = (__half * )malloc(sizeof(*hB) * (size_t)(k * n));
	if (hB == NULL) { fprintf(stderr, "hB not allocated\n"); exit(1); }
	__half *hC;
	hC = (__half * )malloc(sizeof(*hC) * (size_t)(n * m));
	if (hC == NULL) { fprintf(stderr, "hC not allocated\n"); exit(1); }
	/* Without this the device buffers would be filled from
	 * uninitialized host memory. */
	prepare_matrices(hA, hB, m, n, k);
	cublasOperation_t transA = CUBLAS_OP_T;
	cublasOperation_t transB = CUBLAS_OP_N;
	const __half alpha = 1.f;
	const __half beta = 0.f;
	for (unsigned r = 0; r < rep; r++)
	{
		cudaMemcpy(A, hA, sizeof(*A) * (size_t)(m * k), cudaMemcpyHostToDevice);
		cudaMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), cudaMemcpyHostToDevice);
		float res_r = 0.f;
		char mes[128];
		sprintf(mes, "m %zu n %zu k %zu run %d", m, n, k, r);
		timer_start(timer, mes);
		check_status(
			cublasHgemm
			(
				handle,
				transA,
				transB,
				m,
				n,
				k,
				&alpha,
				A,
				k,   /* lda: A stored k x m column-major */
				B,
				k,   /* ldb: B stored k x n column-major */
				&beta,
				C,
				m    /* ldc: C stored m x n column-major */
			)
		);
		/* Synchronize so timer_stop measures the GEMM, not just the
		 * asynchronous launch. */
		cudaDeviceSynchronize();
		res_r += timer_stop(timer);
		res[r] = res_r;
	}
	printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
	/* Footprint of the three device buffers, paired with their actual
	 * extents: A is m x k, B is k x n, C is m x n. */
	printf("required size: %f GB\n",
		(
			m * k * sizeof(*A)
			+ k * n * sizeof(*B)
			+ m * n * sizeof(*C)
		)/1.e+9);
	cudaFree(A);
	cudaFree(B);
	cudaFree(C);
	free(hA);
	free(hB);
	free(hC);
	free(timer);
	return 0;
}

// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU.
// Currently writes the constant 0.1f everywhere; the commented line
// preserves the original random-fill variant.
void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A)
{
	for(int i = 0; i < nr_rows_A * nr_cols_A; i++){
		//A[i] = (float)rand()/(float)(RAND_MAX/a);
		A[i] = 0.1f;
	}
}
/*
 * Entry point: single-configuration cublasHgemm benchmark.
 *
 * Allocates A (m x k), B (k x n) and C (m x n) in managed memory,
 * fills them from float host buffers converted with
 * approx_float_to_half (fp16_conversion.h), then times `repeats`
 * Hgemm calls with both CUDA events (device time) and
 * clock_gettime(CLOCK_REALTIME) (wall time) and prints the averages.
 *
 * Column-major NN GEMM, so lda = m, ldb = k, ldc = m.
 * Command-line arguments are currently unused.
 */
int main
(
	int argc,
	char ** argv
)
{
	(void)argc;
	(void)argv;
	int repeats = 10;
	cout << "\nrunning cublasHgemm test\n" << endl;
	cublasStatus_t stat;
	cublasHandle_t handle;
	checkCublas(cublasCreate(&handle));
	/* Fixed problem size; ~0.7 GB of __half data in total. */
	size_t m = 1 << 11;
	size_t n = 1 << 17;
	size_t k = 1 << 9;
	float *h_A = (float *)malloc(sizeof(*h_A) * m * k);
	float *h_B = (float *)malloc(sizeof(*h_B) * k * n);
	float *h_C = (float *)malloc(sizeof(*h_C) * n * m);
	if (h_A == NULL || h_B == NULL || h_C == NULL)
	{
		fprintf(stderr, "host allocation failed\n");
		exit(1);
	}
	CPU_fill_rand(h_A, m, k);
	CPU_fill_rand(h_B, k, n);
	CPU_fill_rand(h_C, n, m);
	__half *d_A, *d_B, *d_C;
	checkCuda(cudaMallocManaged(&d_A, m * k * sizeof(* d_A)));
	checkCuda(cudaMallocManaged(&d_B, k * n * sizeof(* d_B)));
	checkCuda(cudaMallocManaged(&d_C, m * n * sizeof(* d_C)));
	/* Convert the float host data to half precision directly into the
	 * managed buffers. */
	for (size_t i = 0; i < m * k; i++)
	{
		d_A[i] = approx_float_to_half(h_A[i]);
	}
	for (size_t i = 0; i < k * n; i++)
	{
		d_B[i] = approx_float_to_half(h_B[i]);
	}
	for (size_t i = 0; i < n * m; i++)
	{
		d_C[i] = approx_float_to_half(h_C[i]);
	}
	const __half alf = approx_float_to_half(1.0);
	const __half bet = approx_float_to_half(0.0);
	const __half *alpha = &alf;
	const __half *beta = &bet;
	cudaEvent_t cstart, cstop;
	cudaEventCreate(&cstart);
	cudaEventCreate(&cstop);
	float sum1 = 0.f;  /* accumulated wall time, ms */
	float sum2 = 0.f;  /* accumulated event time, s */
	struct timespec start;
	struct timespec stop;
	for(int r = 0; r < repeats; r++)
	{
		cudaEventRecord(cstart, 0);
		clock_gettime(CLOCK_REALTIME , &start);
		stat = cublasHgemm
		(
			handle,
			CUBLAS_OP_N,
			CUBLAS_OP_N,
			m,
			n,
			k,
			alpha,
			d_A,
			m,
			d_B,
			k,
			beta,
			d_C,
			m
		);
		/* cudaEventSynchronize blocks until the GEMM finished, so the
		 * wall clock below covers the full device execution. */
		cudaEventRecord(cstop,0);
		cudaEventSynchronize(cstop);
		clock_gettime(CLOCK_REALTIME , &stop);
		if(stat != CUBLAS_STATUS_SUCCESS)
		{
			cerr << "cublasHgemm failed" << endl;
			exit(1);
		}
		assert(!cudaGetLastError());
		double wall = (double)
			(
				stop.tv_sec - start.tv_sec
			)*1000.
			+
			(double)
			(
				stop.tv_nsec - start.tv_nsec
			)/1000000.
			;
		float elapsed;
		cudaEventElapsedTime(&elapsed, cstart, cstop);
		elapsed /= 1000.0f;
		sum1 += wall;
		sum2 += elapsed;
		cout << wall << " <=> " << elapsed << endl;
	}
	cout << "float16; size " << m <<" "<< k << " "<< n << " average: " << sum1/repeats << " ms "<< sum2/repeats << " s "<< endl;
	cudaEventDestroy(cstart);
	cudaEventDestroy(cstop);
	//Free GPU memory
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);
	// Free CPU memory
	free(h_A);
	free(h_B);
	free(h_C);
	cublasDestroy(handle);
	return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment