Commit 214b3996 authored by Henning Fehrmann, committed by Henning Fehrmann

try half precision mm

parent 2a4cf28a
@@ -3,6 +3,7 @@ GPU=AMD
 GPU=NVIDIA
 OBJ_blas = blas.o
+OBJ_blas_hp = blas_hp.o
 OBJ_fftw = fftw.o
 OBJ_tensor_core = tensor_core.o
@@ -25,17 +26,23 @@ else
 unknown_HW:
 endif
-all: blas fftw
+all: blas fftw blas_hp
 blas: ${OBJ_blas}
 	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
+blas_hp: ${OBJ_blas_hp}
+	${CC} -o blas_hp ${OBJ_blas_hp} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
 tensor_core: ${OBJ_tensor_core}
 	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
 fftw: ${OBJ_fftw}
 	${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}
+%.o: %.cu ${HEADER}
+	nvcc ${INCLUDE} -dc $< ${CUDAFLAGS}
 %.o: %.c ${HEADER}
 	${CC} ${CFLAGS} -c $<
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipcc or nvcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
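/*
 * Overview (descriptive note): this benchmark times repeated half-precision
 * matrix multiplications with cublasHgemm.  run_test() allocates host and
 * device buffers for A (m x k), B (k x n) and C (m x n), copies A and B to
 * the device, runs the GEMM `rep` times and records the runtime of each
 * repetition; main() sweeps a grid of power-of-two matrix sizes.
 */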
#include "hardware_settings.h"
#include "profiler.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cufftw.h>
#define __MALLOC(P, size) P = malloc(size); \
	if (P == NULL) \
	{ \
		fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
		exit(EXIT_FAILURE); \
	}
void
check_status
(
	cublasStatus_t status
)
{
	switch (status)
	{
		case CUBLAS_STATUS_SUCCESS:
			break;
		case CUBLAS_STATUS_NOT_INITIALIZED:
			printf("not initialized\n");
			break;
		case CUBLAS_STATUS_ALLOC_FAILED:
			printf("CUBLAS_STATUS_ALLOC_FAILED\n");
			break;
		case CUBLAS_STATUS_INVALID_VALUE:
			printf("CUBLAS_STATUS_INVALID_VALUE\n");
			break;
		case CUBLAS_STATUS_ARCH_MISMATCH:
			printf("CUBLAS_STATUS_ARCH_MISMATCH\n");
			break;
		case CUBLAS_STATUS_MAPPING_ERROR:
			printf("CUBLAS_STATUS_MAPPING_ERROR\n");
			break;
		case CUBLAS_STATUS_EXECUTION_FAILED:
			printf("CUBLAS_STATUS_EXECUTION_FAILED\n");
			break;
		case CUBLAS_STATUS_INTERNAL_ERROR:
			printf("CUBLAS_STATUS_INTERNAL_ERROR\n");
			break;
		case CUBLAS_STATUS_NOT_SUPPORTED:
			printf("CUBLAS_STATUS_NOT_SUPPORTED\n");
			break;
		case CUBLAS_STATUS_LICENSE_ERROR:
			printf("CUBLAS_STATUS_LICENSE_ERROR\n");
			break;
	}
}
void
prepare_matrices
(
	__half * hA,
	__half * hB,
	size_t m,
	size_t n,
	size_t k
)
{
	/* normalisation factor for the random entries */
	float fact = 1.f / (float)m / (float)n / (float)k / 20.f;
#pragma omp parallel for
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			hA[ind] = xorshf96() * fact;
		}
	}
#pragma omp parallel for
	for (size_t i = 0; i < n; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			hB[ind] = xorshf96() * fact;
		}
	}
}
void
print_result
(
	__half * hC,
	size_t m,
	size_t n,
	size_t k
)
{
	printf("-------- %zu %zu\n", m, k);
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			printf("%1.2f\t", __half2float(hC[ind]));
		}
		printf("\n");
	}
	printf("--------\n");
}
int
run_test
(
	size_t m,
	size_t n,
	size_t k,
	unsigned rep,
	float * res,
	cublasHandle_t handle
)
{
	struct runtime * timer;
	timer = (struct runtime *)malloc(sizeof(*timer));
	__half *A;
	__half *B;
	__half *C;
	if (cudaMalloc((void **)&A, sizeof(*A) * (size_t)(m * k)) != cudaSuccess)
	{
		fprintf(stderr, "A not allocated\n");
		exit(1);
	}
	if (cudaMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)) != cudaSuccess)
	{
		fprintf(stderr, "B not allocated\n");
		exit(1);
	}
	if (cudaMalloc((void **)&C, sizeof(*C) * (size_t)(m * n)) != cudaSuccess)
	{
		fprintf(stderr, "C not allocated\n");
		exit(1);
	}
	__half *hA;
	hA = (__half *)malloc(sizeof(*hA) * (size_t)(m * k));
	__half *hB;
	hB = (__half *)malloc(sizeof(*hB) * (size_t)(k * n));
	__half *hC;
	hC = (__half *)malloc(sizeof(*hC) * (size_t)(n * m));
//	timer_start(timer, "Prepare matrices");
//	timer_stop(timer);
//	timer_start(timer, "Memcopy");
//	timer_stop(timer);
//	timer_start(timer, "Create Handle");
//	if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
//	timer_stop(timer);
//	prepare_matrices(hA, hB, m, n, k);
	cublasOperation_t transA = CUBLAS_OP_T;
	cublasOperation_t transB = CUBLAS_OP_N;
	const __half alpha = 1.f;
	const __half beta = 0.f;
	for (unsigned r = 0; r < rep; r++)
	{
		cudaMemcpy(A, hA, sizeof(*A) * (size_t)(m * k), cudaMemcpyHostToDevice);
		cudaMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), cudaMemcpyHostToDevice);
		float res_r = 0.f;
		char mes[128];
		sprintf(mes, "m %zu n %zu k %zu run %u", m, n, k, r);
		timer_start(timer, mes);
		/* A and B follow the layout produced by prepare_matrices():
		 * row-major m x k and n x k, i.e. column-major k x m and k x n,
		 * hence OP_T for A with lda = ldb = k and ldc = m. */
		check_status(
			cublasHgemm
			(
				handle,
				transA,
				transB,
				m,
				n,
				k,
				&alpha,
				A,
				k,
				B,
				k,
				&beta,
				C,
				m
			)
		);
//		cublasIcamax(handle, m * n, C, 1, &result);
		cudaDeviceSynchronize();
		res_r += timer_stop(timer);
		res[r] = res_r;
	}
	printf("dimensions: %zu %zu %zu\t -- ", m, n, k);
	printf("required size: %f GB\n",
		(
			m * k * sizeof(*A)
			+ k * n * sizeof(*B)
			+ m * n * sizeof(*C)
		) / 1.e+9);
	cudaFree(A);
	cudaFree(B);
	cudaFree(C);
	free(hA);
	free(hB);
	free(hC);
	free(timer);
	return 0;
}
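/*
 * Hypothetical helper, not part of the original benchmark: convert a
 * measured GEMM runtime into an effective throughput figure.  It assumes
 * the elapsed time is supplied in seconds; one m x n x k GEMM performs
 * roughly 2*m*n*k floating-point operations.
 */
double
gemm_tflops
(
	size_t m,
	size_t n,
	size_t k,
	double seconds
)
{
	return 2.0 * (double)m * (double)n * (double)k / seconds / 1.0e12;
}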
int
main
(
)
{
	int rep = 10;
	size_t m_min = 9;	// 13
	size_t m_max = 10;	// 16
	size_t n_min = 17;	// 11
	size_t n_max = 18;	// 19
	size_t k_min = 8;	// 7
	size_t k_max = 9;	// 11
	float * res;
//	cudaSetDevice(0);
	cublasHandle_t handle;
	res = (float *)malloc(sizeof(*res) * (size_t)(
		(m_max - m_min + 1) *
		(n_max - n_min + 1) *
		(k_max - k_min + 1) *
		rep));
	cublasCreate(&handle);
	for (size_t em = m_min; em <= m_max; em++)
	{
		for (size_t en = n_min; en <= n_max; en++)
		{
			for (size_t ek = k_min; ek <= k_max; ek++)
			{
				run_test(1 << em, 1 << en, 1 << ek, rep, &res[0], handle);
			}
		}
	}
	cublasDestroy(handle);
	exit(0);
	// store the results
	/*
	FILE * f;
	char name[128];
	sprintf(name, "runtimes");
	f = fopen(name, "w");
	if (f == NULL)
	{
		fprintf(stderr, "Couldn't open %s\n", name);
	}
	for (int i = min_dim; i < max_dim; i++)
	{
		size_t dim = 1 << i;
		fprintf(f, "%zu\t", dim);
	}
	fprintf(f, "\n");
	for (int r = 0; r < rep; r++)
	{
		for (int i = min_dim; i < max_dim; i++)
		{
			size_t pos = (i - min_dim) * rep + r;
			fprintf(f, "%1.6f\t", res[pos]);
		}
		fprintf(f, "\n");
	}
	fclose(f);
	*/
	return 0;
}
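Note: cublasHgemm accumulates in half precision. A variant worth benchmarking alongside it is FP16 input with FP32 accumulation. The following is a minimal sketch, not part of this commit, using cublasGemmEx (cuBLAS 11+); it reuses the buffer layout of run_test(), and the function name and the choice of CUBLAS_COMPUTE_32F are assumptions of the sketch.

/*
 * Sketch (assumption): the same GEMM as in run_test(), but with FP16 inputs
 * and FP32 accumulation.  A, B and C use the run_test() layout, so
 * lda = ldb = k and ldc = m.
 */
int
run_gemm_fp32_acc
(
	cublasHandle_t handle,
	const __half * A,
	const __half * B,
	__half * C,
	size_t m,
	size_t n,
	size_t k
)
{
	const float alpha = 1.f;
	const float beta = 0.f;
	check_status(
		cublasGemmEx
		(
			handle,
			CUBLAS_OP_T,
			CUBLAS_OP_N,
			m,
			n,
			k,
			&alpha,
			A,
			CUDA_R_16F,
			k,
			B,
			CUDA_R_16F,
			k,
			&beta,
			C,
			CUDA_R_16F,
			m,
			CUBLAS_COMPUTE_32F,
			CUBLAS_GEMM_DEFAULT
		)
	);
	return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}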