diff --git a/tensor_core.c b/tensor_core.c new file mode 100644 index 0000000000000000000000000000000000000000..6feebfc9d5fb4ae37c1be3397c67898cb5d859e5 --- /dev/null +++ b/tensor_core.c @@ -0,0 +1,335 @@ +/* + * ===================================================================================== + * + * Description: BLAS Benchmark + * + * Version: 1.0 + * Created: 27.01.2021 12:45:18 + * Revision: none + * Compiler: hipc or nvcc + * + * Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de + * Organization: AEI Hannover + * License: GNU General Public License v2 + * + * ===================================================================================== + */ + +#include "hardware_settings.h" +#include "profiler.h" + +#include <stdlib.h> +#include <math.h> +#include <omp.h> +#include <mma.h> +#include <string.h> + +#define __MALLOC(P, size) P = malloc(size); \ + if (P == NULL) \ + {\ + fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \ + exit(EXIT_FAILURE); \ + }\ + +void +check_status +( + cublasStatus_t status +) +{ + switch (status) + { + case CUBLAS_STATUS_SUCCESS: + break; + case CUBLAS_STATUS_NOT_INITIALIZED: + printf("not initialized\n"); + break; + case CUBLAS_STATUS_ALLOC_FAILED: + printf("CUBLAS_STATUS_ALLOC_FAILED\n"); + break; + case CUBLAS_STATUS_INVALID_VALUE: + printf("CUBLAS_STATUS_INVALID_VALUE\n"); + break; + case CUBLAS_STATUS_ARCH_MISMATCH: + printf("CUBLAS_STATUS_ARCH_MISMATCH\n"); + break; + case CUBLAS_STATUS_MAPPING_ERROR: + printf("CUBLAS_STATUS_MAPPING_ERROR\n"); + break; + case CUBLAS_STATUS_EXECUTION_FAILED: + printf("CUBLAS_STATUS_EXECUTION_FAILED\n"); + break; + case CUBLAS_STATUS_INTERNAL_ERROR: + printf("CUBLAS_STATUS_INTERNAL_ERROR\n"); + break; + case CUBLAS_STATUS_NOT_SUPPORTED: + printf("CUBLAS_STATUS_NOT_SUPPORTED\n"); + break; + case CUBLAS_STATUS_LICENSE_ERROR: + printf("CUBLAS_STATUS_LICENSE_ERROR\n"); + break; + } +} + + + +void +multiplication +( + __HANDLE__ handle, + const __COMPLEX8__ *A, + const __COMPLEX8__ *B, + __COMPLEX8__ *C, + size_t m, + size_t n, + size_t k +) +{ + __BLAS_OPERATION__ transA = __NO_TRANSFORM__; + __BLAS_OPERATION__ transB = __CT_TRANSFORM__; + const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f}; + const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f}; + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + check_status( cublasGemmEx( + handle, + transA, + transB, + m, + n, + k, + &alpha, + A, + CUDA_C_16F, + m, + B, + CUDA_C_16F, + n, + &beta, + C, + CUDA_C_16F, + m, + CUDA_C_32F, + algo + )); + exit(0); + // cublasIcamax(handle,m * n, C, 1, &result); + cudaDeviceSynchronize(); +} + +void +prepare_matrices +( + __COMPLEX8__ * hA, + __COMPLEX8__ * hB, + size_t m, + size_t n, + size_t k +) +{ + float fact = 1.f/(float)n/(float)x/(float)y/20.f; +#pragma omp parallel for + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < k; j++) + { + size_t ind = j + k * i; + hA[ind].x = (float)xorshf96()*fact; + hA[ind].y = (float)xorshf96()*fact; + } + } +#pragma omp parallel for + for (size_t i = 0; i < n; i++) + { + for (size_t j = 0; j < k; j++) + { + size_t ind = j + k * i; + hB[ind].x = (float)xorshf96()*fact; + hB[ind].y = (float)xorshf96()*fact; + } + } + +} + +void +print_result +( + __COMPLEX8__ * hC, + size_t m, + size_t n, + size_t k +) +{ + printf("-------- %zu %zu\n", m, k); + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < k; j++) + { + size_t ind = j + k * i; + printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y); + } + printf("\n"); + } + printf("--------\n"); + +} + +int +run_test +( + size_t m, + size_t n, + size_t k, + unsigned rep, + float * res, + __HANDLE__ handle +) +{ + struct runtime * timer; + __MALLOC(timer, sizeof(*timer)); + __COMPLEX8__ *A; + __COMPLEX8__ *B; + __COMPLEX8__ *C; + __ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k))); + __ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k))); + __ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n))); + if (C == NULL) + { + fprintf(stderr, "C not allocated\n"); + exit(1); + } + + __COMPLEX8__ *hA; + __MALLOC( hA, sizeof(*hA) * (size_t)(m * k)); + __COMPLEX8__ *hB; + __MALLOC( hB, sizeof(*hB) * (size_t)(k * n)); + __COMPLEX8__ *hC; + __MALLOC( hC, sizeof(*hC) * (size_t)(n * m)); + + // timer_start(timer, "Prepare matrices"); + // timer_stop(timer); + + //timer_start(timer, "Memcopy"); + // timer_stop(timer); + + //timer_start(timer, "Create Handle"); + //if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE; + + //timer_stop(timer); + + + prepare_matrices(hA, hB, m, n, k); + for (unsigned r = 0; r < rep; r++) + { + __ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * k), __PREFIX(MemcpyHostToDevice))); + __ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice))); + float res_r = 0.f; + char mes[128]; + sprintf(mes, "m %zu n %zu k %zu run %d", m, n, k, r); + timer_start(timer, mes); + + multiplication + ( + handle, + A, + B, + C, + m, + n, + k + ); + + res_r += timer_stop(timer); + res[r] = res_r/1.f; + } + + printf("dimensions: %zu %zu %zu\t -- ", n, m , k); + printf("required size: %f GB\n", + ( + m * n * sizeof(*A) + + k * n * sizeof(*B) + + k * m * sizeof(*C) + )/1.e+9); + + //__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost))); + //print_result(hC, 1 << em, 1 << en, 1 << ek); + + // timer_start(timer, "Destroy Handle"); + //if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE; + // timer_stop(timer); + + __PREFIX(Free)(A); + __PREFIX(Free)(B); + __PREFIX(Free)(C); + free(hA); + free(hB); + free(hC); + free(timer); + return 0; +} + +int +main +( +) +{ + int rep = 10; + size_t m_min = 8; // 13 + size_t m_max = 11; // 16 + size_t n_min = 11; // 11 + size_t n_max = 19; // 19 + size_t k_min = 5; // 7 + size_t k_max = 11; // 11 + float * res; + + // cudaSetDevice(0); + __HANDLE__ handle; + + __CREATE_HANDLE(&handle); + + __MALLOC(res, sizeof(*res) * (size_t)( + (m_max - m_min + 1) * + (n_max - n_min + 1) * + (k_max - k_min + 1) * + rep)); + for (int em = m_min; em <= m_max; em++) + { + for (int en = n_min; en <= n_max; en++) + { + for (int ek = k_min; ek <= k_max; ek++) + { + run_test(1 << em, 1 << en , 1 << ek, rep, &res[0], handle); + } + } + } + if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE; + exit(0); + // store the results + /* + FILE * f; + char name[128]; + sprintf(name, "runtimes"); + f= fopen(name, "w"); + if (f == NULL) + { + fprintf(stderr, "Couldn't open %s\n", name); + } + for (int i = min_dim; i < max_dim; i++) + { + size_t dim = 1 << i; + fprintf(f, "%zu\t", dim); + } + fprintf(f, "\n"); + for (int r = 0; r < rep; r++) + { + for (int i = min_dim; i < max_dim; i++) + { + size_t pos = (i - min_dim) * rep + r; + fprintf(f, "%1.6f\t", res[pos]); + } + fprintf(f, "\n"); + } + fclose(f); + */ + return 0; +}