diff --git a/Makefile b/Makefile index c53832e93c1a65cf384b56c3e9733f8bb63d6c5b..7d1ba027370b4f9bc876c38b2e2e9ecb17606d50 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ else unknown_HW: endif -all: blas fftw +all: blas fftw tensor_core blas: ${OBJ_blas} ${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS} diff --git a/fftw_test.c b/fftw_test.c new file mode 100644 index 0000000000000000000000000000000000000000..a3f013f41746467f6945a8575d1801f40409d006 --- /dev/null +++ b/fftw_test.c @@ -0,0 +1,181 @@ +/* + * ===================================================================================== + * + * Filename: fftw.c + * + * Description: FFTW profiling + * + * Version: 1.0 + * Created: 29.01.2021 10:55:14 + * Revision: none + * Compiler: gcc + * + * Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de + * Organization: AEI Hannover + * License: GNU General Public License v2 + * + * ===================================================================================== + */ + + +#define __HIP_PLATFORM_HCC__ + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> +#include <omp.h> + +#include "hardware_settings.h" +#include "profiler.h" + + +#define __MALLOC(P, size) P = malloc(size); \ + if (P == NULL) \ + {\ + fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \ + exit(EXIT_FAILURE); \ + }\ + +void +prepare_data +( + __COMPLEX8__ * hA, + size_t s +) +{ +#pragma omp parallel for + for (size_t i = 0; i < s; i++) + { + hA[i].x = 1.0f; + hA[i].y = 0.; + } +} + +int +run_test +( + size_t T, + size_t N, + unsigned rep +) +{ + + struct runtime * timer; + __MALLOC(timer, sizeof(*timer)); + // Create HIP device buffer + __COMPLEX8__ *A; + __COMPLEX8__ *hB; + __MALLOC(hB, sizeof(*hB) * N * T); + + __ASSERT(__PREFIX(Malloc)((void**)&A, sizeof(*A) * N * T)); + + // Initialize data + __COMPLEX8__ * hA; + __MALLOC(hA, sizeof(*hA) * N * T); + + // Create FFT plan + __FFTW_PLAN plan; + size_t length = N; + + char mes[128]; + //sprintf(mes, "dim: %zu\tPlan generation." ,N); + //timer_start(timer, mes); + +#ifdef ROC + rocfft_plan_create + ( + &plan, + rocfft_placement_inplace, + rocfft_transform_type_complex_forward, + rocfft_precision_single, + 1, + &length, + 1, + NULL + ); +#elif CUDA + int batch = T; // --- Number of batched executions + int rank = 1; // --- 1D FFTs + int na[] = { N }; // --- Size of the Fourier transform + int istride = 1, ostride = 1; // --- Distance between two successive input/output elements + int idist = N, odist = N; // --- Distance between batches + int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms) + int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms) + cufftPlanMany + ( + &plan, + rank, + na, + inembed, + istride, + idist, + onembed, + ostride, + odist, + CUFFT_C2C, + batch + ); +#endif + prepare_data(hA, N * T); + // Copy data to device + __ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*hA) * N, __PREFIX(MemcpyHostToDevice))); + for (int r = 0 ; r < 1; r++) + { + // Execute plan + sprintf(mes, "T = %zu n = %zu\t round %d." ,T, N , r); + timer_start(timer, mes); +#ifdef ROC + rocfft_execute(plan, (void**) &A, NULL, NULL); +#elif CUDA + cufftExecC2C(plan, A, A, CUFFT_FORWARD); +#endif + __PREFIX(DeviceSynchronize)(); + timer_stop(timer); + + } + // Destroy plan + __DESTROY_PLAN(plan); + + __ASSERT(__PREFIX(Memcpy)(hB, A, sizeof(*A) * N, __PREFIX(MemcpyDeviceToHost))); + for (size_t i = 0; i < N; i++) + { + printf("%g\t%g\n", hB[i].x, hB[i].y); + } + exit(0); + __ASSERT(__PREFIX(Free)(A)); + free(hA); + free(hB); + + free(timer); + return 0; +} +int +main +( +) +{ + int rep = 1; + int t_min = 8; + int t_max = 11; + int n_min = 11; + int n_max = 19; + + float * res = malloc(sizeof(*res) * (size_t)((n_max - n_min + 1) * rep)); + if (res == NULL) + { + fprintf(stderr, "Couldn't allocate res\n"); + exit(1); + } + + for (int et = t_min; et <= t_max; et ++) + { + int t = 1 << et; + for (int en = n_min; en <= n_max; en++) + { + size_t n = 1 << en; + run_test(t, n, rep); + } + } + free(res); +} diff --git a/fp16_conversion.h b/fp16_conversion.h new file mode 100644 index 0000000000000000000000000000000000000000..58f04b9b33d696a83e44115953eef68755905953 --- /dev/null +++ b/fp16_conversion.h @@ -0,0 +1,133 @@ +/* + * ===================================================================================== + * + * Filename: fp16_conversion.h + * + * Description: i + * + * Version: 1.0 + * Created: 02/08/21 16:07:51 + * Revision: none + * Compiler: gcc + * + * Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de + * Company: AEI Hannover + * Copyright: GPL v2.0 Copyright (c) 2021, Henning Fehrmann + * + * ===================================================================================== + */ + +// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This code modified from the public domain code here: +// https://gist.github.com/rygorous/2156668 +// The URL above includes more robust conversion routines +// that handle Inf and NaN correctly. +// +// It is recommended to use the more robust versions in production code. + +typedef unsigned uint; + +union FP32 +{ + uint u; + float f; + struct + { + uint Mantissa : 23; + uint Exponent : 8; + uint Sign : 1; + }; +}; + +union FP16 +{ + unsigned short u; + struct + { + uint Mantissa : 10; + uint Exponent : 5; + uint Sign : 1; + }; +}; + +// Approximate solution. This is faster but converts some sNaNs to +// infinity and doesn't round correctly. Handle with care. +// Approximate solution. This is faster but converts some sNaNs to +// infinity and doesn't round correctly. Handle with care. +static half approx_float_to_half(float fl) +{ + FP32 f32infty = { 255 << 23 }; + FP32 f16max = { (127 + 16) << 23 }; + FP32 magic = { 15 << 23 }; + FP32 expinf = { (255 ^ 31) << 23 }; + uint sign_mask = 0x80000000u; + FP16 o = { 0 }; + + FP32 f = *((FP32*)&fl); + + uint sign = f.u & sign_mask; + f.u ^= sign; + + if (!(f.f < f32infty.u)) // Inf or NaN + o.u = f.u ^ expinf.u; + else + { + if (f.f > f16max.f) f.f = f16max.f; + f.f *= magic.f; + } + + o.u = f.u >> 13; // Take the mantissa bits + o.u |= sign >> 16; + return *((half*)&o); +} + +// from half->float code - just for verification. +static float half_to_float(half hf) +{ + FP16 h = *((FP16*)&hf); + + static const FP32 magic = { 113 << 23 }; + static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + + o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits + uint exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) // Zero/Denormal? + { + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.u & 0x8000) << 16; // sign bit + return o.f; +} diff --git a/tensor2.cu b/tensor2.cu new file mode 100644 index 0000000000000000000000000000000000000000..4e8556f8bdbe00748f2ad5bd27377aa0951ed32d --- /dev/null +++ b/tensor2.cu @@ -0,0 +1,142 @@ +#include <iostream> + +#include <time.h> +#include <cublas_v2.h> +#include <thrust/device_vector.h> + +const char* cublasGetErrorString(cublasStatus_t status) { + switch(status) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + } + return "unknown error"; +} + +int main(void) { + // matrix A + + size_t m = 1 << 10; + size_t n = 1 << 17; + size_t k = 1 << 9; + m = 1024; + n = 1024; + k = 512; + int rowA = m; + int colA = k; + // matrix B + int rowB = colA; + int colB = n; + // matrix C + int rowC = rowA; + int colC = colB; + + thrust::device_vector<float> A(rowA * colA); + thrust::device_vector<float> B(rowB * colB); + thrust::device_vector<float> C(rowC * colC); + /* + for (size_t i = 0; i < rowA; i++){ + for (size_t j = 0; j < colA; j++){ + A[i * rowA + j] = i + j; + } + } + + for (size_t i = 0; i < rowB; i++){ + for (size_t j = 0; j < colB; j++){ + B[i * rowA + j] = i + j; + } + } + */ + cublasHandle_t handle; + cublasStatus_t status = cublasCreate(&handle); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "cublasCreate failed. error is: " << cublasGetErrorString(status) << std::endl;; + } + + struct timespec start; + struct timespec stop; + int alpha = 1; + int beta = 0; + float alphaf = 1.f; + float betaf = 0.f; + // A * B + C + /* + + */ + + //cublasSetMathMode(handle, CUBLAS_PEDANTIC_MATH); + //cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH); + cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH); + + for (int r = 0; r < 10; r++) + { + clock_gettime(CLOCK_REALTIME , &start); + /* + status = cublasGemmEx + ( + handle, CUBLAS_OP_N, CUBLAS_OP_N, + rowA, colB, colA, + &alpha, thrust::raw_pointer_cast(&A[0]), + CUDA_R_8I, + rowA, + thrust::raw_pointer_cast(&B[0]), + CUDA_R_8I, + colB, + &beta, thrust::raw_pointer_cast(&C[0]), CUDA_R_32I, + colB, + CUDA_R_32I, CUBLAS_GEMM_ALGO0 + ); + */ + status = cublasSgemmEx + ( + handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + rowA, + colB, + colA, + &alphaf, + thrust::raw_pointer_cast(&A[0]), + CUDA_R_16F, + rowA, + thrust::raw_pointer_cast(&B[0]), + CUDA_R_16F, + colB, + &betaf, + thrust::raw_pointer_cast(&C[0]), + CUDA_R_32F, + colB + ); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "cublasGemmEx execution error is: " << cublasGetErrorString(status) << std::endl; + exit(0); + } + + cudaDeviceSynchronize(); + clock_gettime(CLOCK_REALTIME , &stop); + double res= (double) + ( + stop.tv_sec - start.tv_sec + )*1000. + + + (double) + ( + stop.tv_nsec - start.tv_nsec + )/1000000. + ; + printf("hp %d %d %d %d %g [ms]\n",r, m, n, k, res); + } + + status = cublasDestroy(handle); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "shutdown error code is: " << cublasGetErrorString(status) << std::endl; + } + + return 0; +} diff --git a/tensor_core.cu b/tensor_core.cu new file mode 100644 index 0000000000000000000000000000000000000000..530d3f8d7ca64f83c5245dd0d561297973e20f35 --- /dev/null +++ b/tensor_core.cu @@ -0,0 +1,174 @@ +#include <unistd.h> +#include <iostream> +#include <stdlib.h> +#include <assert.h> +#include <cuda_runtime.h> +#include <cublas_v2.h> +#include "fp16_conversion.h" +#include <time.h> + +using namespace std; + +#define FP16MM + +const char* cublasGetErrorString(cublasStatus_t status) +{ + switch(status) + { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + return "unknown error"; +} + +// Convenience function for checking CUDA runtime API results +// can be wrapped around any runtime API call. No-op in release builds. +inline +cudaError_t checkCuda(cudaError_t result) +{ + if (result != cudaSuccess) { + fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); + assert(result == cudaSuccess); + } + return result; +} + +inline +cublasStatus_t checkCublas(cublasStatus_t result) +{ + if (result != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "CUDA Runtime Error: %s\n", cublasGetErrorString(result)); + assert(result == CUBLAS_STATUS_SUCCESS); + } + return result; +} + +// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU +void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) +{ + + for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + //A[i] = (float)rand()/(float)(RAND_MAX/a); + A[i] = 0.1; + } +} + +int main +( + int argc, + char ** argv +) +{ + + + int repeats = 10; + cout << "\nrunning cublasHgemm test\n" << endl; + + cublasStatus_t stat; + cublasHandle_t handle; + + checkCublas(cublasCreate(&handle)); + size_t m = 1 << 11; + size_t n = 1 << 17; + size_t k = 1 << 9; + float *h_A = (float *)malloc(sizeof(*h_A) * m * k); + float *h_B = (float *)malloc(sizeof(*h_B) * k * n); + float *h_C = (float *)malloc(sizeof(*h_C) * n * m); + CPU_fill_rand(h_A, m, k); + CPU_fill_rand(h_B, k, n); + CPU_fill_rand(h_C, n, m); + + __half *d_A, *d_B, *d_C; + checkCuda(cudaMallocManaged(&d_A, m * k * sizeof(* d_A))); + checkCuda(cudaMallocManaged(&d_B, k * n * sizeof(* d_B))); + checkCuda(cudaMallocManaged(&d_C, m * n * sizeof(* d_C))); + + for (int i = 0; i < m * k; i++) + { + d_A[i] = approx_float_to_half(h_A[i]); + } + for (int i = 0; i < k * n; i++) + { + d_B[i] = approx_float_to_half(h_B[i]); + } + for (int i = 0; i < n * m; i++) + { + d_C[i] = approx_float_to_half(h_C[i]); + } + + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha = &alf; + const __half *beta = &bet; + cudaEvent_t cstart, cstop; + cudaEventCreate(&cstart); + cudaEventCreate(&cstop); + float sum1 = 0.f; + float sum2 = 0.f; + struct timespec start; + struct timespec stop; + for(int rep = 0; rep < repeats; rep++) + { + cudaEventRecord(cstart, 0); + clock_gettime(CLOCK_REALTIME , &start); + + stat = cublasHgemm + ( + handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + k, + alpha, + d_A, + m, + d_B, + k, + beta, + d_C, + m + ); + cudaEventRecord(cstop,0); + cudaEventSynchronize(cstop); + clock_gettime(CLOCK_REALTIME , &stop); + if(stat != CUBLAS_STATUS_SUCCESS) + { + cerr << "cublasSgemmBatched failed" << endl; + exit(1); + } + assert(!cudaGetLastError()); + double res= (double) + ( + stop.tv_sec - start.tv_sec + )*1000. + + + (double) + ( + stop.tv_nsec - start.tv_nsec + )/1000000. + ; + float elapsed; + cudaEventElapsedTime(&elapsed, cstart, cstop); + elapsed /= 1000.0f; + sum1 += res; + sum2 += elapsed; + cout << res << " <=> " << elapsed << endl; + } + cout << "float16; size " << m <<" "<< k << " "<< n << " average: " << sum1/repeats << " ms "<< sum2/repeats << " s "<< endl; + //Free GPU memory + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + // Free CPU memory + free(h_A); + free(h_B); + free(h_C); + return 0; +}