Commit 0a36e850 authored by Henning Fehrmann's avatar Henning Fehrmann Committed by Henning Fehrmann
Browse files

removed some merge obstacles

parents 7a2b7a7d 19061ae5
......@@ -27,7 +27,7 @@ else
unknown_HW:
endif
all: blas fftw
all: blas fftw tensor_core
blas: ${OBJ_blas}
${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
......
/*
* =====================================================================================
*
* Filename: fftw.c
*
* Description: FFTW profiling
*
* Version: 1.0
* Created: 29.01.2021 10:55:14
* Revision: none
* Compiler: gcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#define __HIP_PLATFORM_HCC__
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>
#include "hardware_settings.h"
#include "profiler.h"
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
void
prepare_data
(
__COMPLEX8__ * hA,
size_t s
)
{
#pragma omp parallel for
for (size_t i = 0; i < s; i++)
{
hA[i].x = 1.0f;
hA[i].y = 0.;
}
}
int
run_test
(
size_t T,
size_t N,
unsigned rep
)
{
struct runtime * timer;
__MALLOC(timer, sizeof(*timer));
// Create HIP device buffer
__COMPLEX8__ *A;
__COMPLEX8__ *hB;
__MALLOC(hB, sizeof(*hB) * N * T);
__ASSERT(__PREFIX(Malloc)((void**)&A, sizeof(*A) * N * T));
// Initialize data
__COMPLEX8__ * hA;
__MALLOC(hA, sizeof(*hA) * N * T);
// Create FFT plan
__FFTW_PLAN plan;
size_t length = N;
char mes[128];
//sprintf(mes, "dim: %zu\tPlan generation." ,N);
//timer_start(timer, mes);
#ifdef ROC
rocfft_plan_create
(
&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_single,
1,
&length,
1,
NULL
);
#elif CUDA
int batch = T; // --- Number of batched executions
int rank = 1; // --- 1D FFTs
int na[] = { N }; // --- Size of the Fourier transform
int istride = 1, ostride = 1; // --- Distance between two successive input/output elements
int idist = N, odist = N; // --- Distance between batches
int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
cufftPlanMany
(
&plan,
rank,
na,
inembed,
istride,
idist,
onembed,
ostride,
odist,
CUFFT_C2C,
batch
);
#endif
prepare_data(hA, N * T);
// Copy data to device
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*hA) * N, __PREFIX(MemcpyHostToDevice)));
for (int r = 0 ; r < 1; r++)
{
// Execute plan
sprintf(mes, "T = %zu n = %zu\t round %d." ,T, N , r);
timer_start(timer, mes);
#ifdef ROC
rocfft_execute(plan, (void**) &A, NULL, NULL);
#elif CUDA
cufftExecC2C(plan, A, A, CUFFT_FORWARD);
#endif
__PREFIX(DeviceSynchronize)();
timer_stop(timer);
}
// Destroy plan
__DESTROY_PLAN(plan);
__ASSERT(__PREFIX(Memcpy)(hB, A, sizeof(*A) * N, __PREFIX(MemcpyDeviceToHost)));
for (size_t i = 0; i < N; i++)
{
printf("%g\t%g\n", hB[i].x, hB[i].y);
}
exit(0);
__ASSERT(__PREFIX(Free)(A));
free(hA);
free(hB);
free(timer);
return 0;
}
int
main
(
)
{
int rep = 1;
int t_min = 8;
int t_max = 11;
int n_min = 11;
int n_max = 19;
float * res = malloc(sizeof(*res) * (size_t)((n_max - n_min + 1) * rep));
if (res == NULL)
{
fprintf(stderr, "Couldn't allocate res\n");
exit(1);
}
for (int et = t_min; et <= t_max; et ++)
{
int t = 1 << et;
for (int en = n_min; en <= n_max; en++)
{
size_t n = 1 << en;
run_test(t, n, rep);
}
}
free(res);
}
/*
* =====================================================================================
*
* Filename: fp16_conversion.h
*
* Description: i
*
* Version: 1.0
* Created: 02/08/21 16:07:51
* Revision: none
* Compiler: gcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Company: AEI Hannover
* Copyright: GPL v2.0 Copyright (c) 2021, Henning Fehrmann
*
* =====================================================================================
*/
// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// This code modified from the public domain code here:
// https://gist.github.com/rygorous/2156668
// The URL above includes more robust conversion routines
// that handle Inf and NaN correctly.
//
// It is recommended to use the more robust versions in production code.
typedef unsigned uint;
union FP32
{
uint u;
float f;
struct
{
uint Mantissa : 23;
uint Exponent : 8;
uint Sign : 1;
};
};
union FP16
{
unsigned short u;
struct
{
uint Mantissa : 10;
uint Exponent : 5;
uint Sign : 1;
};
};
// Approximate solution. This is faster but converts some sNaNs to
// infinity and doesn't round correctly. Handle with care.
// Approximate solution. This is faster but converts some sNaNs to
// infinity and doesn't round correctly. Handle with care.
static half approx_float_to_half(float fl)
{
FP32 f32infty = { 255 << 23 };
FP32 f16max = { (127 + 16) << 23 };
FP32 magic = { 15 << 23 };
FP32 expinf = { (255 ^ 31) << 23 };
uint sign_mask = 0x80000000u;
FP16 o = { 0 };
FP32 f = *((FP32*)&fl);
uint sign = f.u & sign_mask;
f.u ^= sign;
if (!(f.f < f32infty.u)) // Inf or NaN
o.u = f.u ^ expinf.u;
else
{
if (f.f > f16max.f) f.f = f16max.f;
f.f *= magic.f;
}
o.u = f.u >> 13; // Take the mantissa bits
o.u |= sign >> 16;
return *((half*)&o);
}
// from half->float code - just for verification.
static float half_to_float(half hf)
{
FP16 h = *((FP16*)&hf);
static const FP32 magic = { 113 << 23 };
static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
uint exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
else if (exp == 0) // Zero/Denormal?
{
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}
o.u |= (h.u & 0x8000) << 16; // sign bit
return o.f;
}
#include <iostream>
#include <time.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
const char* cublasGetErrorString(cublasStatus_t status) {
switch(status) {
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
}
return "unknown error";
}
int main(void) {
// matrix A
size_t m = 1 << 10;
size_t n = 1 << 17;
size_t k = 1 << 9;
m = 1024;
n = 1024;
k = 512;
int rowA = m;
int colA = k;
// matrix B
int rowB = colA;
int colB = n;
// matrix C
int rowC = rowA;
int colC = colB;
thrust::device_vector<float> A(rowA * colA);
thrust::device_vector<float> B(rowB * colB);
thrust::device_vector<float> C(rowC * colC);
/*
for (size_t i = 0; i < rowA; i++){
for (size_t j = 0; j < colA; j++){
A[i * rowA + j] = i + j;
}
}
for (size_t i = 0; i < rowB; i++){
for (size_t j = 0; j < colB; j++){
B[i * rowA + j] = i + j;
}
}
*/
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasCreate failed. error is: " << cublasGetErrorString(status) << std::endl;;
}
struct timespec start;
struct timespec stop;
int alpha = 1;
int beta = 0;
float alphaf = 1.f;
float betaf = 0.f;
// A * B + C
/*
*/
//cublasSetMathMode(handle, CUBLAS_PEDANTIC_MATH);
//cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);
for (int r = 0; r < 10; r++)
{
clock_gettime(CLOCK_REALTIME , &start);
/*
status = cublasGemmEx
(
handle, CUBLAS_OP_N, CUBLAS_OP_N,
rowA, colB, colA,
&alpha, thrust::raw_pointer_cast(&A[0]),
CUDA_R_8I,
rowA,
thrust::raw_pointer_cast(&B[0]),
CUDA_R_8I,
colB,
&beta, thrust::raw_pointer_cast(&C[0]), CUDA_R_32I,
colB,
CUDA_R_32I, CUBLAS_GEMM_ALGO0
);
*/
status = cublasSgemmEx
(
handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowA,
colB,
colA,
&alphaf,
thrust::raw_pointer_cast(&A[0]),
CUDA_R_16F,
rowA,
thrust::raw_pointer_cast(&B[0]),
CUDA_R_16F,
colB,
&betaf,
thrust::raw_pointer_cast(&C[0]),
CUDA_R_32F,
colB
);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasGemmEx execution error is: " << cublasGetErrorString(status) << std::endl;
exit(0);
}
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME , &stop);
double res= (double)
(
stop.tv_sec - start.tv_sec
)*1000.
+
(double)
(
stop.tv_nsec - start.tv_nsec
)/1000000.
;
printf("hp %d %d %d %d %g [ms]\n",r, m, n, k, res);
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "shutdown error code is: " << cublasGetErrorString(status) << std::endl;
}
return 0;
}
#include <unistd.h>
#include <iostream>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "fp16_conversion.h"
#include <time.h>
using namespace std;
#define FP16MM
const char* cublasGetErrorString(cublasStatus_t status)
{
switch(status)
{
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "unknown error";
}
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
return result;
}
inline
cublasStatus_t checkCublas(cublasStatus_t result)
{
if (result != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cublasGetErrorString(result));
assert(result == CUBLAS_STATUS_SUCCESS);
}
return result;
}
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU
void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A)
{
for(int i = 0; i < nr_rows_A * nr_cols_A; i++){
//A[i] = (float)rand()/(float)(RAND_MAX/a);
A[i] = 0.1;
}
}
int main
(
int argc,
char ** argv
)
{
int repeats = 10;
cout << "\nrunning cublasHgemm test\n" << endl;
cublasStatus_t stat;
cublasHandle_t handle;
checkCublas(cublasCreate(&handle));
size_t m = 1 << 11;
size_t n = 1 << 17;
size_t k = 1 << 9;
float *h_A = (float *)malloc(sizeof(*h_A) * m * k);
float *h_B = (float *)malloc(sizeof(*h_B) * k * n);
float *h_C = (float *)malloc(sizeof(*h_C) * n * m);
CPU_fill_rand(h_A, m, k);
CPU_fill_rand(h_B, k, n);
CPU_fill_rand(h_C, n, m);
__half *d_A, *d_B, *d_C;
checkCuda(cudaMallocManaged(&d_A, m * k * sizeof(* d_A)));
checkCuda(cudaMallocManaged(&d_B, k * n * sizeof(* d_B)));
checkCuda(cudaMallocManaged(&d_C, m * n * sizeof(* d_C)));
for (int i = 0; i < m * k; i++)
{
d_A[i] = approx_float_to_half(h_A[i]);
}
for (int i = 0; i < k * n; i++)
{
d_B[i] = approx_float_to_half(h_B[i]);
}
for (int i = 0; i < n * m; i++)
{
d_C[i] = approx_float_to_half(h_C[i]);
}
const __half alf = approx_float_to_half(1.0);
const __half bet = approx_float_to_half(0.0);
const __half *alpha = &alf;
const __half *beta = &bet;
cudaEvent_t cstart, cstop;
cudaEventCreate(&cstart);
cudaEventCreate(&cstop);
float sum1 = 0.f;
float sum2 = 0.f;
struct timespec start;
struct timespec stop;
for(int rep = 0; rep < repeats; rep++)
{
cudaEventRecord(cstart, 0);
clock_gettime(CLOCK_REALTIME , &start);
stat = cublasHgemm
(
handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,
n,
k,
alpha,
d_A,
m,
d_B,
k,
beta,
d_C,
m
);
cudaEventRecord(cstop,0);
cudaEventSynchronize(cstop);
clock_gettime(CLOCK_REALTIME , &stop);
if(stat != CUBLAS_STATUS_SUCCESS)
{
cerr << "cublasSgemmBatched failed" << endl;
exit(1);
}
assert(!cudaGetLastError());
double res= (double)
(
stop.tv_sec - start.tv_sec
)*1000.
+
(double)
(
stop.tv_nsec - start.tv_nsec
)/1000000.
;
float elapsed;
cudaEventElapsedTime(&elapsed, cstart, cstop);
elapsed /= 1000.0f;
sum1 += res;
sum2 += elapsed;
cout << res << " <=> " << elapsed << endl;
}