Skip to content
Snippets Groups Projects
Commit 0a36e850 authored by Henning Fehrmann's avatar Henning Fehrmann Committed by Henning Fehrmann
Browse files

removed some merge obstacles

parents 7a2b7a7d 19061ae5
No related branches found
No related tags found
No related merge requests found
......@@ -27,7 +27,7 @@ else
unknown_HW:
endif
all: blas fftw
all: blas fftw tensor_core
blas: ${OBJ_blas}
${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
......
/*
* =====================================================================================
*
* Filename: fftw.c
*
* Description: FFTW profiling
*
* Version: 1.0
* Created: 29.01.2021 10:55:14
* Revision: none
* Compiler: gcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#define __HIP_PLATFORM_HCC__
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>
#include "hardware_settings.h"
#include "profiler.h"
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
void
prepare_data
(
__COMPLEX8__ * hA,
size_t s
)
{
#pragma omp parallel for
for (size_t i = 0; i < s; i++)
{
hA[i].x = 1.0f;
hA[i].y = 0.;
}
}
int
run_test
(
size_t T,
size_t N,
unsigned rep
)
{
struct runtime * timer;
__MALLOC(timer, sizeof(*timer));
// Create HIP device buffer
__COMPLEX8__ *A;
__COMPLEX8__ *hB;
__MALLOC(hB, sizeof(*hB) * N * T);
__ASSERT(__PREFIX(Malloc)((void**)&A, sizeof(*A) * N * T));
// Initialize data
__COMPLEX8__ * hA;
__MALLOC(hA, sizeof(*hA) * N * T);
// Create FFT plan
__FFTW_PLAN plan;
size_t length = N;
char mes[128];
//sprintf(mes, "dim: %zu\tPlan generation." ,N);
//timer_start(timer, mes);
#ifdef ROC
rocfft_plan_create
(
&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_single,
1,
&length,
1,
NULL
);
#elif CUDA
int batch = T; // --- Number of batched executions
int rank = 1; // --- 1D FFTs
int na[] = { N }; // --- Size of the Fourier transform
int istride = 1, ostride = 1; // --- Distance between two successive input/output elements
int idist = N, odist = N; // --- Distance between batches
int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
cufftPlanMany
(
&plan,
rank,
na,
inembed,
istride,
idist,
onembed,
ostride,
odist,
CUFFT_C2C,
batch
);
#endif
prepare_data(hA, N * T);
// Copy data to device
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*hA) * N, __PREFIX(MemcpyHostToDevice)));
for (int r = 0 ; r < 1; r++)
{
// Execute plan
sprintf(mes, "T = %zu n = %zu\t round %d." ,T, N , r);
timer_start(timer, mes);
#ifdef ROC
rocfft_execute(plan, (void**) &A, NULL, NULL);
#elif CUDA
cufftExecC2C(plan, A, A, CUFFT_FORWARD);
#endif
__PREFIX(DeviceSynchronize)();
timer_stop(timer);
}
// Destroy plan
__DESTROY_PLAN(plan);
__ASSERT(__PREFIX(Memcpy)(hB, A, sizeof(*A) * N, __PREFIX(MemcpyDeviceToHost)));
for (size_t i = 0; i < N; i++)
{
printf("%g\t%g\n", hB[i].x, hB[i].y);
}
exit(0);
__ASSERT(__PREFIX(Free)(A));
free(hA);
free(hB);
free(timer);
return 0;
}
int
main
(
)
{
int rep = 1;
int t_min = 8;
int t_max = 11;
int n_min = 11;
int n_max = 19;
float * res = malloc(sizeof(*res) * (size_t)((n_max - n_min + 1) * rep));
if (res == NULL)
{
fprintf(stderr, "Couldn't allocate res\n");
exit(1);
}
for (int et = t_min; et <= t_max; et ++)
{
int t = 1 << et;
for (int en = n_min; en <= n_max; en++)
{
size_t n = 1 << en;
run_test(t, n, rep);
}
}
free(res);
}
/*
* =====================================================================================
*
* Filename: fp16_conversion.h
*
* Description: i
*
* Version: 1.0
* Created: 02/08/21 16:07:51
* Revision: none
* Compiler: gcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Company: AEI Hannover
* Copyright: GPL v2.0 Copyright (c) 2021, Henning Fehrmann
*
* =====================================================================================
*/
// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// This code modified from the public domain code here:
// https://gist.github.com/rygorous/2156668
// The URL above includes more robust conversion routines
// that handle Inf and NaN correctly.
//
// It is recommended to use the more robust versions in production code.
typedef unsigned uint;
union FP32
{
uint u;
float f;
struct
{
uint Mantissa : 23;
uint Exponent : 8;
uint Sign : 1;
};
};
union FP16
{
unsigned short u;
struct
{
uint Mantissa : 10;
uint Exponent : 5;
uint Sign : 1;
};
};
// Approximate solution. This is faster but converts some sNaNs to
// infinity and doesn't round correctly. Handle with care.
// Approximate solution. This is faster but converts some sNaNs to
// infinity and doesn't round correctly. Handle with care.
static half approx_float_to_half(float fl)
{
FP32 f32infty = { 255 << 23 };
FP32 f16max = { (127 + 16) << 23 };
FP32 magic = { 15 << 23 };
FP32 expinf = { (255 ^ 31) << 23 };
uint sign_mask = 0x80000000u;
FP16 o = { 0 };
FP32 f = *((FP32*)&fl);
uint sign = f.u & sign_mask;
f.u ^= sign;
if (!(f.f < f32infty.u)) // Inf or NaN
o.u = f.u ^ expinf.u;
else
{
if (f.f > f16max.f) f.f = f16max.f;
f.f *= magic.f;
}
o.u = f.u >> 13; // Take the mantissa bits
o.u |= sign >> 16;
return *((half*)&o);
}
// from half->float code - just for verification.
static float half_to_float(half hf)
{
FP16 h = *((FP16*)&hf);
static const FP32 magic = { 113 << 23 };
static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
uint exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
else if (exp == 0) // Zero/Denormal?
{
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}
o.u |= (h.u & 0x8000) << 16; // sign bit
return o.f;
}
#include <iostream>
#include <time.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
const char* cublasGetErrorString(cublasStatus_t status) {
switch(status) {
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
}
return "unknown error";
}
int main(void) {
// matrix A
size_t m = 1 << 10;
size_t n = 1 << 17;
size_t k = 1 << 9;
m = 1024;
n = 1024;
k = 512;
int rowA = m;
int colA = k;
// matrix B
int rowB = colA;
int colB = n;
// matrix C
int rowC = rowA;
int colC = colB;
thrust::device_vector<float> A(rowA * colA);
thrust::device_vector<float> B(rowB * colB);
thrust::device_vector<float> C(rowC * colC);
/*
for (size_t i = 0; i < rowA; i++){
for (size_t j = 0; j < colA; j++){
A[i * rowA + j] = i + j;
}
}
for (size_t i = 0; i < rowB; i++){
for (size_t j = 0; j < colB; j++){
B[i * rowA + j] = i + j;
}
}
*/
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasCreate failed. error is: " << cublasGetErrorString(status) << std::endl;;
}
struct timespec start;
struct timespec stop;
int alpha = 1;
int beta = 0;
float alphaf = 1.f;
float betaf = 0.f;
// A * B + C
/*
*/
//cublasSetMathMode(handle, CUBLAS_PEDANTIC_MATH);
//cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);
for (int r = 0; r < 10; r++)
{
clock_gettime(CLOCK_REALTIME , &start);
/*
status = cublasGemmEx
(
handle, CUBLAS_OP_N, CUBLAS_OP_N,
rowA, colB, colA,
&alpha, thrust::raw_pointer_cast(&A[0]),
CUDA_R_8I,
rowA,
thrust::raw_pointer_cast(&B[0]),
CUDA_R_8I,
colB,
&beta, thrust::raw_pointer_cast(&C[0]), CUDA_R_32I,
colB,
CUDA_R_32I, CUBLAS_GEMM_ALGO0
);
*/
status = cublasSgemmEx
(
handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowA,
colB,
colA,
&alphaf,
thrust::raw_pointer_cast(&A[0]),
CUDA_R_16F,
rowA,
thrust::raw_pointer_cast(&B[0]),
CUDA_R_16F,
colB,
&betaf,
thrust::raw_pointer_cast(&C[0]),
CUDA_R_32F,
colB
);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasGemmEx execution error is: " << cublasGetErrorString(status) << std::endl;
exit(0);
}
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME , &stop);
double res= (double)
(
stop.tv_sec - start.tv_sec
)*1000.
+
(double)
(
stop.tv_nsec - start.tv_nsec
)/1000000.
;
printf("hp %d %d %d %d %g [ms]\n",r, m, n, k, res);
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "shutdown error code is: " << cublasGetErrorString(status) << std::endl;
}
return 0;
}
#include <unistd.h>
#include <iostream>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "fp16_conversion.h"
#include <time.h>
using namespace std;
#define FP16MM
const char* cublasGetErrorString(cublasStatus_t status)
{
switch(status)
{
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "unknown error";
}
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
return result;
}
inline
cublasStatus_t checkCublas(cublasStatus_t result)
{
if (result != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cublasGetErrorString(result));
assert(result == CUBLAS_STATUS_SUCCESS);
}
return result;
}
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU
void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A)
{
for(int i = 0; i < nr_rows_A * nr_cols_A; i++){
//A[i] = (float)rand()/(float)(RAND_MAX/a);
A[i] = 0.1;
}
}
int main
(
int argc,
char ** argv
)
{
int repeats = 10;
cout << "\nrunning cublasHgemm test\n" << endl;
cublasStatus_t stat;
cublasHandle_t handle;
checkCublas(cublasCreate(&handle));
size_t m = 1 << 11;
size_t n = 1 << 17;
size_t k = 1 << 9;
float *h_A = (float *)malloc(sizeof(*h_A) * m * k);
float *h_B = (float *)malloc(sizeof(*h_B) * k * n);
float *h_C = (float *)malloc(sizeof(*h_C) * n * m);
CPU_fill_rand(h_A, m, k);
CPU_fill_rand(h_B, k, n);
CPU_fill_rand(h_C, n, m);
__half *d_A, *d_B, *d_C;
checkCuda(cudaMallocManaged(&d_A, m * k * sizeof(* d_A)));
checkCuda(cudaMallocManaged(&d_B, k * n * sizeof(* d_B)));
checkCuda(cudaMallocManaged(&d_C, m * n * sizeof(* d_C)));
for (int i = 0; i < m * k; i++)
{
d_A[i] = approx_float_to_half(h_A[i]);
}
for (int i = 0; i < k * n; i++)
{
d_B[i] = approx_float_to_half(h_B[i]);
}
for (int i = 0; i < n * m; i++)
{
d_C[i] = approx_float_to_half(h_C[i]);
}
const __half alf = approx_float_to_half(1.0);
const __half bet = approx_float_to_half(0.0);
const __half *alpha = &alf;
const __half *beta = &bet;
cudaEvent_t cstart, cstop;
cudaEventCreate(&cstart);
cudaEventCreate(&cstop);
float sum1 = 0.f;
float sum2 = 0.f;
struct timespec start;
struct timespec stop;
for(int rep = 0; rep < repeats; rep++)
{
cudaEventRecord(cstart, 0);
clock_gettime(CLOCK_REALTIME , &start);
stat = cublasHgemm
(
handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,
n,
k,
alpha,
d_A,
m,
d_B,
k,
beta,
d_C,
m
);
cudaEventRecord(cstop,0);
cudaEventSynchronize(cstop);
clock_gettime(CLOCK_REALTIME , &stop);
if(stat != CUBLAS_STATUS_SUCCESS)
{
cerr << "cublasSgemmBatched failed" << endl;
exit(1);
}
assert(!cudaGetLastError());
double res= (double)
(
stop.tv_sec - start.tv_sec
)*1000.
+
(double)
(
stop.tv_nsec - start.tv_nsec
)/1000000.
;
float elapsed;
cudaEventElapsedTime(&elapsed, cstart, cstop);
elapsed /= 1000.0f;
sum1 += res;
sum2 += elapsed;
cout << res << " <=> " << elapsed << endl;
}
cout << "float16; size " << m <<" "<< k << " "<< n << " average: " << sum1/repeats << " ms "<< sum2/repeats << " s "<< endl;
//Free GPU memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free CPU memory
free(h_A);
free(h_B);
free(h_C);
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment