diff --git a/Makefile b/Makefile
index c53832e93c1a65cf384b56c3e9733f8bb63d6c5b..7d1ba027370b4f9bc876c38b2e2e9ecb17606d50 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ else
   unknown_HW:
 endif
 
-all: blas fftw
+all: blas fftw tensor_core
 
 blas: ${OBJ_blas}
 	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
diff --git a/fftw_test.c b/fftw_test.c
new file mode 100644
index 0000000000000000000000000000000000000000..a3f013f41746467f6945a8575d1801f40409d006
--- /dev/null
+++ b/fftw_test.c
@@ -0,0 +1,181 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  fftw_test.c
+ *
+ *    Description:  FFTW profiling
+ *
+ *        Version:  1.0
+ *        Created:  29.01.2021 10:55:14
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
+ *   Organization:  AEI Hannover
+ *        License:  GNU General Public License v2
+ *
+ * =====================================================================================
+ */
+
+
+#define __HIP_PLATFORM_HCC__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <omp.h>
+
+#include "hardware_settings.h"
+#include "profiler.h"
+
+
+#define __MALLOC(P, size) P = malloc(size); \
+	if (P == NULL) \
+	{ \
+		fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
+		exit(EXIT_FAILURE); \
+	}
+
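+/*
+ * Usage sketch: __MALLOC expands to an allocation plus an abort-on-failure
+ * check, so call sites need no explicit NULL test, e.g.
+ *
+ *     float *buf;
+ *     __MALLOC(buf, sizeof(*buf) * n);
+ */
+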
+void
+prepare_data
+(
+	__COMPLEX8__ * hA,
+	size_t s
+)
+{
+#pragma omp parallel for
+	for (size_t i = 0; i < s; i++)
+	{
+		hA[i].x = 1.0f;
+		hA[i].y = 0.f;
+	}
+}
+
+int
+run_test
+(
+	size_t T,
+	size_t N,
+	unsigned rep
+)
+{
+	struct runtime * timer;
+	__MALLOC(timer, sizeof(*timer));
+	// Create device buffer and host output buffer
+	__COMPLEX8__ *A;
+	__COMPLEX8__ *hB;
+	__MALLOC(hB, sizeof(*hB) * N * T);
+
+	__ASSERT(__PREFIX(Malloc)((void**)&A, sizeof(*A) * N * T));
+
+	// Initialize host input data
+	__COMPLEX8__ * hA;
+	__MALLOC(hA, sizeof(*hA) * N * T);
+
+	// Create FFT plan
+	__FFTW_PLAN plan;
+	size_t length = N;
+
+	char mes[128];
+	//sprintf(mes, "dim: %zu\tPlan generation." ,N);
+	//timer_start(timer, mes);
+
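+	/*
+	 * Both backends build the same plan: a single-precision, in-place,
+	 * complex-to-complex forward 1D transform of length N, batched T times
+	 */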
+#ifdef ROC
+	rocfft_plan_create
+	(
+		&plan,
+		rocfft_placement_inplace,
+		rocfft_transform_type_complex_forward,
+		rocfft_precision_single,
+		1,		// --- Dimensions
+		&length,	// --- Size of the Fourier transform
+		T,		// --- Number of batched transforms, matching the CUDA path
+		NULL		// --- Default plan description
+	);
+#elif CUDA
+	int batch = T;				// --- Number of batched executions
+	int rank = 1;				// --- 1D FFTs
+	int na[] = { (int)N };			// --- Size of the Fourier transform
+	int istride = 1, ostride = 1;		// --- Distance between two successive input/output elements
+	int idist = N, odist = N;		// --- Distance between batches
+	int inembed[] = { 0 };			// --- Input size with pitch (ignored for 1D transforms)
+	int onembed[] = { 0 };			// --- Output size with pitch (ignored for 1D transforms)
+	cufftPlanMany
+	(
+		&plan,
+		rank,
+		na,
+		inembed,
+		istride,
+		idist,
+		onembed,
+		ostride,
+		odist,
+		CUFFT_C2C,
+		batch
+	);
+#endif
+	prepare_data(hA, N * T);
+	// Copy all T batches to the device
+	__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*hA) * N * T, __PREFIX(MemcpyHostToDevice)));
+	for (unsigned r = 0; r < rep; r++)
+	{
+		// Execute plan
+		sprintf(mes, "T = %zu n = %zu\tround %u.", T, N, r);
+		timer_start(timer, mes);
+#ifdef ROC
+		rocfft_execute(plan, (void**) &A, NULL, NULL);
+#elif CUDA
+		cufftExecC2C(plan, A, A, CUFFT_FORWARD);
+#endif
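+		// the FFT launch is asynchronous; synchronize so timer_stop
+		// measures actual device execution time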
+		__PREFIX(DeviceSynchronize)();
+		timer_stop(timer);
+
+	}
+	// Destroy plan
+	__DESTROY_PLAN(plan);
+
+	// Copy the first transform back for verification: an all-ones input
+	// yields a DC spike of magnitude N in bin 0
+	__ASSERT(__PREFIX(Memcpy)(hB, A, sizeof(*A) * N, __PREFIX(MemcpyDeviceToHost)));
+	//for (size_t i = 0; i < N; i++)
+	//{
+	//	printf("%g\t%g\n", hB[i].x, hB[i].y);
+	//}
+	__ASSERT(__PREFIX(Free)(A));
+	free(hA);
+	free(hB);
+
+	free(timer);
+	return 0;
+}
+int
+main
+(
+)
+{
+	int rep = 1;
+	int t_min = 8;
+	int t_max = 11;
+	int n_min = 11;
+	int n_max = 19;
+
+	float * res = malloc(sizeof(*res) * (size_t)((n_max - n_min + 1) * rep));
+	if (res == NULL)
+	{
+		fprintf(stderr, "Couldn't allocate res\n");
+		exit(1);
+	}
+
+	for (int et = t_min; et <= t_max; et++)
+	{
+		int t = 1 << et;
+		for (int en = n_min; en <= n_max; en++)
+		{
+			size_t n = 1 << en;
+			run_test(t, n, rep);
+		}
+	}
+	free(res);
+}
diff --git a/fp16_conversion.h b/fp16_conversion.h
new file mode 100644
index 0000000000000000000000000000000000000000..58f04b9b33d696a83e44115953eef68755905953
--- /dev/null
+++ b/fp16_conversion.h
@@ -0,0 +1,133 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  fp16_conversion.h
+ *
+ *    Description:  float <-> half (FP16) conversion helpers
+ *
+ *        Version:  1.0
+ *        Created:  02/08/21 16:07:51
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
+ *        Company:  AEI Hannover
+ *        Copyright:  GPL v2.0 Copyright (c) 2021, Henning Fehrmann
+ *
+ * =====================================================================================
+ */
+
+// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This code modified from the public domain code here: 
+// https://gist.github.com/rygorous/2156668
+// The URL above includes more robust conversion routines
+// that handle Inf and NaN correctly. 
+// 
+// It is recommended to use the more robust versions in production code.
+
+typedef unsigned uint;
+
+union FP32
+{
+    uint u;
+    float f;
+    struct
+    {
+        uint Mantissa : 23;
+        uint Exponent : 8;
+        uint Sign : 1;
+    };
+};
+
+union FP16
+{
+    unsigned short u;
+    struct
+    {
+        uint Mantissa : 10;
+        uint Exponent : 5;
+        uint Sign : 1;
+    };
+};
+
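+// Example bit layouts: 1.0f is 0x3F800000 in FP32 (sign 0, biased exponent
+// 127, mantissa 0) and 0x3C00 in FP16 (sign 0, biased exponent 15,
+// mantissa 0); the conversions below re-bias and shift these fields.
+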
+// Approximate solution. This is faster but converts some sNaNs to
+// infinity and doesn't round correctly. Handle with care.
+static half approx_float_to_half(float fl)
+{
+    FP32 f32infty = { 255 << 23 };
+    FP32 f16max = { (127 + 16) << 23 };
+    FP32 magic = { 15 << 23 };
+    FP32 expinf = { (255 ^ 31) << 23 };
+    uint sign_mask = 0x80000000u;
+    FP16 o = { 0 };
+
+    FP32 f = *((FP32*)&fl);
+
+    uint sign = f.u & sign_mask;
+    f.u ^= sign;
+
+    if (!(f.f < f32infty.u)) // Inf or NaN
+        o.u = f.u ^ expinf.u;
+    else
+    {
+        if (f.f > f16max.f) f.f = f16max.f;
+        f.f *= magic.f;
+    }
+
+    o.u = f.u >> 13; // Take the mantissa bits
+    o.u |= sign >> 16;
+    return *((half*)&o);
+}
+
+// from half->float code - just for verification.
+static float half_to_float(half hf)
+{
+    FP16 h = *((FP16*)&hf);
+
+    static const FP32 magic = { 113 << 23 };
+    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
+    FP32 o;
+
+    o.u = (h.u & 0x7fff) << 13;     // exponent/mantissa bits
+    uint exp = shifted_exp & o.u;   // just the exponent
+    o.u += (127 - 15) << 23;        // exponent adjust
+
+    // handle exponent special cases
+    if (exp == shifted_exp) // Inf/NaN?
+        o.u += (128 - 16) << 23;    // extra exp adjust
+    else if (exp == 0) // Zero/Denormal?
+    {
+        o.u += 1 << 23;             // extra exp adjust
+        o.f -= magic.f;             // renormalize
+    }
+
+    o.u |= (h.u & 0x8000) << 16;    // sign bit
+    return o.f;
+}
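+
+// Minimal usage sketch (assumes a translation unit where the `half` type is
+// available, e.g. from <cuda_fp16.h>): round-trip a value through FP16.
+//
+//     half h = approx_float_to_half(0.1f);
+//     float back = half_to_float(h);   // ~0.0999756f: FP16 keeps ~3 digits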
diff --git a/tensor2.cu b/tensor2.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4e8556f8bdbe00748f2ad5bd27377aa0951ed32d
--- /dev/null
+++ b/tensor2.cu
@@ -0,0 +1,142 @@
+#include <iostream>
+
+#include <time.h>
+#include <cublas_v2.h>
+#include <thrust/device_vector.h>
+
+const char* cublasGetErrorString(cublasStatus_t status) {
+  switch(status) {
+    case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+    case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+    case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+  }
+  return "unknown error";
+}
+
+int main(void) {
+	// matrix dimensions: C(m x n) = A(m x k) * B(k x n)
+	size_t m = 1 << 10;	// 1024
+	size_t n = 1 << 10;	// 1024
+	size_t k = 1 << 9;	// 512
+	// matrix A
+	int rowA = m;
+	int colA = k;
+	// matrix B
+	int rowB = colA;
+	int colB = n;
+	// matrix C
+	int rowC = rowA;
+	int colC = colB;
+
+	thrust::device_vector<float> A(rowA * colA);
+	thrust::device_vector<float> B(rowB * colB);
+	thrust::device_vector<float> C(rowC * colC);
+	/*
+  for (size_t i = 0; i < rowA; i++){
+    for (size_t j = 0; j < colA; j++){
+      A[i * rowA + j] = i + j;
+    }
+  }
+
+  for (size_t i = 0; i < rowB; i++){
+    for (size_t j = 0; j < colB; j++){
+      B[i * rowA + j] = i + j;
+    }
+  }
+	*/
+	cublasHandle_t handle;
+	cublasStatus_t status = cublasCreate(&handle);
+	if (status != CUBLAS_STATUS_SUCCESS) {
+		std::cerr << "cublasCreate failed. error is: " << cublasGetErrorString(status) << std::endl;
+		return 1;
+	}
+
+	struct timespec start;
+	struct timespec stop;
+	int alpha = 1;		// integer scalars for the (commented-out) int8 GemmEx path
+	int beta = 0;
+	float alphaf = 1.f;	// float scalars: C = alphaf * A * B + betaf * C
+	float betaf = 0.f;
+
+	//cublasSetMathMode(handle, CUBLAS_PEDANTIC_MATH);
+	//cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
+	cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);
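+	// CUBLAS_TF32_TENSOR_OP_MATH lets cuBLAS round FP32 operands to TF32
+	// (10-bit mantissa) so FP32 GEMMs can execute on tensor cores on
+	// Ampere and newer GPUs; the DEFAULT and PEDANTIC modes above keep
+	// full FP32 arithmetic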
+
+	for (int r = 0; r < 10; r++)
+	{
+		clock_gettime(CLOCK_REALTIME , &start);
+		/*	
+		status = cublasGemmEx
+		(
+			handle, CUBLAS_OP_N, CUBLAS_OP_N,
+			rowA, colB, colA,
+			&alpha, thrust::raw_pointer_cast(&A[0]),
+			CUDA_R_8I,
+			rowA,
+			thrust::raw_pointer_cast(&B[0]),
+			CUDA_R_8I,
+			colB,
+			&beta, thrust::raw_pointer_cast(&C[0]), CUDA_R_32I,
+			colB,
+			CUDA_R_32I, CUBLAS_GEMM_ALGO0
+		);
+		*/
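+		// cublasSgemmEx computes in FP32 while allowing mixed input and
+		// output types; both operands are untransposed and column-major,
+		// so each leading dimension is the row count of its matrix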
+		status = cublasSgemmEx
+		(
+			handle,
+			CUBLAS_OP_N,
+			CUBLAS_OP_N,
+			rowA,		// m
+			colB,		// n
+			colA,		// k
+			&alphaf,
+			thrust::raw_pointer_cast(&A[0]),
+			CUDA_R_32F,	// A holds 32-bit floats; TF32 math mode still engages tensor cores
+			rowA,		// lda = rows of A
+			thrust::raw_pointer_cast(&B[0]),
+			CUDA_R_32F,
+			rowB,		// ldb = rows of B
+			&betaf,
+			thrust::raw_pointer_cast(&C[0]),
+			CUDA_R_32F,
+			rowC		// ldc = rows of C
+		);
+		if (status != CUBLAS_STATUS_SUCCESS) {
+			std::cerr << "cublasSgemmEx execution error is: " << cublasGetErrorString(status) << std::endl;
+			exit(1);
+		}
+
+		cudaDeviceSynchronize();
+		clock_gettime(CLOCK_REALTIME, &stop);
+		double res = (double)(stop.tv_sec - start.tv_sec) * 1000.
+			+ (double)(stop.tv_nsec - start.tv_nsec) / 1000000.;
+		printf("hp %d %zu %zu %zu %g [ms]\n", r, m, n, k, res);
+	}
+
+	status = cublasDestroy(handle);
+	if (status != CUBLAS_STATUS_SUCCESS) {
+		std::cerr << "shutdown error code is: " << cublasGetErrorString(status) << std::endl;
+	}
+
+	return 0;
+}
diff --git a/tensor_core.cu b/tensor_core.cu
new file mode 100644
index 0000000000000000000000000000000000000000..530d3f8d7ca64f83c5245dd0d561297973e20f35
--- /dev/null
+++ b/tensor_core.cu
@@ -0,0 +1,174 @@
+#include <unistd.h>
+#include <iostream>
+#include <stdlib.h>
+#include <assert.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include "fp16_conversion.h"
+#include <time.h>
+
+using namespace std;
+
+#define FP16MM
+
+const char* cublasGetErrorString(cublasStatus_t status)
+{
+    switch(status)
+    {
+        case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+        case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+        case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+        case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; 
+        case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; 
+        case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; 
+        case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; 
+    }
+    return "unknown error";
+}
+
+// Convenience function for checking CUDA runtime API results;
+// can be wrapped around any runtime API call. Only the assert is
+// compiled out in release (NDEBUG) builds.
+inline
+cudaError_t checkCuda(cudaError_t result)
+{
+  if (result != cudaSuccess) {
+    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
+    assert(result == cudaSuccess);
+  }
+  return result;
+}
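+
+// Usage sketch: checkCuda(cudaMalloc(&ptr, bytes)); the result is passed
+// through, so the wrapper drops into existing call sites unchanged.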
+
+inline
+cublasStatus_t checkCublas(cublasStatus_t result)
+{
+  if (result != CUBLAS_STATUS_SUCCESS) {
+    fprintf(stderr, "CUDA Runtime Error: %s\n", cublasGetErrorString(result));
+    assert(result == CUBLAS_STATUS_SUCCESS);
+  }
+  return result;
+}
+
+// Fill the array A(nr_rows_A, nr_cols_A) on the CPU; the random fill is
+// commented out, so every entry gets the constant 0.1
+void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A)
+{
+	for(int i = 0; i < nr_rows_A * nr_cols_A; i++){
+		//A[i] = (float)rand()/(float)(RAND_MAX/a);
+		A[i] = 0.1f;
+	}
+}
+
+int main
+(
+	int argc,
+	char ** argv
+)
+{
+	int repeats = 10;
+	cout << "\nrunning cublasHgemm test\n" << endl;
+
+	cublasStatus_t stat;
+	cublasHandle_t handle;
+
+	checkCublas(cublasCreate(&handle));
+	size_t m = 1 << 11;
+	size_t n = 1 << 17;
+	size_t k = 1 << 9;
+	float *h_A = (float *)malloc(sizeof(*h_A) * m * k);
+	float *h_B = (float *)malloc(sizeof(*h_B) * k * n);
+	float *h_C = (float *)malloc(sizeof(*h_C) * n * m);
+	CPU_fill_rand(h_A, m, k);
+	CPU_fill_rand(h_B, k, n);
+	CPU_fill_rand(h_C, n, m);
+
+	__half *d_A, *d_B, *d_C;
+	checkCuda(cudaMallocManaged(&d_A, m * k * sizeof(* d_A)));
+	checkCuda(cudaMallocManaged(&d_B, k * n * sizeof(* d_B)));
+	checkCuda(cudaMallocManaged(&d_C, m * n * sizeof(* d_C)));
+
+	for (int i = 0; i < m * k; i++)
+	{
+		d_A[i] = approx_float_to_half(h_A[i]);
+	}
+	for (int i = 0; i < k * n; i++)
+	{
+		d_B[i] = approx_float_to_half(h_B[i]);
+	}
+	for (int i = 0; i < n * m; i++)
+	{
+		d_C[i] = approx_float_to_half(h_C[i]);
+	}
+
+	const __half alf = approx_float_to_half(1.0);
+	const __half bet = approx_float_to_half(0.0);
+	const __half *alpha = &alf;
+	const __half *beta = &bet;
+	cudaEvent_t cstart, cstop;
+	cudaEventCreate(&cstart);
+	cudaEventCreate(&cstop);
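+	// the cudaEvent pair times GPU execution; clock_gettime records host
+	// wall-clock time around the same region, printed side by side below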
+	float sum1 = 0.f;
+	float sum2 = 0.f;
+	struct timespec start;
+	struct timespec stop;
+	for(int rep = 0; rep < repeats; rep++)
+	{
+		cudaEventRecord(cstart, 0);
+		clock_gettime(CLOCK_REALTIME , &start);
+
+		stat = cublasHgemm
+		(
+			handle,
+			CUBLAS_OP_N,
+			CUBLAS_OP_N,
+			m,
+			n,
+			k,
+			alpha,
+			d_A,
+			m,
+			d_B,
+			k,
+			beta,
+			d_C,
+			m
+		);
+		cudaEventRecord(cstop,0);
+		cudaEventSynchronize(cstop);
+		clock_gettime(CLOCK_REALTIME , &stop);
+		if(stat != CUBLAS_STATUS_SUCCESS)
+		{
+			cerr << "cublasHgemm failed" << endl;
+			exit(1);
+		}
+		assert(!cudaGetLastError());
+		double res = (double)(stop.tv_sec - start.tv_sec) * 1000.
+			+ (double)(stop.tv_nsec - start.tv_nsec) / 1000000.;
+		float elapsed;
+		cudaEventElapsedTime(&elapsed, cstart, cstop);	// in milliseconds
+		sum1 += res;
+		sum2 += elapsed;
+		cout << res << " ms (wall) <=> " << elapsed << " ms (events)" << endl;
+	}
+	cout << "float16; size " << m << " " << k << " " << n << " average: " << sum1/repeats << " ms " << sum2/repeats << " ms" << endl;
+	cudaEventDestroy(cstart);
+	cudaEventDestroy(cstop);
+	checkCublas(cublasDestroy(handle));
+	// Free GPU memory
+	cudaFree(d_A);
+	cudaFree(d_B);
+	cudaFree(d_C);
+	// Free CPU memory
+	free(h_A);
+	free(h_B);
+	free(h_C);
+	return 0;
+}