diff --git a/tensor_core.c b/tensor_core.c
new file mode 100644
index 0000000000000000000000000000000000000000..6feebfc9d5fb4ae37c1be3397c67898cb5d859e5
--- /dev/null
+++ b/tensor_core.c
@@ -0,0 +1,335 @@
+/*
+ * =====================================================================================
+ *
+ *    Description:  BLAS Benchmark
+ *
+ *        Version:  1.0
+ *        Created:  27.01.2021 12:45:18
+ *       Revision:  none
+ *       Compiler:  hipc or nvcc
+ *
+ *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
+ *   Organization:  AEI Hannover
+ *        License:  GNU General Public License v2
+ *
+ * =====================================================================================
+ */
+
+#include "hardware_settings.h"
+#include "profiler.h"
+
+#include <stdlib.h>
+#include <math.h>
+#include <omp.h>
+#include <mma.h>
+#include <string.h>
+
+#define __MALLOC(P, size) P = malloc(size); \
+	if (P == NULL) \
+	{\
+		fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
+		exit(EXIT_FAILURE); \
+	}\
+
+void
+check_status
+(
+	cublasStatus_t status
+)
+{
+	switch (status)
+	{
+		case CUBLAS_STATUS_SUCCESS:
+			break;
+		case CUBLAS_STATUS_NOT_INITIALIZED:
+			printf("not initialized\n");
+			break;
+		case CUBLAS_STATUS_ALLOC_FAILED:
+			printf("CUBLAS_STATUS_ALLOC_FAILED\n");
+			break;
+		case CUBLAS_STATUS_INVALID_VALUE:
+			printf("CUBLAS_STATUS_INVALID_VALUE\n");
+			break;
+		case CUBLAS_STATUS_ARCH_MISMATCH:
+			printf("CUBLAS_STATUS_ARCH_MISMATCH\n");
+			break;
+		case CUBLAS_STATUS_MAPPING_ERROR:
+			printf("CUBLAS_STATUS_MAPPING_ERROR\n");
+			break;
+		case CUBLAS_STATUS_EXECUTION_FAILED:
+			printf("CUBLAS_STATUS_EXECUTION_FAILED\n");
+			break;
+		case CUBLAS_STATUS_INTERNAL_ERROR:
+			printf("CUBLAS_STATUS_INTERNAL_ERROR\n");
+			break;
+		case CUBLAS_STATUS_NOT_SUPPORTED:
+			printf("CUBLAS_STATUS_NOT_SUPPORTED\n");
+			break;
+		case CUBLAS_STATUS_LICENSE_ERROR:
+			printf("CUBLAS_STATUS_LICENSE_ERROR\n");
+			break;
+	}
+}
+
+
+
+void
+multiplication
+(
+	__HANDLE__ handle,
+	const __COMPLEX8__ *A,
+	const __COMPLEX8__ *B,
+	__COMPLEX8__ *C,
+	size_t m,
+	size_t n,
+	size_t k
+)
+{
+	__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
+	__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
+	const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
+	const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
+
+	cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+	check_status( cublasGemmEx(
+		handle,
+		transA,
+		transB,
+		m,
+		n,
+		k,
+		&alpha,
+		A,
+		CUDA_C_16F,
+		m,
+		B,
+		CUDA_C_16F,
+		n,
+		&beta,
+		C,
+		CUDA_C_16F,
+		m,
+		CUDA_C_32F,
+		algo
+	));
+	exit(0);
+	// cublasIcamax(handle,m * n, C, 1, &result);
+	cudaDeviceSynchronize();
+}
+
+void
+prepare_matrices
+(
+	__COMPLEX8__ * hA,
+	__COMPLEX8__ * hB,
+	size_t m,
+	size_t n,
+	size_t k
+)
+{
+	float fact = 1.f/(float)n/(float)x/(float)y/20.f;
+#pragma omp parallel for
+	for (size_t i = 0; i < m; i++)
+	{
+		for (size_t j = 0; j < k; j++)
+		{
+			size_t ind = j + k * i;
+			hA[ind].x = (float)xorshf96()*fact;
+			hA[ind].y = (float)xorshf96()*fact;
+		}
+	}
+#pragma omp parallel for
+	for (size_t i = 0; i < n; i++)
+	{
+		for (size_t j = 0; j < k; j++)
+		{
+			size_t ind = j + k * i;
+			hB[ind].x = (float)xorshf96()*fact;
+			hB[ind].y = (float)xorshf96()*fact;
+		}
+	}
+
+}
+
+void
+print_result
+(
+	__COMPLEX8__ * hC,
+	size_t m,
+	size_t n,
+	size_t k
+)
+{
+	printf("-------- %zu %zu\n", m, k);
+	for (size_t i = 0; i < m; i++)
+	{
+		for (size_t j = 0; j < k; j++)
+		{
+			size_t ind = j + k * i;
+			printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
+		}
+		printf("\n");
+	}
+	printf("--------\n");
+
+}
+
+int
+run_test
+(
+	size_t m,
+	size_t n,
+	size_t k,
+	unsigned rep,
+	float * res,
+	__HANDLE__ handle
+)
+{
+	struct runtime * timer;
+	__MALLOC(timer, sizeof(*timer));
+	__COMPLEX8__ *A;
+	__COMPLEX8__ *B;
+	__COMPLEX8__ *C;
+	__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
+	__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
+	__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
+	if (C == NULL)
+	{
+		fprintf(stderr, "C not allocated\n");
+		exit(1);
+	}
+
+	__COMPLEX8__ *hA;
+	__MALLOC( hA, sizeof(*hA) * (size_t)(m * k));
+	__COMPLEX8__ *hB;
+	__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
+	__COMPLEX8__ *hC;
+	__MALLOC( hC, sizeof(*hC) * (size_t)(n * m));
+
+	// timer_start(timer, "Prepare matrices");
+	// timer_stop(timer);
+
+	//timer_start(timer, "Memcopy");
+	// timer_stop(timer);
+
+	//timer_start(timer, "Create Handle");
+	//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
+
+	//timer_stop(timer);
+
+
+	prepare_matrices(hA, hB, m, n, k);
+	for (unsigned r = 0; r < rep; r++)
+	{
+		__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * k), __PREFIX(MemcpyHostToDevice)));
+		__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
+		float res_r = 0.f;
+		char mes[128];
+		sprintf(mes, "m %zu n %zu k %zu run %d", m, n, k, r);
+		timer_start(timer, mes);
+
+		multiplication
+		(
+			handle,
+			A,
+			B,
+			C,
+			m,
+			n,
+			k
+		);
+
+		res_r += timer_stop(timer);
+		res[r] = res_r/1.f;
+	}
+
+	printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
+	printf("required size: %f GB\n",
+		(
+			m * n * sizeof(*A)
+			+  k * n * sizeof(*B)
+			+  k * m * sizeof(*C)
+		)/1.e+9);
+
+	//__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
+	//print_result(hC, 1 << em, 1 << en, 1 << ek);
+
+	// timer_start(timer, "Destroy Handle");
+	//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
+	// timer_stop(timer);
+
+	__PREFIX(Free)(A);
+	__PREFIX(Free)(B);
+	__PREFIX(Free)(C);
+	free(hA);
+	free(hB);
+	free(hC);
+	free(timer);
+	return 0;
+}
+
+int
+main
+(
+)
+{
+	int rep = 10;
+	size_t m_min = 8;  // 13
+	size_t m_max = 11; // 16
+	size_t n_min = 11; // 11
+	size_t n_max = 19; // 19
+	size_t k_min = 5;  // 7
+	size_t k_max = 11; // 11
+	float *  res;
+
+	// cudaSetDevice(0);
+	__HANDLE__ handle;
+
+	__CREATE_HANDLE(&handle);
+
+	__MALLOC(res, sizeof(*res) * (size_t)(
+	      (m_max - m_min + 1) *
+	      (n_max - n_min + 1) *
+	      (k_max - k_min + 1) *
+	      rep));
+	for (int em = m_min; em <= m_max; em++)
+	{
+		for (int en = n_min; en <= n_max; en++)
+		{
+			for (int ek = k_min; ek <= k_max; ek++)
+			{
+				run_test(1 << em, 1 << en , 1 << ek, rep, &res[0], handle);
+			}
+		}
+	}
+	if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
+	exit(0);
+	// store the results
+	/*
+	FILE * f;
+	char name[128];
+	sprintf(name, "runtimes");
+	f= fopen(name, "w");
+	if (f == NULL)
+	{
+		fprintf(stderr, "Couldn't open %s\n", name);
+	}
+	for (int i = min_dim; i < max_dim; i++)
+	{
+		size_t dim = 1 << i;
+		fprintf(f, "%zu\t", dim);
+	}
+	fprintf(f, "\n");
+	for (int r = 0; r < rep; r++)
+	{
+		for (int i = min_dim; i < max_dim; i++)
+		{
+			size_t pos = (i - min_dim) * rep + r;
+			fprintf(f, "%1.6f\t", res[pos]);
+		}
+		fprintf(f, "\n");
+	}
+	fclose(f);
+	*/
+	return 0;
+}