diff --git a/Makefile b/Makefile
index ace3c81b6d270652589bead06c3a1e7bd15b4347..77c82306d54372b2405ec98a8d89e8e000bcd537 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,11 @@
 
-GPU=NVIDIA
 GPU=AMD
+GPU=NVIDIA
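+# NOTE: make keeps the last assignment, so GPU=NVIDIA is the value in effect.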
 
 OBJ_blas = blas.o
 OBJ_fftw = fftw.o
+OBJ_tensor_core = tensor_core.o
 
 
 ifeq ($(GPU), AMD)
@@ -29,6 +31,11 @@ all: blas fftw
 blas: ${OBJ_blas}
 	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
 
+tensor_core: ${OBJ_tensor_core}
+	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
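+# Note: tensor_core is not in the `all` target; build it with `make tensor_core`.
+# It links with the same cuBLAS flags (LDFLAGS_blas) as the blas binary.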
+
 fftw: ${OBJ_fftw}
 	${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}
 
diff --git a/tensor_core.c b/tensor_core.c
index 6feebfc9d5fb4ae37c1be3397c67898cb5d859e5..b815bedc10526297fdb59ec5da07701f3efae56a 100644
--- a/tensor_core.c
+++ b/tensor_core.c
@@ -77,9 +77,9 @@ void
 multiplication
 (
 	__HANDLE__ handle,
-	const __COMPLEX8__ *A,
-	const __COMPLEX8__ *B,
-	__COMPLEX8__ *C,
+	const float *A,
+	const float *B,
+	float *C,
 	size_t m,
 	size_t n,
 	size_t k
@@ -87,8 +87,28 @@ multiplication
 {
 	__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
 	__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
-	const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
-	const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
+	const float alpha = 1.f;
+	const float beta = 0.f;
+
+
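+	/* Plain FP32 GEMM on standard cores, presumably kept as a reference
+	 * result for the tensor-core cublasGemmEx call below. */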
+	check_status( cublasSgemm(
+		handle,
+		transA,
+		transB,
+		m,	/* rows of op(A) and of C */
+		n,	/* columns of op(B) and of C */
+		k,	/* shared inner dimension */
+		&alpha,
+		A,
+		m,	/* lda: A is m x k, not transposed */
+		B,
+		n,	/* ldb: B is stored n x k; op(B) = B^H is k x n */
+		&beta,
+		C,
+		m	/* ldc: C is m x n */
+	));
 
 	cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
 	check_status( cublasGemmEx(
@@ -112,6 +132,9 @@ multiplication
 		CUDA_C_32F,
 		algo
 	));
+
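+	/* NOTE: cublasGemmEx above still passes CUDA_C_32F (complex) descriptors
+	 * although A, B, and C now hold plain floats; exit(0) ends the run here. */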
 	exit(0);
 	// cublasIcamax(handle,m * n, C, 1, &result);
 	cudaDeviceSynchronize();
@@ -120,8 +143,8 @@ multiplication
 void
 prepare_matrices
 (
-	__COMPLEX8__ * hA,
-	__COMPLEX8__ * hB,
+	float * hA,
+	float * hB,
 	size_t m,
 	size_t n,
 	size_t k
@@ -134,9 +157,10 @@ prepare_matrices
 		for (size_t j = 0; j < k; j++)
 		{
 			size_t ind = j + k * i;
-			hA[ind].x = (float)xorshf96()*fact;
-			hA[ind].y = (float)xorshf96()*fact;
+			//hA[ind] = (float)xorshf96()*fact;
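+			/* identity test pattern (assumed intent): ones on the diagonal, zeros elsewhere */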
+			hA[ind] = (i == j) ? 1.f : 0.f;
 		}
 	}
 #pragma omp parallel for
 	for (size_t i = 0; i < n; i++)
@@ -144,9 +168,10 @@ prepare_matrices
 		for (size_t j = 0; j < k; j++)
 		{
 			size_t ind = j + k * i;
-			hB[ind].x = (float)xorshf96()*fact;
-			hB[ind].y = (float)xorshf96()*fact;
+			//hB[ind] = (float)xorshf96()*fact;
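+			/* identity test pattern (assumed intent), mirroring hA above */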
+			hB[ind] = (i == j) ? 1.f : 0.f;
 		}
 	}
 
 }
@@ -154,7 +179,7 @@ prepare_matrices
 void
 print_result
 (
-	__COMPLEX8__ * hC,
+	float * hC,
 	size_t m,
 	size_t n,
 	size_t k
@@ -166,7 +191,7 @@ print_result
 		for (size_t j = 0; j < k; j++)
 		{
 			size_t ind = j + k * i;
-			printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
+			printf("%1.2f\t", hC[ind]);
 		}
 		printf("\n");
 	}
@@ -187,9 +212,9 @@ run_test
 {
 	struct runtime * timer;
 	__MALLOC(timer, sizeof(*timer));
-	__COMPLEX8__ *A;
-	__COMPLEX8__ *B;
-	__COMPLEX8__ *C;
+	float *A;
+	float *B;
+	float *C;
 	__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
 	__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
 	__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
@@ -199,11 +224,11 @@ run_test
 		exit(1);
 	}
 
-	__COMPLEX8__ *hA;
+	float *hA;
 	__MALLOC( hA, sizeof(*hA) * (size_t)(m * k));
-	__COMPLEX8__ *hB;
+	float *hB;
 	__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
-	__COMPLEX8__ *hC;
+	float *hC;
 	__MALLOC( hC, sizeof(*hC) * (size_t)(n * m));
 
 	// timer_start(timer, "Prepare matrices");
@@ -287,6 +312,8 @@ main
 
 	__CREATE_HANDLE(&handle);
 
+	/* Allow cuBLAS to use Tensor Core math for subsequent calls on this handle. */
+	check_status( cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH) );
 	__MALLOC(res, sizeof(*res) * (size_t)(
 	      (m_max - m_min + 1) *
 	      (n_max - n_min + 1) *