diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7ced09e7a5f6e9553a025486cdd6fb6b4999a35a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,33 @@
+
+GPU=NVIDIA
+GPU=AMD
+
+OBJ = blas.o
+
+
+ifeq ($(GPU), AMD)
+  LDFLAGS = -L/opt/rocm/lib -lhipblas -lrocblas -fopenmp
+  CFLAGS = -g -Wall -O3 -fopenmp -I/opt/rocm/include -I/opt/rocm/hip/include -DROC
+  CC = hipcc
+else ifeq ($(GPU), NVIDIA)
+  CC = nvcc
+  LDFLAGS =  -lcublas -lm -lgomp
+  INCLUDE= -I/usr/lib/x86_64-linux-gnu/openmpi/include/
+  CFLAGS = ${INCLUDE} --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
+  CUDAFLAGS = --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
+else
+  unknown_HW:
+endif
+
+
+all: ${OBJ}
+	${CC} -o blas ${OBJ} ${LDFLAGS} ${CUDAFLAGS}
+
+%.o: %.c ${HEADER}
+	${CC} ${CFLAGS} -c $<
+
+clean:
+	rm *.o
+
+unknown_HW:
+	@echo "hardware not detected"
diff --git a/cuda_NVidia.c b/blas.c
similarity index 70%
rename from cuda_NVidia.c
rename to blas.c
index 24eb55fccf96455b3c0cbfe719c0f921942c4719..522545de7485b1bd38f6f965f6a5b415a94bb305 100644
--- a/cuda_NVidia.c
+++ b/blas.c
@@ -6,7 +6,7 @@
  *        Version:  1.0
  *        Created:  27.01.2021 12:45:18
  *       Revision:  none
- *       Compiler:  nvcc
+ *       Compiler:  hipc
  *
  *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
  *   Organization:  AEI Hannover
@@ -15,18 +15,14 @@
  * =====================================================================================
  */
 
+#include "hardware_settings.h"
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
-#include <assert.h>
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include <cublas_v2.h>
 #include <time.h>
 #include <omp.h>
 
-#define __ASSERT(x) (assert((x)==cudaSuccess))
 
 
 size_t m = 10000;
@@ -35,12 +31,20 @@ size_t k = 10000;
 
 static unsigned long x=123456789, y=362436069, z=521288629;
 
+#define __MALLOC(P, size) P =  malloc(size); \
+	if (P == NULL) \
+	{\
+		fprintf(stderr, "Allocation if failed at line %d in %s\n", __LINE__, __FILE__); \
+		exit(EXIT_FAILURE); \
+	}\
+
 unsigned long
 xorshf96
 (
 	void
 )
 {
+	// NOT thread save
 	unsigned long t;
 	x ^= x << 16;
 	x ^= x >> 5;
@@ -102,22 +106,22 @@ timer_stop
 void
 multiplication
 (
-	cublasHandle_t handle,
-	const cuComplex *A,
-	const cuComplex *B,
-	cuComplex *C
+	__HANDLE__ handle,
+	const __COMPLEX8__ *A,
+	const __COMPLEX8__ *B,
+	__COMPLEX8__ *C
 )
 {
-	cublasOperation_t transA = CUBLAS_OP_N;
-	cublasOperation_t transB = CUBLAS_OP_C;
-	const cuComplex alpha = {.x = 1.f, .y = 0.f};
-	const cuComplex beta = {.x = 0.f, .y = 0.f};
+	__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
+	__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
+	const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
+	const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
 
 	int lda = n;
 	int ldb = n;
 	int ldc = k;
 
-	cublasCgemm
+	__CGMEM__
 	(
 		handle,
 		transA,
@@ -139,8 +143,8 @@ multiplication
 void
 prepare_matrices
 (
-	cuComplex * hA,
-	cuComplex * hB
+	__COMPLEX8__ * hA,
+	__COMPLEX8__ * hB
 )
 {
 	float fact = 1.f/(float)n/(float)x/(float)y/20.f;
@@ -170,7 +174,7 @@ prepare_matrices
 void
 print_result
 (
-	cuComplex * hC
+	__COMPLEX8__ * hC
 )
 {
 	printf("-------- %zu %zu\n", m, k);
@@ -198,45 +202,27 @@ run_test
 	m = dim;
 	n = dim;
 	k = dim;
-	struct runtime * timer = malloc(sizeof(*timer));
-
-	cuComplex *A;
-	cuComplex *B;
-	cuComplex *C;
-	__ASSERT(cudaMalloc((void **)&A, sizeof(*A) * (size_t)(m * n)));
-	if (A == NULL)
-	{
-		fprintf(stderr, "A not allocated\n");
-		exit(1);
-	}
-	__ASSERT(cudaMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)));
-	if (B == NULL)
-	{
-		fprintf(stderr, "B not allocated\n");
-		exit(1);
-	}
-	__ASSERT(cudaMalloc((void **)&C, sizeof(*C) * (size_t)(m * k)));
+	struct runtime * timer;
+	__MALLOC(timer, sizeof(*timer));
+
+	__COMPLEX8__ *A;
+	__COMPLEX8__ *B;
+	__COMPLEX8__ *C;
+	__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n)));
+	__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
+	__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k)));
 	if (C == NULL)
 	{
 		fprintf(stderr, "C not allocated\n");
 		exit(1);
 	}
 
-
-	cuComplex *hA = malloc(sizeof(*hA) * (size_t)(m * n));
-	if (hA == NULL)
-	{
-		fprintf(stderr, "hA not allocated\n");
-		exit(1);
-	}
-	cuComplex *hB = malloc(sizeof(*hB) * (size_t)(k * n));
-	if (hB == NULL)
-	{
-		fprintf(stderr, "hB not allocated\n");
-		exit(1);
-	}
-
-	cuComplex *hC = malloc(sizeof(*hC) * (size_t)(m * k));
+	__COMPLEX8__ *hA;
+	__MALLOC( hA, sizeof(*hA) * (size_t)(m * n));
+	__COMPLEX8__ *hB;
+	__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
+	__COMPLEX8__ *hC;
+	__MALLOC( hC, sizeof(*hC) * (size_t)(m * k));
 	if (hC == NULL)
 	{
 		fprintf(stderr, "hC not allocated\n");
@@ -248,14 +234,16 @@ run_test
 	// timer_stop(timer);
 
 	//timer_start(timer, "Memcopy");
-	__ASSERT(cudaMemcpy(A, hA, sizeof(*A) * (size_t)(m * n), cudaMemcpyHostToDevice));
-	__ASSERT(cudaMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), cudaMemcpyHostToDevice));
+	__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
+	__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
 	// timer_stop(timer);
 
-	cudaSetDevice(0);
-	cublasHandle_t handle;
+	// cudaSetDevice(0);
+	__HANDLE__ handle;
 	//timer_start(timer, "Create Handle");
-	cublasCreate(&handle);
+	//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
+	__CREATE_HANDLE(&handle);
+
 	//timer_stop(timer);
 
 	for (unsigned r = 0; r < rep; r++)
@@ -294,16 +282,17 @@ run_test
 			+  k * m * sizeof(*C)
 		)/1.e+9);
 
-	__ASSERT(cudaMemcpy(hC, C, sizeof(*hC) * (size_t)(k * m), cudaMemcpyDeviceToHost));
+	__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
 	//print_result(hC);
 
 	// timer_start(timer, "Destroy Handle");
-	if(cublasDestroy(handle) != cudaSuccess) return EXIT_FAILURE;
+	//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
+	if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
 	// timer_stop(timer);
 
-	cudaFree(A);
-	cudaFree(B);
-	cudaFree(C);
+	__PREFIX(Free)(A);
+	__PREFIX(Free)(B);
+	__PREFIX(Free)(C);
 	free(hA);
 	free(hB);
 	free(hC);
@@ -320,12 +309,8 @@ main
 	int min_dim = 1;
 	int max_dim = 14;
 
-	float *  res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
-	if (res == NULL)
-	{
-		fprintf(stderr, "Couldn't allocate res\n");
-		exit(1);
-	}
+	float *  res;
+	__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
 	for (int i = min_dim; i < max_dim; i++)
 	{
 		size_t dim = 1 << i;
diff --git a/hardware_settings.h b/hardware_settings.h
new file mode 100644
index 0000000000000000000000000000000000000000..b311022009800856d4709c84f001df9663760a3e
--- /dev/null
+++ b/hardware_settings.h
@@ -0,0 +1,52 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  hardware_settings.h
+ *
+ *    Description:  
+ *
+ *        Version:  1.0
+ *        Created:  28.01.2021 16:15:56
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
+ *   Organization:  AEI Hannover
+ *      Copyright:  Copyright (c) 2021, Henning Fehrmann
+ *
+ * =====================================================================================
+ */
+
+#ifdef ROC
+#define __HIP_PLATFORM_HCC__
+#include <rocblas.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_vector_types.h>
+#define __ASSERT(x) (assert((x)==hipSuccess))
+#define __HANDLE__ rocblas_handle
+#define __COMPLEX8__ rocblas_float_complex
+#define __BLAS_OPERATION__ rocblas_operation
+#define __NO_TRANSFORM__ rocblas_operation_none
+#define __CT_TRANSFORM__ rocblas_operation_conjugate_transpose
+#define __CGMEM__ rocblas_cgemm
+#define __PREFIX(c) hip##c
+#define __CREATE_HANDLE(h)  rocblas_create_handle(h)
+#define __DESTROY_HANDLE(h) rocblas_destroy_handle(h)
+#endif
+
+#ifdef CUDA
+#include <assert.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+#define __ASSERT(x) (assert((x)==cudaSuccess))
+#define __HANDLE__ cublasHandle_t
+#define __COMPLEX8__ cuComplex
+#define __BLAS_OPERATION__ cublasOperation_t
+#define __NO_TRANSFORM__ CUBLAS_OP_N
+#define __CT_TRANSFORM__ CUBLAS_OP_C
+#define __CGMEM__ cublasCgemm
+#define __PREFIX(c) cuda##c
+#define __CREATE_HANDLE(h)  cublasCreate(h)
+#define __DESTROY_HANDLE(h) cublasDestroy(h)
+#endif
diff --git a/rocmblas_AMD.c b/rocmblas_AMD.c
deleted file mode 100644
index f8cc59d6256d322ef5a7fe29c80a811302d83e12..0000000000000000000000000000000000000000
--- a/rocmblas_AMD.c
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- * =====================================================================================
- *
- *    Description:  BLAS Benchmark
- *
- *        Version:  1.0
- *        Created:  27.01.2021 12:45:18
- *       Revision:  none
- *       Compiler:  hipc
- *
- *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
- *   Organization:  AEI Hannover
- *        License:  GNU General Public License v2
- *
- * =====================================================================================
- */
-
-
-#define __HIP_PLATFORM_HCC__
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <rocblas.h>
-#include <hip/hip_runtime.h>
-#include <hip/hip_vector_types.h>
-#include <time.h>
-#include <omp.h>
-
-#define __ASSERT(x) (assert((x)==hipSuccess))
-
-
-size_t m = 10000;
-size_t n = 10000;
-size_t k = 10000;
-
-static unsigned long x=123456789, y=362436069, z=521288629;
-
-unsigned long
-xorshf96
-(
-	void
-)
-{
-	// NOT thread save
-	unsigned long t;
-	x ^= x << 16;
-	x ^= x >> 5;
-	x ^= x << 1;
-
-	t = x;
-	x = y;
-	y = z;
-	z = t ^ x ^ y;
-	return z;
-}
-
-struct runtime
-{
-	struct timespec start;
-	struct timespec stop;
-	char tag[128];
-};
-
-
-void
-timer_start
-(
-	struct runtime * timer,
-	char tag[128]
-)
-{
-	struct timespec start;
-	sprintf(timer->tag,"%s", tag);
-	clock_gettime(CLOCK_REALTIME , &start);
-	timer->start = start;
-//	printf("-------->  start timer: %s\n", timer->tag);
-}
-
-double
-timer_stop
-(
-	struct runtime * timer
-)
-{
-	struct timespec stop;
-	clock_gettime(CLOCK_REALTIME , &stop);
-	timer->stop = stop;
-	double res= (double)
-		(
-			(timer->stop).tv_sec - (timer->start).tv_sec
-		)*1000.
-		+
-		(double)
-		(
-			(timer->stop).tv_nsec - (timer->start).tv_nsec
-		)/1000000.
-		;
-	// printf("-------->  stop timer %s: %g ms\n", timer->tag, res );
-	return res;
-}
-
-
-void
-multiplication
-(
-	rocblas_handle handle,
-	const rocblas_float_complex *A,
-	const rocblas_float_complex *B,
-	rocblas_float_complex *C
-)
-{
-	rocblas_operation transA = rocblas_operation_none;
-	rocblas_operation transB = rocblas_operation_conjugate_transpose;
-	const rocblas_float_complex alpha = {.x = 1.f, .y = 0.f};
-	const rocblas_float_complex beta = {.x = 0.f, .y = 0.f};
-
-	rocblas_int lda = n;
-	rocblas_int ldb = n;
-	rocblas_int ldc = k;
-
-	rocblas_cgemm
-	(
-		handle,
-		transA,
-		transB,
-		m,
-		n,
-		k,
-		&alpha,
-		A,
-		lda,
-		B,
-		ldb,
-		&beta,
-		C,
-		ldc
-	);
-}
-
-void
-prepare_matrices
-(
-	rocblas_float_complex * hA,
-	rocblas_float_complex * hB
-)
-{
-	float fact = 1.f/(float)n/(float)x/(float)y/20.f;
-#pragma omp parallel for
-	for (size_t i = 0; i < n; i++)
-	{
-		for (size_t j = 0; j < m; j++)
-		{
-			size_t ind = j + m * i;
-			hA[ind].x = (float)xorshf96()*fact;
-			hA[ind].y = (float)xorshf96()*fact;
-		}
-	}
-#pragma omp parallel for
-	for (size_t i = 0; i < n; i++)
-	{
-		for (size_t j = 0; j < k; j++)
-		{
-			size_t ind = j + k * i;
-			hB[ind].x = (float)xorshf96()*fact;
-			hB[ind].y = (float)xorshf96()*fact;
-		}
-	}
-
-}
-
-void
-print_result
-(
-	rocblas_float_complex * hC
-)
-{
-	printf("-------- %zu %zu\n", m, k);
-	for (size_t i = 0; i < m; i++)
-	{
-		for (size_t j = 0; j < k; j++)
-		{
-			size_t ind = j + k * i;
-			printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
-		}
-		printf("\n");
-	}
-	printf("--------\n");
-
-}
-
-int
-run_test
-(
-	size_t dim,
-	unsigned rep,
-	float * res
-)
-{
-	m = dim;
-	n = dim;
-	k = dim;
-	struct runtime * timer = malloc(sizeof(*timer));
-
-	rocblas_float_complex *A;
-	rocblas_float_complex *B;
-	rocblas_float_complex *C;
-	__ASSERT(hipMalloc((void **)&A, sizeof(*A) * (size_t)(m * n)));
-	if (A == NULL)
-	{
-		fprintf(stderr, "A not allocated\n");
-		exit(1);
-	}
-	__ASSERT(hipMalloc((void **)&B, sizeof(*B) * (size_t)(n * k)));
-	if (B == NULL)
-	{
-		fprintf(stderr, "B not allocated\n");
-		exit(1);
-	}
-	__ASSERT(hipMalloc((void **)&C, sizeof(*C) * (size_t)(m * k)));
-	if (C == NULL)
-	{
-		fprintf(stderr, "C not allocated\n");
-		exit(1);
-	}
-
-	rocblas_float_complex *hA = malloc(sizeof(*hA) * (size_t)(m * n));
-	if (hA == NULL)
-	{
-		fprintf(stderr, "hA not allocated\n");
-		exit(1);
-	}
-	rocblas_float_complex *hB = malloc(sizeof(*hB) * (size_t)(k * n));
-	if (hB == NULL)
-	{
-		fprintf(stderr, "hB not allocated\n");
-		exit(1);
-	}
-
-	rocblas_float_complex *hC = malloc(sizeof(*hC) * (size_t)(m * k));
-	if (hC == NULL)
-	{
-		fprintf(stderr, "hC not allocated\n");
-		exit(1);
-	}
-
-	// timer_start(timer, "Prepare matrices");
-	prepare_matrices(hA, hB);
-	// timer_stop(timer);
-
-	//timer_start(timer, "Memcopy");
-	__ASSERT(hipMemcpy(A, hA, sizeof(*A) * (size_t)(m * n), hipMemcpyHostToDevice));
-	__ASSERT(hipMemcpy(B, hB, sizeof(*B) * (size_t)(k * n), hipMemcpyHostToDevice));
-	// timer_stop(timer);
-
-	rocblas_handle handle;
-	//timer_start(timer, "Create Handle");
-	if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
-	//timer_stop(timer);
-
-	for (unsigned r = 0; r < rep; r++)
-	{
-		float res_r = 0.f;
-		char mes[128];
-		sprintf(mes, "dim %zu run %d a" ,dim,  r);
-		timer_start(timer, mes);
-		multiplication
-		(
-			handle,
-			A,
-			B,
-			C
-		);
-		res_r += timer_stop(timer);
-		sprintf(mes, "dim %zu run %d b" ,dim,  r);
-		/*
-		timer_start(timer, mes);
-		multiplication
-		(
-			handle,
-			B,
-			A,
-			C
-		);
-		res_r += timer_stop(timer);
-		*/
-		res[r] = res_r/1.f;
-	}
-	printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
-	printf("required size: %f GB\n",
-		(
-			m * n * sizeof(*A)
-			+  k * n * sizeof(*B)
-			+  k * m * sizeof(*C)
-		)/1.e+9);
-
-	__ASSERT(hipMemcpy(hC, C, sizeof(*hC) * (size_t)(k * m), hipMemcpyDeviceToHost));
-	//print_result(hC);
-
-	// timer_start(timer, "Destroy Handle");
-	if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
-	// timer_stop(timer);
-
-	hipFree(A);
-	hipFree(B);
-	hipFree(C);
-	free(hA);
-	free(hB);
-	free(hC);
-	free(timer);
-	return 0;
-}
-
-int
-main
-(
-)
-{
-	int rep = 512;
-	int min_dim = 1;
-	int max_dim = 14;
-
-	float *  res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
-	if (res == NULL)
-	{
-		fprintf(stderr, "Couldn't allocate res\n");
-		exit(1);
-	}
-	for (int i = min_dim; i < max_dim; i++)
-	{
-		size_t dim = 1 << i;
-		int ind = (i - min_dim) * rep;
-		run_test(dim, rep, &res[ind]);
-	}
-	// store the results
-	FILE * f;
-	char name[128];
-	sprintf(name, "runtimes");
-	f= fopen(name, "w");
-	if (f == NULL)
-	{
-		fprintf(stderr, "Couldn't open %s\n", name);
-	}
-	for (int i = min_dim; i < max_dim; i++)
-	{
-		size_t dim = 1 << i;
-		fprintf(f, "%zu\t", dim);
-	}
-	fprintf(f, "\n");
-	for (int r = 0; r < rep; r++)
-	{
-		for (int i = min_dim; i < max_dim; i++)
-		{
-			size_t pos = (i - min_dim) * rep + r;
-			fprintf(f, "%1.6f\t", res[pos]);
-		}
-		fprintf(f, "\n");
-	}
-	fclose(f);
-	return 0;
-}