Some compilation instructions for tensor core code; not in git yet.

2a2041d4 · Henning Fehrmann · Henning Fehrmann · 246b9d52 · 2a2041d4
Commit 2a2041d4 authored Mar 8, 2021 by Henning Fehrmann Committed by Henning Fehrmann Mar 8, 2021
--- a/Makefile
+++ b/Makefile
@@ -3,12 +3,15 @@ GPU=NVIDIA
 GPU=AMD

 OBJ_blas = blas.o
+OBJ_tensor_core = tensor_core.o
 OBJ_fftw = fftw.o
+OBJ_tensor_core = tensor_core.o
+OBJ_tensor2 = tensor2.o


 ifeq ($(GPU), AMD)
  LDFLAGS = -L/opt/rocm/lib -fopenmp
-  LDFLAGS_blas = -lhipblas -lrocblas
+  LDFLAGS_blas = -lrocblas
  LDFLAGS_fftw = -lrocfft
  CFLAGS = -g -Wall -O3 -fopenmp -I/opt/rocm/include -I/opt/rocm/hip/include -DROC
  CC = hipcc
@@ -18,8 +21,8 @@ else ifeq ($(GPU), NVIDIA)
  LDFLAGS_blas =  -lcublas
  LDFLAGS_fftw =  -lcufft
  INCLUDE= -I/usr/lib/x86_64-linux-gnu/openmpi/include/
-  CFLAGS = ${INCLUDE} --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
-  CUDAFLAGS = --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
+  CFLAGS = ${INCLUDE}  -arch sm_75 --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
+  CUDAFLAGS = -arch sm_75  --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
 else
  unknown_HW:
 endif
@@ -29,9 +32,18 @@ all: blas fftw
 blas: ${OBJ_blas}
 	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}

+tensor_core: ${OBJ_tensor_core}
+	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_fftw} ${LDFLAGS_blas} ${CUDAFLAGS}
+
+tensor2: ${OBJ_tensor2}
+	${CC} -o tensor2 ${OBJ_tensor2} ${LDFLAGS} ${LDFLAGS_fftw} ${LDFLAGS_blas} ${CUDAFLAGS}
+
 fftw: ${OBJ_fftw}
 	${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}

+%.o: %.cu ${HEADER}
+	nvcc ${INCLUDE} -dc $< ${CUDAFLAGS}
+
 %.o: %.c ${HEADER}
 	${CC} ${CFLAGS} -c $<