Commit 2a2041d4 authored by Henning Fehrmann's avatar Henning Fehrmann Committed by Henning Fehrmann
Browse files

Add compilation instructions for some tensor core code (not in git yet).

parent 246b9d52
......@@ -3,12 +3,15 @@ GPU=NVIDIA
# Backend selector tested by the ifeq chain below; the values it recognizes are AMD and NVIDIA.
GPU=AMD
# Object file lists, one variable per program. Each is used as the prerequisite list and
# link input of the rule with the matching name (blas, tensor_core, fftw, tensor2).
OBJ_blas = blas.o
OBJ_tensor_core = tensor_core.o
OBJ_fftw = fftw.o
# NOTE(review): OBJ_tensor_core is assigned a second time here with the same value, so this
# assignment is redundant — confirm whether one of the two is a leftover.
OBJ_tensor_core = tensor_core.o
OBJ_tensor2 = tensor2.o
ifeq ($(GPU), AMD)
LDFLAGS = -L/opt/rocm/lib -fopenmp
LDFLAGS_blas = -lhipblas -lrocblas
LDFLAGS_blas = -lrocblas
LDFLAGS_fftw = -lrocfft
CFLAGS = -g -Wall -O3 -fopenmp -I/opt/rocm/include -I/opt/rocm/hip/include -DROC
CC = hipcc
......@@ -18,8 +21,8 @@ else ifeq ($(GPU), NVIDIA)
LDFLAGS_blas = -lcublas
LDFLAGS_fftw = -lcufft
INCLUDE= -I/usr/lib/x86_64-linux-gnu/openmpi/include/
CFLAGS = ${INCLUDE} --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
CUDAFLAGS = --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
CFLAGS = ${INCLUDE} -arch sm_75 --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
CUDAFLAGS = -arch sm_75 --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
else
unknown_HW:
endif
......@@ -29,9 +32,18 @@ all: blas fftw
# Link rules: each program is linked by $(CC) from its object list ($^) into a file named
# after the target ($@). Every link line passes $(LDFLAGS), then the library-specific flags,
# then $(CUDAFLAGS).

# blas links against the BLAS library flags only.
blas: $(OBJ_blas)
	$(CC) -o $@ $^ $(LDFLAGS) $(LDFLAGS_blas) $(CUDAFLAGS)

# tensor_core links against both the FFT and the BLAS library flags.
tensor_core: $(OBJ_tensor_core)
	$(CC) -o $@ $^ $(LDFLAGS) $(LDFLAGS_fftw) $(LDFLAGS_blas) $(CUDAFLAGS)

# tensor2 uses the same library set as tensor_core.
tensor2: $(OBJ_tensor2)
	$(CC) -o $@ $^ $(LDFLAGS) $(LDFLAGS_fftw) $(LDFLAGS_blas) $(CUDAFLAGS)

# fftw links against the FFT library flags only.
fftw: $(OBJ_fftw)
	$(CC) -o $@ $^ $(LDFLAGS) $(LDFLAGS_fftw) $(CUDAFLAGS)
# Pattern rules that turn a source file into the object file of the same stem.
#
# Both recipes name their output explicitly with -o $@. Without it the compiler picks its
# default output name, which only coincides with the target when the stem has no directory
# part; naming $@ makes each recipe produce exactly the file Make expects.
#
# NOTE(review): ${HEADER} is not defined in the visible part of this file; presumably it
# lists headers whose modification should trigger recompilation — confirm.

# CUDA sources: nvcc is invoked directly (not through ${CC}); -dc compiles to an object
# containing relocatable device code.
%.o: %.cu ${HEADER}
	nvcc ${INCLUDE} -dc $< -o $@ ${CUDAFLAGS}

# C sources: compiled by ${CC} with the backend-specific ${CFLAGS}.
%.o: %.c ${HEADER}
	${CC} ${CFLAGS} -c $< -o $@
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment