blas.c

/*
 * =====================================================================================
 *
 *    Description:  BLAS Benchmark
 *
 *        Version:  1.0
 *        Created:  27.01.2021 12:45:18
 *       Revision:  none
 *       Compiler:  hipc or nvcc
 *
 *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
 *   Organization:  AEI Hannover
 *        License:  GNU General Public License v2
 *
 * =====================================================================================
 */

#include "hardware_settings.h"
#include "profiler.h"

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

#define __MALLOC(P, size) P =  malloc(size); \
	if (P == NULL) \
	{\
		fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
		exit(EXIT_FAILURE); \
	}\

size_t m = 10000;
size_t n = 10000;
size_t k = 10000;

void
multiplication
(
	__HANDLE__ handle,
	const __COMPLEX8__ *A,
	const __COMPLEX8__ *B,
	__COMPLEX8__ *C
)
{
	__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
	__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
	const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
	const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};

	int lda = n;
	int ldb = n;
	int ldc = k;

	__CGMEM__
	(
		handle,
		transA,
		transB,
		m,
		n,
		k,
		&alpha,
		A,
		lda,
		B,
		ldb,
		&beta,
		C,
		ldc
	);
}

void
prepare_matrices
(
	__COMPLEX8__ * hA,
	__COMPLEX8__ * hB
)
{
	float fact = 1.f/(float)n/(float)x/(float)y/20.f;
#pragma omp parallel for
	for (size_t i = 0; i < n; i++)
	{
		for (size_t j = 0; j < m; j++)
		{
			size_t ind = j + m * i;
			hA[ind].x = (float)xorshf96()*fact;
			hA[ind].y = (float)xorshf96()*fact;
		}
	}
#pragma omp parallel for
	for (size_t i = 0; i < n; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			hB[ind].x = (float)xorshf96()*fact;
			hB[ind].y = (float)xorshf96()*fact;
		}
	}

}

void
print_result
(
	__COMPLEX8__ * hC
)
{
	printf("-------- %zu %zu\n", m, k);
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
		}
		printf("\n");
	}
	printf("--------\n");

}

int
run_test
(
	size_t dim,
	unsigned rep,
	float * res
)
{
	m = dim;
	n = dim;
	k = dim;
	struct runtime * timer;
	__MALLOC(timer, sizeof(*timer));

	__COMPLEX8__ *A;
	__COMPLEX8__ *B;
	__COMPLEX8__ *C;
	__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n)));
	__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
	__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k)));
	if (C == NULL)
	{
		fprintf(stderr, "C not allocated\n");
		exit(1);
	}

	__COMPLEX8__ *hA;
	__MALLOC( hA, sizeof(*hA) * (size_t)(m * n));
	__COMPLEX8__ *hB;
	__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
	__COMPLEX8__ *hC;
	__MALLOC( hC, sizeof(*hC) * (size_t)(m * k));

	// timer_start(timer, "Prepare matrices");
	prepare_matrices(hA, hB);
	// timer_stop(timer);

	//timer_start(timer, "Memcopy");
	__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
	__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
	// timer_stop(timer);

	// cudaSetDevice(0);
	__HANDLE__ handle;
	//timer_start(timer, "Create Handle");
	//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
	__CREATE_HANDLE(&handle);

	//timer_stop(timer);

	for (unsigned r = 0; r < rep; r++)
	{
		float res_r = 0.f;
		char mes[128];
		sprintf(mes, "dim %zu run %d a" ,dim,  r);
		timer_start(timer, mes);
		multiplication
		(
			handle,
			A,
			B,
			C
		);
		res_r += timer_stop(timer);
		sprintf(mes, "dim %zu run %d b" ,dim,  r);
		/*
		timer_start(timer, mes);
		multiplication
		(
			handle,
			B,
			A,
			C
		);
		res_r += timer_stop(timer);
		*/
		res[r] = res_r/1.f;
	}
	printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
	printf("required size: %f GB\n",
		(
			m * n * sizeof(*A)
			+  k * n * sizeof(*B)
			+  k * m * sizeof(*C)
		)/1.e+9);
	__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
	//print_result(hC);

	// timer_start(timer, "Destroy Handle");
	//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
	if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
	// timer_stop(timer);

	__PREFIX(Free)(A);
	__PREFIX(Free)(B);
	__PREFIX(Free)(C);
	free(hA);
	free(hB);
	free(hC);
	free(timer);
	return 0;
}

int
main
(
)
{
	int rep = 512;
	int min_dim = 1;
	int max_dim = 14;

	float *  res;
	__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
	for (int i = min_dim; i < max_dim; i++)
	{
		size_t dim = 1 << i;
		int ind = (i - min_dim) * rep;
		run_test(dim, rep, &res[ind]);
	}
	// store the results
	FILE * f;
	char name[128];
	sprintf(name, "runtimes");
	f= fopen(name, "w");
	if (f == NULL)
	{
		fprintf(stderr, "Couldn't open %s\n", name);
	}
	for (int i = min_dim; i < max_dim; i++)
	{
		size_t dim = 1 << i;
		fprintf(f, "%zu\t", dim);
	}
	fprintf(f, "\n");
	for (int r = 0; r < rep; r++)
	{
		for (int i = min_dim; i < max_dim; i++)
		{
			size_t pos = (i - min_dim) * rep + r;
			fprintf(f, "%1.6f\t", res[pos]);
		}
		fprintf(f, "\n");
	}
	fclose(f);
	return 0;
}