Skip to content
Snippets Groups Projects
Select Git revision
  • e43923a096d836f67d0a3d69172216acbe290220
  • master default protected
2 results

test_beam_det.py

Blame
  • blas.c 5.13 KiB
    /*
     * =====================================================================================
     *
     *    Description:  BLAS Benchmark
     *
     *        Version:  1.0
     *        Created:  27.01.2021 12:45:18
     *       Revision:  none
     *       Compiler:  hipc or nvcc
     *
     *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
     *   Organization:  AEI Hannover
     *        License:  GNU General Public License v2
     *
     * =====================================================================================
     */
    
    #include "hardware_settings.h"
    #include "profiler.h"
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <omp.h>
    
    #define __MALLOC(P, size) P =  malloc(size); \
    	if (P == NULL) \
    	{\
    		fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
    		exit(EXIT_FAILURE); \
    	}\
    
    size_t m = 10000;
    size_t n = 10000;
    size_t k = 10000;
    
    void
    multiplication
    (
    	__HANDLE__ handle,
    	const __COMPLEX8__ *A,
    	const __COMPLEX8__ *B,
    	__COMPLEX8__ *C
    )
    {
    	__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
    	__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
    	const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
    	const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
    
    	int lda = n;
    	int ldb = n;
    	int ldc = k;
    
    	__CGMEM__
    	(
    		handle,
    		transA,
    		transB,
    		m,
    		n,
    		k,
    		&alpha,
    		A,
    		lda,
    		B,
    		ldb,
    		&beta,
    		C,
    		ldc
    	);
    }
    
    void
    prepare_matrices
    (
    	__COMPLEX8__ * hA,
    	__COMPLEX8__ * hB
    )
    {
    	float fact = 1.f/(float)n/(float)x/(float)y/20.f;
    #pragma omp parallel for
    	for (size_t i = 0; i < n; i++)
    	{
    		for (size_t j = 0; j < m; j++)
    		{
    			size_t ind = j + m * i;
    			hA[ind].x = (float)xorshf96()*fact;
    			hA[ind].y = (float)xorshf96()*fact;
    		}
    	}
    #pragma omp parallel for
    	for (size_t i = 0; i < n; i++)
    	{
    		for (size_t j = 0; j < k; j++)
    		{
    			size_t ind = j + k * i;
    			hB[ind].x = (float)xorshf96()*fact;
    			hB[ind].y = (float)xorshf96()*fact;
    		}
    	}
    
    }
    
    void
    print_result
    (
    	__COMPLEX8__ * hC
    )
    {
    	printf("-------- %zu %zu\n", m, k);
    	for (size_t i = 0; i < m; i++)
    	{
    		for (size_t j = 0; j < k; j++)
    		{
    			size_t ind = j + k * i;
    			printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
    		}
    		printf("\n");
    	}
    	printf("--------\n");
    
    }
    
    int
    run_test
    (
    	size_t dim,
    	unsigned rep,
    	float * res
    )
    {
    	m = dim;
    	n = dim;
    	k = dim;
    	struct runtime * timer;
    	__MALLOC(timer, sizeof(*timer));
    
    	__COMPLEX8__ *A;
    	__COMPLEX8__ *B;
    	__COMPLEX8__ *C;
    	__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n)));
    	__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
    	__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k)));
    	if (C == NULL)
    	{
    		fprintf(stderr, "C not allocated\n");
    		exit(1);
    	}
    
    	__COMPLEX8__ *hA;
    	__MALLOC( hA, sizeof(*hA) * (size_t)(m * n));
    	__COMPLEX8__ *hB;
    	__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
    	__COMPLEX8__ *hC;
    	__MALLOC( hC, sizeof(*hC) * (size_t)(m * k));
    
    	// timer_start(timer, "Prepare matrices");
    	prepare_matrices(hA, hB);
    	// timer_stop(timer);
    
    	//timer_start(timer, "Memcopy");
    	__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
    	__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
    	// timer_stop(timer);
    
    	// cudaSetDevice(0);
    	__HANDLE__ handle;
    	//timer_start(timer, "Create Handle");
    	//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
    	__CREATE_HANDLE(&handle);
    
    	//timer_stop(timer);
    
    	for (unsigned r = 0; r < rep; r++)
    	{
    		float res_r = 0.f;
    		char mes[128];
    		sprintf(mes, "dim %zu run %d a" ,dim,  r);
    		timer_start(timer, mes);
    		multiplication
    		(
    			handle,
    			A,
    			B,
    			C
    		);
    		res_r += timer_stop(timer);
    		sprintf(mes, "dim %zu run %d b" ,dim,  r);
    		/*
    		timer_start(timer, mes);
    		multiplication
    		(
    			handle,
    			B,
    			A,
    			C
    		);
    		res_r += timer_stop(timer);
    		*/
    		res[r] = res_r/1.f;
    	}
    	printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
    	printf("required size: %f GB\n",
    		(
    			m * n * sizeof(*A)
    			+  k * n * sizeof(*B)
    			+  k * m * sizeof(*C)
    		)/1.e+9);
    
    	__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
    	//print_result(hC);
    
    	// timer_start(timer, "Destroy Handle");
    	//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
    	if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
    	// timer_stop(timer);
    
    	__PREFIX(Free)(A);
    	__PREFIX(Free)(B);
    	__PREFIX(Free)(C);
    	free(hA);
    	free(hB);
    	free(hC);
    	free(timer);
    	return 0;
    }
    
    int
    main
    (
    )
    {
    	int rep = 512;
    	int min_dim = 1;
    	int max_dim = 14;
    
    	float *  res;
    	__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
    	for (int i = min_dim; i < max_dim; i++)
    	{
    		size_t dim = 1 << i;
    		int ind = (i - min_dim) * rep;
    		run_test(dim, rep, &res[ind]);
    	}
    	// store the results
    	FILE * f;
    	char name[128];
    	sprintf(name, "runtimes");
    	f= fopen(name, "w");
    	if (f == NULL)
    	{
    		fprintf(stderr, "Couldn't open %s\n", name);
    	}
    	for (int i = min_dim; i < max_dim; i++)
    	{
    		size_t dim = 1 << i;
    		fprintf(f, "%zu\t", dim);
    	}
    	fprintf(f, "\n");
    	for (int r = 0; r < rep; r++)
    	{
    		for (int i = min_dim; i < max_dim; i++)
    		{
    			size_t pos = (i - min_dim) * rep + r;
    			fprintf(f, "%1.6f\t", res[pos]);
    		}
    		fprintf(f, "\n");
    	}
    	fclose(f);
    	return 0;
    }