Select Git revision
StarsphereS5R3.cpp
Forked from
einsteinathome / graphicsframework
Source project has a limited visibility.
-
Oliver Bock authored
* Using single font instances per text group (adds flexibility, avoid font change during render) * initialize() and resize() are now virtual, and overridden in subclass * HUD dimensions now follow actual size (no more scaling needed) * No more FSAA needed * Removed unnecessary Open GL calls during render Pro: - Crystal clear HUD text - Constant HUD text size Con: - Slightly higher CPU usage (5% -> 7.5%, also for full screen?)
Oliver Bock authored* Using single font instances per text group (adds flexibility, avoid font change during render) * initialize() and resize() are now virtual, and overridden in subclass * HUD dimensions now follow actual size (no more scaling needed) * No more FSAA needed * Removed unnecessary Open GL calls during render Pro: - Crystal clear HUD text - Constant HUD text size Con: - Slightly higher CPU usage (5% -> 7.5%, also for full screen?)
blas.c 5.13 KiB
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipc or nvcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#include "hardware_settings.h"
#include "profiler.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
size_t m = 10000;
size_t n = 10000;
size_t k = 10000;
void
multiplication
(
__HANDLE__ handle,
const __COMPLEX8__ *A,
const __COMPLEX8__ *B,
__COMPLEX8__ *C
)
{
__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
int lda = n;
int ldb = n;
int ldc = k;
__CGMEM__
(
handle,
transA,
transB,
m,
n,
k,
&alpha,
A,
lda,
B,
ldb,
&beta,
C,
ldc
);
}
void
prepare_matrices
(
__COMPLEX8__ * hA,
__COMPLEX8__ * hB
)
{
float fact = 1.f/(float)n/(float)x/(float)y/20.f;
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
{
for (size_t j = 0; j < m; j++)
{
size_t ind = j + m * i;
hA[ind].x = (float)xorshf96()*fact;
hA[ind].y = (float)xorshf96()*fact;
}
}
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
{
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
hB[ind].x = (float)xorshf96()*fact;
hB[ind].y = (float)xorshf96()*fact;
}
}
}
void
print_result
(
__COMPLEX8__ * hC
)
{
printf("-------- %zu %zu\n", m, k);
for (size_t i = 0; i < m; i++)
{
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
}
printf("\n");
}
printf("--------\n");
}
int
run_test
(
size_t dim,
unsigned rep,
float * res
)
{
m = dim;
n = dim;
k = dim;
struct runtime * timer;
__MALLOC(timer, sizeof(*timer));
__COMPLEX8__ *A;
__COMPLEX8__ *B;
__COMPLEX8__ *C;
__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n)));
__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k)));
if (C == NULL)
{
fprintf(stderr, "C not allocated\n");
exit(1);
}
__COMPLEX8__ *hA;
__MALLOC( hA, sizeof(*hA) * (size_t)(m * n));
__COMPLEX8__ *hB;
__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
__COMPLEX8__ *hC;
__MALLOC( hC, sizeof(*hC) * (size_t)(m * k));
// timer_start(timer, "Prepare matrices");
prepare_matrices(hA, hB);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
// timer_stop(timer);
// cudaSetDevice(0);
__HANDLE__ handle;
//timer_start(timer, "Create Handle");
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE(&handle);
//timer_stop(timer);
for (unsigned r = 0; r < rep; r++)
{
float res_r = 0.f;
char mes[128];
sprintf(mes, "dim %zu run %d a" ,dim, r);
timer_start(timer, mes);
multiplication
(
handle,
A,
B,
C
);
res_r += timer_stop(timer);
sprintf(mes, "dim %zu run %d b" ,dim, r);
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
);
res_r += timer_stop(timer);
*/
res[r] = res_r/1.f;
}
printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
printf("required size: %f GB\n",
(
m * n * sizeof(*A)
+ k * n * sizeof(*B)
+ k * m * sizeof(*C)
)/1.e+9);
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC);
// timer_start(timer, "Destroy Handle");
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
// timer_stop(timer);
__PREFIX(Free)(A);
__PREFIX(Free)(B);
__PREFIX(Free)(C);
free(hA);
free(hB);
free(hC);
free(timer);
return 0;
}
int
main
(
)
{
int rep = 512;
int min_dim = 1;
int max_dim = 14;
float * res;
__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
for (int i = min_dim; i < max_dim; i++)
{
size_t dim = 1 << i;
int ind = (i - min_dim) * rep;
run_test(dim, rep, &res[ind]);
}
// store the results
FILE * f;
char name[128];
sprintf(name, "runtimes");
f= fopen(name, "w");
if (f == NULL)
{
fprintf(stderr, "Couldn't open %s\n", name);
}
for (int i = min_dim; i < max_dim; i++)
{
size_t dim = 1 << i;
fprintf(f, "%zu\t", dim);
}
fprintf(f, "\n");
for (int r = 0; r < rep; r++)
{
for (int i = min_dim; i < max_dim; i++)
{
size_t pos = (i - min_dim) * rep + r;
fprintf(f, "%1.6f\t", res[pos]);
}
fprintf(f, "\n");
}
fclose(f);
return 0;
}