cudaTransientFstatRectWindow.cu 4.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
__global__ void cudaTransientFstatRectWindow ( float *input,
                                               unsigned int numAtoms,
                                               unsigned int TAtom,
                                               unsigned int t0_data,
                                               unsigned int win_t0,
                                               unsigned int win_dt0,
                                               unsigned int win_tau,
                                               unsigned int win_dtau,
                                               unsigned int N_tauRange,
                                               float *Fmn
                                             )
{

  /* match CUDA thread indexing and high-level (t0,tau) indexing */
  // assume 1D block, grid setup
  unsigned int m         = blockDim.x * blockIdx.x + threadIdx.x; // t0:  row

  unsigned short input_cols = 7; // must match input matrix!

  /* compute Fstat-atom index i_t0 in [0, numAtoms) */
  unsigned int TAtomHalf = TAtom/2; // integer division
  unsigned int t0 = win_t0 + m * win_dt0;
  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
  if ( i_tmp < 0 ) {
    i_tmp = 0;
  }
  unsigned int i_t0 = (unsigned int)i_tmp;
  if ( i_t0 >= numAtoms ) {
    i_t0 = numAtoms - 1;
  }

  float Ad    = 0.0f;
  float Bd    = 0.0f;
  float Cd    = 0.0f;
  float Fa_re = 0.0f;
  float Fa_im = 0.0f;
  float Fb_re = 0.0f;
  float Fb_im = 0.0f;
  unsigned int i_t1_last = i_t0;

  /* INNER loop over timescale-parameter tau
   * NOT parallelized so that we can still use the i_t1_last trick
   * (empirically seems to be faster than 2D CUDA version)
   */
  for ( unsigned int n = 0; n < N_tauRange; n ++ ) {

    if ( (m < N_tauRange) && (n < N_tauRange) ) {

      /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
       * giving the index range of atoms to sum over
       */
      unsigned int tau = win_tau + n * win_dtau;

      /* get end-time t1 of this transient-window search */
      unsigned int t1 = t0 + tau;

      /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
      i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom  - 1; // integer round: floor(x+0.5)
      if ( i_tmp < 0 ) {
        i_tmp = 0;
      }
      unsigned int i_t1 = (unsigned int)i_tmp;
      if ( i_t1 >= numAtoms ) {
        i_t1 = numAtoms - 1;
      }

      /* now we have two valid atoms-indices [i_t0, i_t1]
       * spanning our Fstat-window to sum over
       */

      for ( unsigned int i = i_t1_last; i <= i_t1; i ++ ) {
        /* sum up atoms,
         * special optimiziation in the rectangular-window case:
         * just add on to previous tau values,
         * ie re-use the sum over [i_t0, i_t1_last] from the pevious tau-loop iteration
         */
        Ad    += input[i*input_cols+0]; // a2_alpha
        Bd    += input[i*input_cols+1]; // b2_alpha
        Cd    += input[i*input_cols+2]; // ab_alpha
        Fa_re += input[i*input_cols+3]; // Fa_alpha_re
        Fa_im += input[i*input_cols+4]; // Fa_alpha_im
        Fb_re += input[i*input_cols+5]; // Fb_alpha_re
        Fb_im += input[i*input_cols+6]; // Fb_alpha_im
        /* keep track of up to where we summed for the next iteration */
        i_t1_last = i_t1 + 1;
      }

      /* get determinant */
      float Dd = ( Ad * Bd - Cd * Cd );
      float DdInv = 0.0f;
      /* safety catch as in XLALWeightMultiAMCoeffs():
       * make it so that in the end F=0 instead of -nan
       */
      if ( Dd > 0.0 ) {
        DdInv  = 1.0 / Dd;
      }

      /* from XLALComputeFstatFromFaFb */
      float F  = DdInv * (  Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
                          + Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
                          - 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
                         );

      /* store result in Fstat-matrix
       * at unraveled index of element {m,n}
       */
      unsigned int outidx = m * N_tauRange + n;
      Fmn[outidx] = F;

    } // if ( (m < N_tauRange) && (n < N_tauRange) )

  } // for ( unsigned int n = 0; n < N_tauRange; n ++ )

} // cudaTransientFstatRectWindow()