cudaTransientFstatRectWindow.cu 4.14 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
__global__ void cudaTransientFstatRectWindow ( float *input,
                                               unsigned int numAtoms,
                                               unsigned int TAtom,
                                               unsigned int t0_data,
                                               unsigned int win_t0,
                                               unsigned int win_dt0,
                                               unsigned int win_tau,
                                               unsigned int win_dtau,
                                               unsigned int N_tauRange,
                                               float *Fmn
                                             )
{

  /* match CUDA thread indexing and high-level (t0,tau) indexing */
  // assume 1D block, grid setup
  unsigned int m         = blockDim.x * blockIdx.x + threadIdx.x; // t0:  row

  unsigned short input_cols = 7; // must match input matrix!

  /* compute Fstat-atom index i_t0 in [0, numAtoms) */
  unsigned int TAtomHalf = TAtom/2; // integer division
  unsigned int t0 = win_t0 + m * win_dt0;
David Keitel's avatar
David Keitel committed
23
24
  /* integer round: floor(x+0.5) */
  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
  if ( i_tmp < 0 ) {
    i_tmp = 0;
  }
  unsigned int i_t0 = (unsigned int)i_tmp;
  if ( i_t0 >= numAtoms ) {
    i_t0 = numAtoms - 1;
  }

  float Ad    = 0.0f;
  float Bd    = 0.0f;
  float Cd    = 0.0f;
  float Fa_re = 0.0f;
  float Fa_im = 0.0f;
  float Fb_re = 0.0f;
  float Fb_im = 0.0f;
  unsigned int i_t1_last = i_t0;

  /* INNER loop over timescale-parameter tau
   * NOT parallelized so that we can still use the i_t1_last trick
   * (empirically seems to be faster than 2D CUDA version)
   */
  for ( unsigned int n = 0; n < N_tauRange; n ++ ) {

    if ( (m < N_tauRange) && (n < N_tauRange) ) {

David Keitel's avatar
David Keitel committed
50
51
      /* translate n into an atoms end-index
       * for this search interval [t0, t0+Tcoh],
52
53
54
55
56
57
58
       * giving the index range of atoms to sum over
       */
      unsigned int tau = win_tau + n * win_dtau;

      /* get end-time t1 of this transient-window search */
      unsigned int t1 = t0 + tau;

David Keitel's avatar
David Keitel committed
59
60
61
62
      /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
       * using integer round: floor(x+0.5)
       */
      i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom  - 1;
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
      if ( i_tmp < 0 ) {
        i_tmp = 0;
      }
      unsigned int i_t1 = (unsigned int)i_tmp;
      if ( i_t1 >= numAtoms ) {
        i_t1 = numAtoms - 1;
      }

      /* now we have two valid atoms-indices [i_t0, i_t1]
       * spanning our Fstat-window to sum over
       */

      for ( unsigned int i = i_t1_last; i <= i_t1; i ++ ) {
        /* sum up atoms,
         * special optimiziation in the rectangular-window case:
         * just add on to previous tau values,
David Keitel's avatar
David Keitel committed
79
80
         * ie re-use the sum over [i_t0, i_t1_last]
         from the pevious tau-loop iteration
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
         */
        Ad    += input[i*input_cols+0]; // a2_alpha
        Bd    += input[i*input_cols+1]; // b2_alpha
        Cd    += input[i*input_cols+2]; // ab_alpha
        Fa_re += input[i*input_cols+3]; // Fa_alpha_re
        Fa_im += input[i*input_cols+4]; // Fa_alpha_im
        Fb_re += input[i*input_cols+5]; // Fb_alpha_re
        Fb_im += input[i*input_cols+6]; // Fb_alpha_im
        /* keep track of up to where we summed for the next iteration */
        i_t1_last = i_t1 + 1;
      }

      /* get determinant */
      float Dd = ( Ad * Bd - Cd * Cd );
      float DdInv = 0.0f;
      /* safety catch as in XLALWeightMultiAMCoeffs():
       * make it so that in the end F=0 instead of -nan
       */
      if ( Dd > 0.0 ) {
        DdInv  = 1.0 / Dd;
      }

      /* from XLALComputeFstatFromFaFb */
      float F  = DdInv * (  Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
                          + Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
                          - 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
                         );

      /* store result in Fstat-matrix
       * at unraveled index of element {m,n}
       */
      unsigned int outidx = m * N_tauRange + n;
      Fmn[outidx] = F;

    } // if ( (m < N_tauRange) && (n < N_tauRange) )

  } // for ( unsigned int n = 0; n < N_tauRange; n ++ )

} // cudaTransientFstatRectWindow()