cudaTransientFstatExpWindow.cu

__global__ void cudaTransientFstatExpWindow ( float *input,
                                              unsigned int numAtoms,
                                              unsigned int TAtom,
                                              unsigned int t0_data,
                                              unsigned int win_t0,
                                              unsigned int win_dt0,
                                              unsigned int win_tau,
                                              unsigned int win_dtau,
                                              unsigned int Fmn_rows,
                                              unsigned int Fmn_cols,
                                              float *Fmn
                                            )
{

  /* match CUDA thread indexing and high-level (t0,tau) indexing */
  unsigned int m         = blockDim.x * blockIdx.x + threadIdx.x; // t0:  row
  unsigned int n         = blockDim.y * blockIdx.y + threadIdx.y; // tau: column
  /* unraveled 1D index for 2D output array */
  unsigned int outidx    = Fmn_cols * m + n;

  /* hardcoded copy from lalpulsar */
  unsigned int TRANSIENT_EXP_EFOLDING = 3;

  if ( (m < Fmn_rows) && (n < Fmn_cols) ) {

    /* compute Fstat-atom index i_t0 in [0, numAtoms) */
    unsigned int TAtomHalf = TAtom/2; // integer division
    unsigned int t0 = win_t0 + m * win_dt0;
    int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
    if ( i_tmp < 0 ) {
        i_tmp = 0;
    }
    unsigned int i_t0 = (unsigned int)i_tmp;
    if ( i_t0 >= numAtoms ) {
        i_t0 = numAtoms - 1;
    }

    /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
     * giving the index range of atoms to sum over
     */
    unsigned int tau = win_tau + n * win_dtau;

    /* get end-time t1 of this transient-window search
     * for given tau, what Tcoh should the exponential window cover?
     * for speed reasons we want to truncate Tcoh = tau * TRANSIENT_EXP_EFOLDING
     * with the e-folding factor chosen such that the window-value
     * is practically negligible after that, where it will be set to 0
     */
//     unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau);
    unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau;

    /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
    i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom  - 1; // integer round: floor(x+0.5)
    if ( i_tmp < 0 ) {
        i_tmp = 0;
    }
    unsigned int i_t1 = (unsigned int)i_tmp;
    if ( i_t1 >= numAtoms ) {
        i_t1 = numAtoms - 1;
    }

    /* now we have two valid atoms-indices [i_t0, i_t1]
     * spanning our Fstat-window to sum over
     */

    float Ad    = 0.0f;
    float Bd    = 0.0f;
    float Cd    = 0.0f;
    float Fa_re = 0.0f;
    float Fa_im = 0.0f;
    float Fb_re = 0.0f;
    float Fb_im = 0.0f;

    unsigned short input_cols = 7; // must match input matrix!

    /* sum up atoms */
    for ( unsigned int i=i_t0; i<=i_t1; i++ ) {

      unsigned int t_i = t0_data + i * TAtom;

      float win_i = 0.0;
      if ( t_i >= t0 && t_i <= t1 ) {
        float x = 1.0 * ( t_i - t0 ) / tau;
        win_i = exp ( -x );
      }

      float win2_i = win_i * win_i;

      Ad    += input[i*input_cols+0] * win2_i; // a2_alpha
      Bd    += input[i*input_cols+1] * win2_i; // b2_alpha
      Cd    += input[i*input_cols+2] * win2_i; // ab_alpha
      Fa_re += input[i*input_cols+3] * win_i; // Fa_alpha_re
      Fa_im += input[i*input_cols+4] * win_i; // Fa_alpha_im
      Fb_re += input[i*input_cols+5] * win_i; // Fb_alpha_re
      Fb_im += input[i*input_cols+6] * win_i; // Fb_alpha_im

    }

    /* get determinant */
    float Dd = ( Ad * Bd - Cd * Cd );
    float DdInv = 0.0f;
    /* safety catch as in XLALWeightMultiAMCoeffs():
     * make it so that in the end F=0 instead of -nan
     */
    if ( Dd > 0.0 ) {
      DdInv  = 1.0 / Dd;
    }

    /* from XLALComputeFstatFromFaFb */
    float F  = DdInv * (  Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
                        + Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
                        - 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
                       );

    /* store result in Fstat-matrix
     * at unraveled index of element {m,n}
     */
    Fmn[outidx] = F;

  } // ( (m < Fmn_rows) && (n < Fmn_cols) )

} // cudaTransientFstatExpWindow()