fft_kernelstring.cpp 49.5 KB
Newer Older
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
		        (*kInfo)->lmem_size = (radix + 1)*batchSize;
		    else
			    (*kInfo)->lmem_size = threadsPerBlock*R1;
		}
		(*kInfo)->num_workgroups = numBlocks;
        (*kInfo)->num_xforms_per_workgroup = 1;
		(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
		(*kInfo)->dir = dir;
		if( (passNum == (numPasses - 1)) && (numPasses & 1) )
		    (*kInfo)->in_place_possible = 1;
		else
			(*kInfo)->in_place_possible = 0;
		(*kInfo)->next = NULL;
		(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
		strcpy((*kInfo)->kernel_name, kernelName.c_str());
Oliver Bock's avatar
Oliver Bock committed
1016

1017
		insertVariables(localString, R1);
Oliver Bock's avatar
Oliver Bock committed
1018
1019

		if(vertical)
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
		{
			localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
			localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
			localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n");
			localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n");
			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
			int stride = radix*Rinit;
			for(i = 0; i < passNum; i++)
				stride *= radixArr[i];
			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n");
			localString += string("bNum = groupId;\n");
		}
Oliver Bock's avatar
Oliver Bock committed
1033
		else
1034
1035
1036
1037
1038
1039
1040
		{
			int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
			localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
			localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
			localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n");
			localString += string("tid = indexIn;\n");
			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
1041
			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
1042
1043
1044
			int stride = radix*Rinit;
			for(i = 0; i < passNum; i++)
				stride *= radixArr[i];
Oliver Bock's avatar
Oliver Bock committed
1045
			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
1046
			localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
Oliver Bock's avatar
Oliver Bock committed
1047
			localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
1048
		}
Oliver Bock's avatar
Oliver Bock committed
1049

1050
1051
1052
1053
		// Load Data
		int lgBatchSize = log2(batchSize);
		localString += string("tid = lId;\n");
		localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
1054
		localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n");
1055
1056
		localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n");

Oliver Bock's avatar
Oliver Bock committed
1057
		if(dataFormat == clFFT_SplitComplexFormat)
1058
1059
		{
			localString += string("in_real += indexIn;\n");
Oliver Bock's avatar
Oliver Bock committed
1060
			localString += string("in_imag += indexIn;\n");
1061
1062
			for(j = 0; j < R1; j++)
				localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n");
Oliver Bock's avatar
Oliver Bock committed
1063
			for(j = 0; j < R1; j++)
1064
1065
				localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n");
		}
Oliver Bock's avatar
Oliver Bock committed
1066
		else
1067
1068
1069
1070
1071
		{
			localString += string("in += indexIn;\n");
			for(j = 0; j < R1; j++)
				localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
	    }
Oliver Bock's avatar
Oliver Bock committed
1072
1073
1074

		localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");

1075
1076
1077
		if(R2 > 1)
		{
		    // twiddle
Oliver Bock's avatar
Oliver Bock committed
1078
		    for(k = 1; k < R1; k++)
1079
1080
1081
		    {
			    localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
			    localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
Oliver Bock's avatar
Oliver Bock committed
1082
			    localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n");
1083
		    }
Oliver Bock's avatar
Oliver Bock committed
1084

1085
		    // shuffle
Oliver Bock's avatar
Oliver Bock committed
1086
		    numIter = R1 / R2;
1087
1088
1089
		    localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n");
		    localString += string("lMemStore = sMem + tid;\n");
		    localString += string("lMemLoad = sMem + indexIn;\n");
Oliver Bock's avatar
Oliver Bock committed
1090
		    for(k = 0; k < R1; k++)
1091
			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
Oliver Bock's avatar
Oliver Bock committed
1092
		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
1093
1094
1095
1096
		    for(k = 0; k < numIter; k++)
			    for(t = 0; t < R2; t++)
				    localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
Oliver Bock's avatar
Oliver Bock committed
1097
		    for(k = 0; k < R1; k++)
1098
			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
Oliver Bock's avatar
Oliver Bock committed
1099
		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
1100
1101
1102
1103
		    for(k = 0; k < numIter; k++)
			    for(t = 0; t < R2; t++)
				    localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
Oliver Bock's avatar
Oliver Bock committed
1104

1105
1106
1107
		    for(j = 0; j < numIter; j++)
			    localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n");
		}
Oliver Bock's avatar
Oliver Bock committed
1108

1109
		// twiddle
Oliver Bock's avatar
Oliver Bock committed
1110
		if(passNum < (numPasses - 1))
1111
1112
		{
			localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
1113
			localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n");
1114
			localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n");
Oliver Bock's avatar
Oliver Bock committed
1115
			for(t = 0; t < R1; t++)
1116
1117
1118
1119
1120
1121
			{
				localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
				localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
				localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n");
			}
		}
Oliver Bock's avatar
Oliver Bock committed
1122

1123
		// Store Data
Oliver Bock's avatar
Oliver Bock committed
1124
		if(strideO == 1)
1125
		{
Oliver Bock's avatar
Oliver Bock committed
1126

1127
1128
			localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
			localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
Oliver Bock's avatar
Oliver Bock committed
1129

1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
			for(int i = 0; i < R1/R2; i++)
				for(int j = 0; j < R2; j++)
					localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n");
			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
			if(threadsPerBlock >= radix)
            {
                for(int i = 0; i < R1; i++)
                localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
            }
            else
            {
                int innerIter = radix/threadsPerBlock;
                int outerIter = R1/innerIter;
                for(int i = 0; i < outerIter; i++)
                    for(int j = 0; j < innerIter; j++)
                        localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
            }
			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
Oliver Bock's avatar
Oliver Bock committed
1148

1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
			for(int i = 0; i < R1/R2; i++)
				for(int j = 0; j < R2; j++)
					localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n");
			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
			if(threadsPerBlock >= radix)
            {
                for(int i = 0; i < R1; i++)
                    localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
            }
            else
            {
                int innerIter = radix/threadsPerBlock;
                int outerIter = R1/innerIter;
                for(int i = 0; i < outerIter; i++)
                    for(int j = 0; j < innerIter; j++)
                        localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
            }
			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
Oliver Bock's avatar
Oliver Bock committed
1167

1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
			localString += string("indexOut += tid;\n");
			if(dataFormat == clFFT_SplitComplexFormat) {
				localString += string("out_real += indexOut;\n");
				localString += string("out_imag += indexOut;\n");
				for(k = 0; k < R1; k++)
					localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
				for(k = 0; k < R1; k++)
					localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
			}
			else {
				localString += string("out += indexOut;\n");
				for(k = 0; k < R1; k++)
Oliver Bock's avatar
Oliver Bock committed
1180
					localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
1181
			}
Oliver Bock's avatar
Oliver Bock committed
1182

1183
		}
Oliver Bock's avatar
Oliver Bock committed
1184
		else
1185
1186
1187
1188
		{
			localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n");
			if(dataFormat == clFFT_SplitComplexFormat) {
				localString += string("out_real += indexOut;\n");
Oliver Bock's avatar
Oliver Bock committed
1189
				localString += string("out_imag += indexOut;\n");
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
				for(k = 0; k < R1; k++)
					localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n");
				for(k = 0; k < R1; k++)
					localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n");
			}
			else {
				localString += string("out += indexOut;\n");
				for(k = 0; k < R1; k++)
					localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n");
			}
		}
Oliver Bock's avatar
Oliver Bock committed
1201

1202
1203
1204
1205
1206
		insertHeader(*kernelString, kernelName, dataFormat);
		*kernelString += string("{\n");
		if((*kInfo)->lmem_size)
			*kernelString += string("    __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
		*kernelString += localString;
Oliver Bock's avatar
Oliver Bock committed
1207
1208
		*kernelString += string("}\n");

1209
1210
1211
1212
1213
1214
1215
		N /= radix;
		kInfo = &(*kInfo)->next;
		kCount++;
	}
}

/*
 * Dispatch kernel-source generation for the 1-D FFT along one axis of a plan.
 *
 * plan  Plan whose dimensions (n.x, n.y, n.z) and limits
 *       (max_localmem_fft_size, max_work_item_per_workgroup, max_radix)
 *       decide which generator is invoked.
 * dir   Axis to generate for. Values other than cl_fft_kernel_x,
 *       cl_fft_kernel_y and cl_fft_kernel_z are ignored.
 *
 * No generator is called for an axis whose length is not greater than 1.
 */
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
{
	unsigned int radixArray[10];
	unsigned int numRadix;

	switch(dir)
	{
		case cl_fft_kernel_x:
			if(plan->n.x > plan->max_localmem_fft_size)
			{
				// Too long for the local-memory generator: use the global one.
				createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
			}
			else if(plan->n.x > 1)
			{
				// First try the radix decomposition requested with a max-radix
				// argument of 0.
				getRadixArray(plan->n.x, radixArray, &numRadix, 0);

				// If n.x / first-radix exceeds the work-group limit, retry with
				// the plan's own max_radix before giving up on the local path.
				if(plan->n.x / radixArray[0] > plan->max_work_item_per_workgroup)
				{
					getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
				}

				if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
				{
					createLocalMemfftKernelString(plan);
				}
				else
				{
					createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
				}
			}
			break;

		case cl_fft_kernel_y:
			if(plan->n.y > 1)
			{
				createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
			}
			break;

		case cl_fft_kernel_z:
			if(plan->n.z > 1)
			{
				createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1);
			}
			// Explicit break: the original fell through into `default`, which
			// only returned, so behaviour is unchanged.
			break;

		default:
			break;
	}
}
For faster browsing, not all history is shown. View entire blame