Generated kernel does not compile when the same function is called with a different input size.
See original GitHub issue
Describe the bug
This one is a continuation of https://github.com/beehive-lab/TornadoVM/issues/153. We tried to port the code to Kernel semantics, but the results are puzzling: sometimes the kernel is not compiled at all between calls. This is clearly related to the different way arguments are passed when TornadoVM detects that the input is small (i.e. the length of the input array is ~10 vs. 1000).
Below, we can see the kernel for a larger array (which sometimes works, depending on the input, as described in a subsequent issue). After that, TornadoVM generates code that seems to unfold the arrays passed and generates hundreds of input params.
The second time the function is called, the generated code is broken and fails to compile.
2732 INFO [main] gaia.cu7.algo.character.periodsearch.methods.test.MethodLeastSquareComputeComparison [] - Sequential LSQ: 0.682237031
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
__kernel void lookupBufferAddress(__global uchar *_heap_base, uint _frame_base, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics)
{
__global ulong *_frame = (__global ulong *) &_heap_base[_frame_base];
// BLOCK 0
_frame[0] = (ulong) _heap_base;
} // kernel
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void lookupBufferAddress(__global uchar *_heap_base, uint _frame_base, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics)
{
__global ulong *_frame = (__global ulong *) &_heap_base[_frame_base];
// BLOCK 0
_frame[0] = (ulong) _heap_base;
} // kernel
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void computeKernel(__global uchar *_heap_base, uint _frame_base, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics)
{
uint ui_3, ui_2, ui_15, ui_1, ui_17, ui_0, ui_96, ui_11, ui_27, ui_13, ui_29, ui_22;
int i_16, i_18, i_12, i_14, i_24, i_23, i_55, i_26, i_25, i_20, i_19, i_21, i_64, i_95, i_65, i_97, i_92, i_94, i_93, i_40, i_131;
double d_49, d_50, d_51, d_52, d_53, d_54, d_56, d_41, d_42, d_43, d_44, d_45, d_46, d_47, d_48, d_66, d_67, d_68, d_69, d_70, d_71, d_72, d_57, d_58, d_59, d_60, d_61, d_62, d_63, d_81, d_82, d_83, d_84, d_85, d_86, d_87, d_88, d_73, d_74, d_75, d_76, d_77, d_78, d_79, d_80, d_98, d_99, d_100, d_
101, d_102, d_103, d_104, d_89, d_90, d_91, d_113, d_114, d_115, d_116, d_117, d_118, d_119, d_120, d_105, d_106, d_107, d_108, d_109, d_110, d_111, d_112, d_129, d_130, d_121, d_122, d_123, d_124, d_125, d_126, d_127, d_128, d_33, d_34, d_35, d_36, d_37, d_38, d_39, d_28, d_30, d_31, d_32;
__global ulong *_frame = (__global ulong *) &_heap_base[_frame_base];
// BLOCK 0
ui_0 = (uint) _frame[8];
ui_1 = (uint) _frame[9];
ui_2 = (uint) _frame[10];
ui_3 = (uint) _frame[11];
__private double ui_4[512];
__private double ui_5[512];
__private double ui_6[512];
__private double ui_7[512];
__private double ui_8[512];
__private double ui_9[512];
__private double ui_10[512];
ui_11 = ui_3 + 24;
i_12 = get_local_size(0);
*((__global int *) ui_11) = i_12;
ui_13 = ui_3 + 28;
i_14 = get_global_size(0);
*((__global int *) ui_13) = i_14;
ui_15 = ui_3 + 32;
i_16 = get_local_id(0);
*((__global int *) ui_15) = i_16;
ui_17 = ui_3 + 36;
i_18 = get_global_id(0);
*((__global int *) ui_17) = i_18;
i_19 = i_18 + 4;
i_20 = i_19 << 2;
i_21 = i_20 + 24;
ui_22 = ui_3 + i_21;
*((__global int *) ui_22) = i_18;
i_23 = i_18 << 7;
// BLOCK 1 MERGES [0 5 ]
i_24 = 0;
for(;i_24 < 165;)
{
// BLOCK 2
i_25 = i_24 << 3;
i_26 = i_25 + 24;
ui_27 = ui_0 + i_26;
d_28 = *((__global double *) ui_27);
ui_29 = ui_1 + i_26;
d_30 = *((__global double *) ui_29);
d_31 = (double) i_23;
d_32 = fma(d_31, 5.0E-5, 0.06);
d_33 = d_32 * 2.0;
d_34 = d_33 * 3.141592653589793;
d_35 = d_34 * d_28;
d_36 = cos(d_35);
d_37 = sin(d_35);
// BLOCK 3 MERGES [2 4 ]
d_38 = d_37;
d_39 = d_36;
i_40 = 0;
for(;i_40 < 128;)
{
// BLOCK 4
d_41 = ui_4[i_40];
d_42 = d_38 + d_41;
ui_4[i_40] = d_42;
d_41 = d_42;
d_43 = ui_5[i_40];
d_44 = d_39 + d_43;
ui_5[i_40] = d_44;
d_43 = d_44;
d_45 = ui_6[i_40];
d_46 = fma(d_38, d_38, d_45);
ui_6[i_40] = d_46;
d_45 = d_46;
d_47 = ui_7[i_40];
d_48 = fma(d_39, d_39, d_47);
ui_7[i_40] = d_48;
d_47 = d_48;
d_49 = ui_8[i_40];
d_50 = fma(d_38, d_39, d_49);
ui_8[i_40] = d_50;
d_49 = d_50;
d_51 = ui_9[i_40];
d_52 = fma(d_38, d_30, d_51);
ui_9[i_40] = d_52;
d_51 = d_52;
d_53 = ui_10[i_40];
d_54 = fma(d_39, d_30, d_53);
ui_10[i_40] = d_54;
d_53 = d_54;
i_55 = i_40 + 1;
d_56 = d_28 * 3.141592653589793E-4;
d_57 = cos(d_56);
d_58 = d_57 * d_39;
d_59 = sin(d_56);
d_60 = d_59 * d_38;
d_61 = d_58 - d_60;
d_62 = d_59 * d_39;
d_63 = fma(d_57, d_38, d_62);
d_38 = d_63;
d_39 = d_61;
i_40 = i_55;
} // B4
// BLOCK 5
i_64 = i_24 + 1;
i_24 = i_64;
} // B5
// BLOCK 6
barrier(CLK_LOCAL_MEM_FENCE);
// BLOCK 7 MERGES [6 11 ]
i_65 = 0;
for(;i_65 < 128;)
{
// BLOCK 8
d_66 = ui_7[i_65];
d_67 = ui_6[i_65];
d_68 = ui_8[i_65];
d_69 = ui_8[i_65];
d_70 = ui_5[i_65];
d_71 = ui_5[i_65];
d_72 = ui_6[i_65];
d_73 = ui_4[i_65];
d_74 = ui_4[i_65];
d_75 = ui_7[i_65];
d_76 = ui_4[i_65];
d_77 = ui_5[i_65];
d_78 = ui_8[i_65];
d_79 = d_76 * 2.0;
d_80 = d_79 * d_77;
d_81 = d_66 * d_67;
d_82 = d_68 * d_69;
d_83 = d_81 - d_82;
d_84 = d_83 * 165.0;
d_85 = d_70 * d_71;
d_86 = d_85 * d_72;
d_87 = d_84 - d_86;
d_88 = d_73 * d_74;
d_89 = d_88 * d_75;
d_90 = d_87 - d_89;
d_91 = fma(d_80, d_78, d_90);
i_92 = i_65 + 1;
i_93 = i_65 + i_23;
i_94 = i_93 << 3;
i_95 = i_94 + 24;
ui_96 = ui_2 + i_95;
i_97 = isless(d_91, 0.0);
if(i_97 == 1)
{
// BLOCK 9
*((__global double *) ui_96) = 1.0;
} // B9
else
{
// BLOCK 10
d_98 = ui_5[i_65];
d_99 = ui_9[i_65];
d_100 = ui_4[i_65];
d_101 = ui_10[i_65];
d_102 = ui_10[i_65];
d_103 = ui_6[i_65];
d_104 = ui_9[i_65];
d_105 = ui_8[i_65];
d_106 = ui_4[i_65];
d_107 = ui_9[i_65];
d_108 = ui_7[i_65];
d_109 = ui_10[i_65];
d_110 = ui_8[i_65];
d_111 = ui_5[i_65];
d_112 = ui_10[i_65];
d_113 = ui_9[i_65];
d_114 = d_98 * d_99;
d_115 = d_100 * d_101;
d_116 = d_114 - d_115;
d_117 = d_102 * d_103;
d_118 = d_104 * d_105;
d_119 = d_117 - d_118;
d_120 = d_119 * 165.0;
d_121 = fma(d_116, d_106, d_120);
d_122 = d_107 * d_108;
d_123 = d_109 * d_110;
d_124 = d_122 - d_123;
d_125 = d_124 * 165.0;
d_126 = d_111 * d_116;
d_127 = d_125 - d_126;
d_128 = d_127 * d_113;
d_129 = fma(d_121, d_112, d_128);
d_130 = d_129 / d_91;
*((__global double *) ui_96) = d_130;
} // B10
// BLOCK 11 MERGES [9 10 ]
i_131 = i_92;
i_65 = i_131;
} // B11
// BLOCK 12
return;
} // kernel
{
"PeriodSearch": {
"COPY_IN_TIME": "55480",
"TOTAL_DISPATCH_DATA_TRANSFERS_TIME": "12397",
"TOTAL_TASK_SCHEDULE_TIME": "432744662",
"COPY_OUT_TIME": "510240",
"TOTAL_KERNEL_TIME": "117545840",
"TOTAL_DRIVER_COMPILE_TIME": "1394951",
"TOTAL_GRAAL_COMPILE_TIME": "73815732",
"TOTAL_DISPATCH_KERNEL_TIME": "17385",
"TOTAL_BYTE_CODE_GENERATION": "7382591",
"TOTAL_COPY_IN_SIZE_BYTES": "5376",
"TOTAL_COPY_OUT_SIZE_BYTES": "3585676",
"PeriodSearch.t0": {
"METHOD": "MethodLeastSquareGPU.computeKernel",
"DEVICE_ID": "0:2",
"DEVICE": "AMD Radeon Pro 5500M Compute Engine",
"TOTAL_COPY_IN_SIZE_BYTES": "96",
"TASK_KERNEL_TIME": "117545840",
"TASK_COMPILE_GRAAL_TIME": "73815732",
"TASK_COMPILE_DRIVER_TIME": "1394951"
}
}
}
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void computeKernel(__global uchar *_heap_base, uint _frame_base, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics)
{
uint ui_15, ui_17, ui_11, ui_13, ui_200, ui_264, ui_328, ui_392, ui_456, ui_520, ui_584, ui_3, ui_2, ui_30, ui_1, ui_0, ui_28, ui_22;
int i_199, i_263, i_327, i_391, i_455, i_519, i_583, i_201, i_265, i_329, i_393, i_457, i_521, i_585, i_198, i_262, i_326, i_390, i_454, i_518, i_582, i_197, i_261, i_325, i_389, i_453, i_517, i_581, i_16, i_18, i_12, i_14, i_24, i_23, i_26, i_25, i_20, i_19, i_21, i_27, i_170;
double d_617, d_618, d_561, d_562, d_563, d_564, d_565, d_566, d_567, d_568, d_553, d_554, d_555, d_556, d_557, d_558, d_559, d_560, d_577, d_578, d_579, d_580, d_569, d_570, d_571, d_572, d_573, d_574, d_575, d_576, d_593, d_594, d_595, d_596, d_597, d_598, d_599, d_600, d_586, d_587, d_588, d_58
9, d_590, d_591, d_592, d_609, d_610, d_611, d_612, d_613, d_614, d_615, d_616, d_601, d_602, d_603, d_604, d_605, d_606, d_607, d_608, d_497, d_498, d_499, d_500, d_501, d_502, d_503, d_504, d_489, d_490, d_491, d_492, d_493, d_494, d_495, d_496, d_513, d_514, d_515, d_516, d_505, d_506, d_507, d_5
08, d_509, d_510, d_511, d_512, d_529, d_530, d_531, d_532, d_533, d_534, d_535, d_536, d_522, d_523, d_524, d_525, d_526, d_527, d_528, d_545, d_546, d_547, d_548, d_549, d_550, d_551, d_552, d_537, d_538, d_539, d_540, d_541, d_542, d_543, d_544, d_177, d_178, d_179, d_180, d_181, d_182, d_183, d_
184, d_169, d_171, d_172, d_173, d_174, d_175, d_176, d_193, d_194, d_195, d_196, d_185, d_186, d_187, d_188, d_189, d_190, d_191, d_192, d_209, d_210, d_211, d_212, d_213, d_214, d_215, d_216, d_202, d_203, d_204, d_205, d_206, d_207, d_208, d_225, d_226, d_227, d_228, d_229, d_230, d_231, d_232, d
_217, d_218, d_219, d_220, d_221, d_222, d_223, d_224, d_113, d_114, d_115, d_116, d_117, d_118, d_119, d_120, d_105, d_106, d_107, d_108, d_109, d_110, d_111, d_112, d_129, d_130, d_131, d_132, d_133, d_134, d_135, d_136, d_121, d_122, d_123, d_124, d_125, d_126, d_127, d_128, d_145, d_146, d_147,
d_148, d_149, d_150, d_151, d_152, d_137, d_138, d_139, d_140, d_141, d_142, d_143, d_144, d_161, d_162, d_163, d_164, d_165, d_166, d_167, d_168, d_153, d_154, d_155, d_156, d_157, d_158, d_159, d_160, d_49, d_50, d_51, d_52, d_53, d_54, d_55, d_56, d_41, d_42, d_43, d_44, d_45, d_46, d_47, d_48, d
_65, d_66, d_67, d_68, d_69, d_70, d_71, d_72, d_57, d_58, d_59, d_60, d_61, d_62, d_63, d_64, d_81, d_82, d_83, d_84, d_85, d_86, d_87, d_88, d_73, d_74, d_75, d_76, d_77, d_78, d_79, d_80, d_97, d_98, d_99, d_100, d_101, d_102, d_103, d_104, d_89, d_90, d_91, d_92, d_93, d_94, d_95, d_96, d_33, d_
34, d_35, d_36, d_37, d_38, d_39, d_40, d_29, d_31, d_32, d_433, d_434, d_435, d_436, d_437, d_438, d_439, d_440, d_425, d_426, d_427, d_428, d_429, d_430, d_431, d_432, d_449, d_450, d_451, d_452, d_441, d_442, d_443, d_444, d_445, d_446, d_447, d_448, d_465, d_466, d_467, d_468, d_469, d_470, d_47
1, d_472, d_458, d_459, d_460, d_461, d_462, d_463, d_464, d_481, d_482, d_483, d_484, d_485, d_486, d_487, d_488, d_473, d_474, d_475, d_476, d_477, d_478, d_479, d_480, d_369, d_370, d_371, d_372, d_373, d_374, d_375, d_376, d_361, d_362, d_363, d_364, d_365, d_366, d_367, d_368, d_385, d_386, d_3
87, d_388, d_377, d_378, d_379, d_380, d_381, d_382, d_383, d_384, d_401, d_402, d_403, d_404, d_405, d_406, d_407, d_408, d_394, d_395, d_396, d_397, d_398, d_399, d_400, d_417, d_418, d_419, d_420, d_421, d_422, d_423, d_424, d_409, d_410, d_411, d_412, d_413, d_414, d_415, d_416, d_305, d_306, d_
307, d_308, d_309, d_310, d_311, d_312, d_297, d_298, d_299, d_300, d_301, d_302, d_303, d_304, d_321, d_322, d_323, d_324, d_313, d_314, d_315, d_316, d_317, d_318, d_319, d_320, d_337, d_338, d_339, d_340, d_341, d_342, d_343, d_344, d_330, d_331, d_332, d_333, d_334, d_335, d_336, d_353, d_354, d
_355, d_356, d_357, d_358, d_359, d_360, d_345, d_346, d_347, d_348, d_349, d_350, d_351, d_352, d_241, d_242, d_243, d_244, d_245, d_246, d_247, d_248, d_233, d_234, d_235, d_236, d_237, d_238, d_239, d_240, d_257, d_258, d_259, d_260, d_249, d_250, d_251, d_252, d_253, d_254, d_255, d_256, d_273,
d_274, d_275, d_276, d_277, d_278, d_279, d_280, d_266, d_267, d_268, d_269, d_270, d_271, d_272, d_289, d_290, d_291, d_292, d_293, d_294, d_295, d_296, d_281, d_282, d_283, d_284, d_285, d_286, d_287, d_288;
__global ulong *_frame = (__global ulong *) &_heap_base[_frame_base];
// BLOCK 0
ui_0 = (uint) _frame[8];
ui_1 = (uint) _frame[9];
ui_2 = (uint) _frame[10];
ui_3 = (uint) _frame[11];
__private double ui_4[512];
__private double ui_5[512];
__private double ui_6[512];
__private double ui_7[512];
__private double ui_8[512];
__private double ui_9[512];
__private double ui_10[512];
ui_11 = ui_3 + 24;
i_12 = get_local_size(0);
*((__global int *) ui_11) = i_12;
ui_13 = ui_3 + 28;
i_14 = get_global_size(0);
*((__global int *) ui_13) = i_14;
ui_15 = ui_3 + 32;
i_16 = get_local_id(0);
*((__global int *) ui_15) = i_16;
ui_17 = ui_3 + 36;
i_18 = get_global_id(0);
*((__global int *) ui_17) = i_18;
i_19 = i_18 + 4;
i_20 = i_19 << 2;
i_21 = i_20 + 24;
ui_22 = ui_3 + i_21;
*((__global int *) ui_22) = i_18;
i_23 = i_18 << 3;
i_24 = i_23 - i_18;
// BLOCK 1 MERGES [0 2 ]
i_25 = 0;
for(;i_25 < 165;)
{
// BLOCK 2
i_26 = i_25 << 3;
i_27 = i_26 + 24;
ui_28 = ui_0 + i_27;
d_29 = *((__global double *) ui_28);
ui_30 = ui_1 + i_27;
d_31 = *((__global double *) ui_30);
d_32 = ui_4[0];
d_33 = (double) i_24;
d_34 = fma(d_33, 5.0E-6, 6.461485);
d_35 = d_34 * 2.0;
d_36 = d_35 * 3.141592653589793;
d_37 = d_36 * d_29;
d_38 = sin(d_37);
d_39 = d_38 + d_32;
ui_4[0] = d_39;
d_32 = d_39;
d_40 = ui_5[0];
d_41 = cos(d_37);
d_42 = d_41 + d_40;
ui_5[0] = d_42;
d_40 = d_42;
d_43 = ui_6[0];
d_44 = fma(d_38, d_38, d_43);
ui_6[0] = d_44;
d_43 = d_44;
d_45 = ui_7[0];
d_46 = fma(d_41, d_41, d_45);
ui_7[0] = d_46;
d_45 = d_46;
d_47 = ui_8[0];
d_48 = fma(d_38, d_41, d_47);
ui_8[0] = d_48;
d_47 = d_48;
d_49 = ui_9[0];
d_50 = fma(d_38, d_31, d_49);
ui_9[0] = d_50;
d_49 = d_50;
d_51 = ui_10[0];
d_52 = fma(d_41, d_31, d_51);
ui_10[0] = d_52;
d_51 = d_52;
d_53 = ui_4[1];
d_54 = d_29 * 3.1415926535897935E-5;
d_55 = cos(d_54);
d_56 = sin(d_54);
d_57 = d_56 * d_41;
d_58 = fma(d_38, d_55, d_57);
d_59 = d_58 + d_53;
ui_4[1] = d_59;
d_53 = d_59;
d_60 = ui_5[1];
d_61 = d_41 * d_55;
d_62 = d_38 * d_56;
d_63 = d_61 - d_62;
d_64 = d_63 + d_60;
ui_5[1] = d_64;
d_60 = d_64;
d_65 = ui_6[1];
d_66 = fma(d_58, d_58, d_65);
ui_6[1] = d_66;
d_65 = d_66;
d_67 = ui_7[1];
d_68 = fma(d_63, d_63, d_67);
ui_7[1] = d_68;
d_67 = d_68;
d_69 = ui_8[1];
d_70 = fma(d_63, d_58, d_69);
ui_8[1] = d_70;
d_69 = d_70;
d_71 = ui_9[1];
d_72 = fma(d_58, d_31, d_71);
ui_9[1] = d_72;
d_71 = d_72;
d_73 = ui_10[1];
d_74 = fma(d_63, d_31, d_73);
ui_10[1] = d_74;
d_73 = d_74;
d_75 = ui_4[2];
d_76 = d_56 * d_63;
d_77 = fma(d_55, d_58, d_76);
d_78 = d_77 + d_75;
ui_4[2] = d_78;
d_75 = d_78;
d_79 = ui_5[2];
d_80 = d_55 * d_63;
d_81 = d_56 * d_58;
d_82 = d_80 - d_81;
d_83 = d_82 + d_79;
ui_5[2] = d_83;
d_79 = d_83;
d_84 = ui_6[2];
d_85 = fma(d_77, d_77, d_84);
ui_6[2] = d_85;
d_84 = d_85;
d_86 = ui_7[2];
d_87 = fma(d_82, d_82, d_86);
ui_7[2] = d_87;
d_86 = d_87;
d_88 = ui_8[2];
d_89 = fma(d_82, d_77, d_88);
ui_8[2] = d_89;
d_88 = d_89;
d_90 = ui_9[2];
d_91 = fma(d_77, d_31, d_90);
ui_9[2] = d_91;
d_90 = d_91;
d_92 = ui_10[2];
d_93 = fma(d_82, d_31, d_92);
ui_10[2] = d_93;
d_92 = d_93;
d_94 = ui_4[3];
d_95 = d_56 * d_82;
d_96 = fma(d_55, d_77, d_95);
d_97 = d_96 + d_94;
ui_4[3] = d_97;
d_94 = d_97;
d_98 = ui_5[3];
d_99 = d_55 * d_82;
d_100 = d_56 * d_77;
d_101 = d_99 - d_100;
d_102 = d_101 + d_98;
ui_5[3] = d_102;
d_98 = d_102;
d_103 = ui_6[3];
d_104 = fma(d_96, d_96, d_103);
ui_6[3] = d_104;
d_103 = d_104;
d_105 = ui_7[3];
d_106 = fma(d_101, d_101, d_105);
ui_7[3] = d_106;
d_105 = d_106;
d_107 = ui_8[3];
d_108 = fma(d_101, d_96, d_107);
ui_8[3] = d_108;
d_107 = d_108;
d_109 = ui_9[3];
d_110 = fma(d_96, d_31, d_109);
ui_9[3] = d_110;
d_109 = d_110;
d_111 = ui_10[3];
d_112 = fma(d_101, d_31, d_111);
ui_10[3] = d_112;
d_111 = d_112;
d_113 = ui_4[4];
d_114 = d_56 * d_101;
d_115 = fma(d_55, d_96, d_114);
d_116 = d_115 + d_113;
ui_4[4] = d_116;
d_113 = d_116;
d_117 = ui_5[4];
d_118 = d_55 * d_101;
d_119 = d_56 * d_96;
d_120 = d_118 - d_119;
d_121 = d_120 + d_117;
ui_5[4] = d_121;
d_117 = d_121;
d_122 = ui_6[4];
d_123 = fma(d_115, d_115, d_122);
ui_6[4] = d_123;
d_122 = d_123;
d_124 = ui_7[4];
d_125 = fma(d_120, d_120, d_124);
ui_7[4] = d_125;
d_124 = d_125;
d_126 = ui_8[4];
d_127 = fma(d_120, d_115, d_126);
ui_8[4] = d_127;
d_126 = d_127;
d_128 = ui_9[4];
d_129 = fma(d_115, d_31, d_128);
ui_9[4] = d_129;
d_128 = d_129;
d_130 = ui_10[4];
d_131 = fma(d_120, d_31, d_130);
ui_10[4] = d_131;
d_130 = d_131;
d_132 = ui_4[5];
d_133 = d_56 * d_120;
d_134 = fma(d_55, d_115, d_133);
d_135 = d_134 + d_132;
ui_4[5] = d_135;
d_132 = d_135;
d_136 = ui_5[5];
d_137 = d_55 * d_120;
d_138 = d_56 * d_115;
d_139 = d_137 - d_138;
d_140 = d_139 + d_136;
ui_5[5] = d_140;
d_136 = d_140;
d_141 = ui_6[5];
d_142 = fma(d_134, d_134, d_141);
ui_6[5] = d_142;
d_141 = d_142;
d_143 = ui_7[5];
d_144 = fma(d_139, d_139, d_143);
ui_7[5] = d_144;
d_143 = d_144;
d_145 = ui_8[5];
d_146 = fma(d_139, d_134, d_145);
ui_8[5] = d_146;
d_145 = d_146;
d_147 = ui_9[5];
d_148 = fma(d_134, d_31, d_147);
ui_9[5] = d_148;
d_147 = d_148;
d_149 = ui_10[5];
d_150 = fma(d_139, d_31, d_149);
ui_10[5] = d_150;
d_149 = d_150;
d_151 = ui_4[6];
d_152 = d_56 * d_139;
d_153 = fma(d_55, d_134, d_152);
d_154 = d_153 + d_151;
ui_4[6] = d_154;
d_151 = d_154;
d_155 = ui_5[6];
d_156 = d_55 * d_139;
d_157 = d_56 * d_134;
d_158 = d_156 - d_157;
d_159 = d_158 + d_155;
ui_5[6] = d_159;
d_155 = d_159;
d_160 = ui_6[6];
d_161 = fma(d_153, d_153, d_160);
ui_6[6] = d_161;
d_160 = d_161;
d_162 = ui_7[6];
d_163 = fma(d_158, d_158, d_162);
ui_7[6] = d_163;
d_162 = d_163;
d_164 = ui_8[6];
d_165 = fma(d_158, d_153, d_164);
ui_8[6] = d_165;
d_164 = d_165;
d_166 = ui_9[6];
d_167 = fma(d_153, d_31, d_166);
ui_9[6] = d_167;
d_166 = d_167;
d_168 = ui_10[6];
d_169 = fma(d_158, d_31, d_168);
ui_10[6] = d_169;
d_168 = d_169;
i_170 = i_25 + 1;
i_25 = i_170;
} // B2
// BLOCK 3
barrier(CLK_LOCAL_MEM_FENCE);
d_171 = ui_7[0];
d_172 = ui_6[0];
d_173 = ui_8[0];
d_174 = ui_8[0];
d_175 = ui_5[0];
d_176 = ui_5[0];
d_177 = ui_6[0];
d_178 = ui_4[0];
d_179 = ui_4[0];
d_180 = ui_7[0];
d_181 = ui_4[0];
d_182 = ui_5[0];
d_183 = ui_8[0];
d_184 = d_181 * 2.0;
d_185 = d_184 * d_182;
d_186 = d_171 * d_172;
d_187 = d_173 * d_174;
d_188 = d_186 - d_187;
d_189 = d_188 * 165.0;
d_190 = d_175 * d_176;
d_191 = d_190 * d_177;
d_192 = d_189 - d_191;
d_193 = d_178 * d_179;
d_194 = d_193 * d_180;
d_195 = d_192 - d_194;
d_196 = fma(d_185, d_183, d_195);
i_197 = i_18 << 6;
i_198 = i_197 - i_23;
i_199 = i_198 + 24;
ui_200 = ui_2 + i_199;
i_201 = isless(d_196, 0.0);
if(i_201 == 1)
{
// BLOCK 4
*((__global double *) ui_200) = 1.0;
} // B4
else
{
// BLOCK 5
d_202 = ui_5[0];
d_203 = ui_9[0];
d_204 = ui_4[0];
d_205 = ui_10[0];
d_206 = ui_10[0];
d_207 = ui_6[0];
d_208 = ui_9[0];
d_209 = ui_8[0];
d_210 = ui_4[0];
d_211 = ui_9[0];
d_212 = ui_7[0];
d_213 = ui_10[0];
d_214 = ui_8[0];
d_215 = ui_5[0];
d_216 = ui_10[0];
d_217 = ui_9[0];
d_218 = d_202 * d_203;
d_219 = d_204 * d_205;
d_220 = d_218 - d_219;
d_221 = d_206 * d_207;
d_222 = d_208 * d_209;
d_223 = d_221 - d_222;
d_224 = d_223 * 165.0;
d_225 = fma(d_220, d_210, d_224);
d_226 = d_211 * d_212;
d_227 = d_213 * d_214;
d_228 = d_226 - d_227;
d_229 = d_228 * 165.0;
d_230 = d_215 * d_220;
d_231 = d_229 - d_230;
d_232 = d_231 * d_217;
d_233 = fma(d_225, d_216, d_232);
d_234 = d_233 / d_196;
*((__global double *) ui_200) = d_234;
} // B5
// BLOCK 6 MERGES [4 5 ]
d_235 = ui_7[1];
d_236 = ui_6[1];
d_237 = ui_8[1];
d_238 = ui_8[1];
d_239 = ui_5[1];
d_240 = ui_5[1];
d_241 = ui_6[1];
d_242 = ui_4[1];
d_243 = ui_4[1];
d_244 = ui_7[1];
d_245 = ui_4[1];
d_246 = ui_5[1];
d_247 = ui_8[1];
d_248 = d_245 * 2.0;
d_249 = d_248 * d_246;
d_250 = d_235 * d_236;
d_251 = d_237 * d_238;
d_252 = d_250 - d_251;
d_253 = d_252 * 165.0;
d_254 = d_239 * d_240;
d_255 = d_254 * d_241;
d_256 = d_253 - d_255;
d_257 = d_242 * d_243;
d_258 = d_257 * d_244;
d_259 = d_256 - d_258;
d_260 = fma(d_249, d_247, d_259);
i_261 = i_24 + 1;
i_262 = i_261 << 3;
i_263 = i_262 + 24;
ui_264 = ui_2 + i_263;
i_265 = isless(d_260, 0.0);
if(i_265 == 1)
{
// BLOCK 7
*((__global double *) ui_264) = 1.0;
} // B7
else
{
// BLOCK 8
d_266 = ui_5[1];
d_267 = ui_9[1];
d_268 = ui_4[1];
d_269 = ui_10[1];
d_270 = ui_10[1];
d_271 = ui_6[1];
d_272 = ui_9[1];
d_273 = ui_8[1];
d_274 = ui_4[1];
d_275 = ui_9[1];
d_276 = ui_7[1];
d_277 = ui_10[1];
d_278 = ui_8[1];
d_279 = ui_5[1];
d_280 = ui_10[1];
d_281 = ui_9[1];
d_282 = d_266 * d_267;
d_283 = d_268 * d_269;
d_284 = d_282 - d_283;
d_285 = d_270 * d_271;
d_286 = d_272 * d_273;
d_287 = d_285 - d_286;
d_288 = d_287 * 165.0;
d_289 = fma(d_284, d_274, d_288);
d_290 = d_275 * d_276;
d_291 = d_277 * d_278;
d_292 = d_290 - d_291;
d_293 = d_292 * 165.0;
d_294 = d_279 * d_284;
d_295 = d_293 - d_294;
d_296 = d_295 * d_281;
d_297 = fma(d_289, d_280, d_296);
d_298 = d_297 / d_260;
*((__global double *) ui_264) = d_298;
} // B8
// BLOCK 9 MERGES [7 8 ]
d_299 = ui_7[2];
d_300 = ui_6[2];
d_301 = ui_8[2];
d_302 = ui_8[2];
d_303 = ui_5[2];
d_304 = ui_5[2];
d_305 = ui_6[2];
d_306 = ui_4[2];
d_307 = ui_4[2];
d_308 = ui_7[2];
d_309 = ui_4[2];
d_310 = ui_5[2];
d_311 = ui_8[2];
d_312 = d_309 * 2.0;
d_313 = d_312 * d_310;
d_314 = d_299 * d_300;
d_315 = d_301 * d_302;
d_316 = d_314 - d_315;
d_317 = d_316 * 165.0;
d_318 = d_303 * d_304;
d_319 = d_318 * d_305;
d_320 = d_317 - d_319;
d_321 = d_306 * d_307;
d_322 = d_321 * d_308;
d_323 = d_320 - d_322;
d_324 = fma(d_313, d_311, d_323);
i_325 = i_24 + 2;
i_326 = i_325 << 3;
i_327 = i_326 + 24;
ui_328 = ui_2 + i_327;
i_329 = isless(d_324, 0.0);
if(i_329 == 1)
{
// BLOCK 10
*((__global double *) ui_328) = 1.0;
} // B10
else
{
// BLOCK 11
d_330 = ui_5[2];
d_331 = ui_9[2];
d_332 = ui_4[2];
d_333 = ui_10[2];
d_334 = ui_10[2];
d_335 = ui_6[2];
d_336 = ui_9[2];
d_337 = ui_8[2];
d_338 = ui_4[2];
d_339 = ui_9[2];
d_340 = ui_7[2];
d_341 = ui_10[2];
d_342 = ui_8[2];
d_343 = ui_5[2];
d_344 = ui_10[2];
d_345 = ui_9[2];
d_346 = d_330 * d_331;
d_347 = d_332 * d_333;
d_348 = d_346 - d_347;
d_349 = d_334 * d_335;
d_350 = d_336 * d_337;
d_351 = d_349 - d_350;
d_352 = d_351 * 165.0;
d_353 = fma(d_348, d_338, d_352);
d_354 = d_339 * d_340;
d_355 = d_341 * d_342;
d_356 = d_354 - d_355;
d_357 = d_356 * 165.0;
d_358 = d_343 * d_348;
d_359 = d_357 - d_358;
d_360 = d_359 * d_345;
d_361 = fma(d_353, d_344, d_360);
d_362 = d_361 / d_324;
*((__global double *) ui_328) = d_362;
} // B11
// BLOCK 12 MERGES [10 11 ]
d_363 = ui_7[3];
d_364 = ui_6[3];
d_365 = ui_8[3];
d_366 = ui_8[3];
d_367 = ui_5[3];
d_368 = ui_5[3];
d_369 = ui_6[3];
d_370 = ui_4[3];
d_371 = ui_4[3];
d_372 = ui_7[3];
d_373 = ui_4[3];
d_374 = ui_5[3];
d_375 = ui_8[3];
d_376 = d_373 * 2.0;
d_377 = d_376 * d_374;
d_378 = d_363 * d_364;
d_379 = d_365 * d_366;
d_380 = d_378 - d_379;
d_381 = d_380 * 165.0;
d_382 = d_367 * d_368;
d_383 = d_382 * d_369;
d_384 = d_381 - d_383;
d_385 = d_370 * d_371;
d_386 = d_385 * d_372;
d_387 = d_384 - d_386;
d_388 = fma(d_377, d_375, d_387);
i_389 = i_24 + 3;
i_390 = i_389 << 3;
i_391 = i_390 + 24;
ui_392 = ui_2 + i_391;
i_393 = isless(d_388, 0.0);
if(i_393 == 1)
{
// BLOCK 13
*((__global double *) ui_392) = 1.0;
} // B13
else
{
// BLOCK 14
d_394 = ui_5[3];
d_395 = ui_9[3];
d_396 = ui_4[3];
d_397 = ui_10[3];
d_398 = ui_10[3];
d_399 = ui_6[3];
d_400 = ui_9[3];
d_401 = ui_8[3];
d_402 = ui_4[3];
d_403 = ui_9[3];
d_404 = ui_7[3];
d_405 = ui_10[3];
d_406 = ui_8[3];
d_407 = ui_5[3];
d_408 = ui_10[3];
d_409 = ui_9[3];
d_410 = d_394 * d_395;
d_411 = d_396 * d_397;
d_412 = d_410 - d_411;
d_413 = d_398 * d_399;
d_414 = d_400 * d_401;
d_415 = d_413 - d_414;
d_416 = d_415 * 165.0;
d_417 = fma(d_412, d_402, d_416);
d_418 = d_403 * d_404;
d_419 = d_405 * d_406;
d_420 = d_418 - d_419;
d_421 = d_420 * 165.0;
d_422 = d_407 * d_412;
d_423 = d_421 - d_422;
d_424 = d_423 * d_409;
d_425 = fma(d_417, d_408, d_424);
d_426 = d_425 / d_388;
*((__global double *) ui_392) = d_426;
} // B14
// BLOCK 15 MERGES [13 14 ]
d_427 = ui_7[4];
d_428 = ui_6[4];
d_429 = ui_8[4];
d_430 = ui_8[4];
d_431 = ui_5[4];
d_432 = ui_5[4];
d_433 = ui_6[4];
d_434 = ui_4[4];
d_435 = ui_4[4];
d_436 = ui_7[4];
d_437 = ui_4[4];
d_438 = ui_5[4];
d_439 = ui_8[4];
d_440 = d_437 * 2.0;
d_441 = d_440 * d_438;
d_442 = d_427 * d_428;
d_443 = d_429 * d_430;
d_444 = d_442 - d_443;
d_445 = d_444 * 165.0;
d_446 = d_431 * d_432;
d_447 = d_446 * d_433;
d_448 = d_445 - d_447;
d_449 = d_434 * d_435;
d_450 = d_449 * d_436;
d_451 = d_448 - d_450;
d_452 = fma(d_441, d_439, d_451);
i_453 = i_24 + 4;
i_454 = i_453 << 3;
i_455 = i_454 + 24;
ui_456 = ui_2 + i_455;
i_457 = isless(d_452, 0.0);
if(i_457 == 1)
{
// BLOCK 16
*((__global double *) ui_456) = 1.0;
} // B16
else
{
// BLOCK 17
d_458 = ui_5[4];
d_459 = ui_9[4];
d_460 = ui_4[4];
d_461 = ui_10[4];
d_462 = ui_10[4];
d_463 = ui_6[4];
d_464 = ui_9[4];
d_465 = ui_8[4];
d_466 = ui_4[4];
d_467 = ui_9[4];
d_468 = ui_7[4];
d_469 = ui_10[4];
d_470 = ui_8[4];
d_471 = ui_5[4];
d_472 = ui_10[4];
d_473 = ui_9[4];
d_474 = d_458 * d_459;
d_475 = d_460 * d_461;
d_476 = d_474 - d_475;
d_477 = d_462 * d_463;
d_478 = d_464 * d_465;
d_479 = d_477 - d_478;
d_480 = d_479 * 165.0;
d_481 = fma(d_476, d_466, d_480);
d_482 = d_467 * d_468;
d_483 = d_469 * d_470;
d_484 = d_482 - d_483;
d_485 = d_484 * 165.0;
d_486 = d_471 * d_476;
d_487 = d_485 - d_486;
d_488 = d_487 * d_473;
d_489 = fma(d_481, d_472, d_488);
d_490 = d_489 / d_452;
*((__global double *) ui_456) = d_490;
} // B17
// BLOCK 18 MERGES [16 17 ]
d_491 = ui_7[5];
d_492 = ui_6[5];
d_493 = ui_8[5];
d_494 = ui_8[5];
d_495 = ui_5[5];
d_496 = ui_5[5];
d_497 = ui_6[5];
d_498 = ui_4[5];
d_499 = ui_4[5];
d_500 = ui_7[5];
d_501 = ui_4[5];
d_502 = ui_5[5];
d_503 = ui_8[5];
d_504 = d_501 * 2.0;
d_505 = d_504 * d_502;
d_506 = d_491 * d_492;
d_507 = d_493 * d_494;
d_508 = d_506 - d_507;
d_509 = d_508 * 165.0;
d_510 = d_495 * d_496;
d_511 = d_510 * d_497;
d_512 = d_509 - d_511;
d_513 = d_498 * d_499;
d_514 = d_513 * d_500;
d_515 = d_512 - d_514;
d_516 = fma(d_505, d_503, d_515);
i_517 = i_24 + 5;
i_518 = i_517 << 3;
i_519 = i_518 + 24;
ui_520 = ui_2 + i_519;
i_521 = isless(d_516, 0.0);
if(i_521 == 1)
{
// BLOCK 19
*((__global double *) ui_520) = 1.0;
} // B19
else
{
// BLOCK 20
d_522 = ui_5[5];
d_523 = ui_9[5];
d_524 = ui_4[5];
d_525 = ui_10[5];
d_526 = ui_10[5];
d_527 = ui_6[5];
d_528 = ui_9[5];
d_529 = ui_8[5];
d_530 = ui_4[5];
d_531 = ui_9[5];
d_532 = ui_7[5];
d_533 = ui_10[5];
d_534 = ui_8[5];
d_535 = ui_5[5];
d_536 = ui_10[5];
d_537 = ui_9[5];
d_538 = d_522 * d_523;
d_539 = d_524 * d_525;
d_540 = d_538 - d_539;
d_541 = d_526 * d_527;
d_542 = d_528 * d_529;
d_543 = d_541 - d_542;
d_544 = d_543 * 165.0;
d_545 = fma(d_540, d_530, d_544);
d_546 = d_531 * d_532;
d_547 = d_533 * d_534;
d_548 = d_546 - d_547;
d_549 = d_548 * 165.0;
d_550 = d_535 * d_540;
d_551 = d_549 - d_550;
d_552 = d_551 * d_537;
d_553 = fma(d_545, d_536, d_552);
d_554 = d_553 / d_516;
*((__global double *) ui_520) = d_554;
} // B20
// BLOCK 21 MERGES [19 20 ]
d_555 = ui_7[6];
d_556 = ui_6[6];
d_557 = ui_8[6];
d_558 = ui_8[6];
d_559 = ui_5[6];
d_560 = ui_5[6];
d_561 = ui_6[6];
d_562 = ui_4[6];
d_563 = ui_4[6];
d_564 = ui_7[6];
d_565 = ui_4[6];
d_566 = ui_5[6];
d_567 = ui_8[6];
d_568 = d_565 * 2.0;
d_569 = d_568 * d_566;
d_570 = d_555 * d_556;
d_571 = d_557 * d_558;
d_572 = d_570 - d_571;
d_573 = d_572 * 165.0;
d_574 = d_559 * d_560;
d_575 = d_574 * d_561;
d_576 = d_573 - d_575;
d_577 = d_562 * d_563;
d_578 = d_577 * d_564;
d_579 = d_576 - d_578;
d_580 = fma(d_569, d_567, d_579);
i_581 = i_24 + 6;
i_582 = i_581 << 3;
i_583 = i_582 + 24;
ui_584 = ui_2 + i_583;
i_585 = isless(d_580, 0.0);
if(i_585 == 1)
{
// BLOCK 22
*((__global double *) ui_584) = 1.0;
} // B22
else
{
// BLOCK 23
d_586 = ui_5[6];
d_587 = ui_9[6];
d_588 = ui_4[6];
d_589 = ui_10[6];
d_590 = ui_10[6];
d_591 = ui_6[6];
d_592 = ui_9[6];
d_593 = ui_8[6];
d_594 = ui_4[6];
d_595 = ui_9[6];
d_596 = ui_7[6];
d_597 = ui_10[6];
d_598 = ui_8[6];
d_599 = ui_5[6];
d_600 = ui_10[6];
d_601 = ui_9[6];
d_602 = d_586 * d_587;
d_603 = d_588 * d_589;
d_604 = d_602 - d_603;
d_605 = d_590 * d_591;
d_606 = d_592 * d_593;
d_607 = d_605 - d_606;
d_608 = d_607 * 165.0;
d_609 = fma(d_604, d_594, d_608);
d_610 = d_595 * d_596;
d_611 = d_597 * d_598;
d_612 = d_610 - d_611;
d_613 = d_612 * 165.0;
d_614 = d_599 * d_604;
d_615 = d_613 - d_614;
d_616 = d_615 * d_601;
d_617 = fma(d_609, d_600, d_616);
d_618 = d_617 / d_580;
*((__global double *) ui_584) = d_618;
} // B23
// BLOCK 24 MERGES [22 23 ]
return;
} // B24
} // kernel
The second invocation finishes with the following error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> [CL_DEVICE_NOT_AVAILABLE] : OpenCL Error : Error: Build Program driver returned (-2)
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> OpenCL Warning : clBuildProgram failed: could not build program for 0x1021e00 (AMD Radeon Pro 5500M Compute Engine) (err:-2)
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> [CL_BUILD_ERROR] : OpenCL Build Error : Compiler build log:
<program source>:823:1: error: extraneous closing brace ('}')
} // kernel
^
How To Reproduce
Java kernelized code:
public static void computeKernel(KernelContext context, int tileSize, final double infirstScanFrequency, final double frequencyStep, final double deltaEpsilon, final double[] normObsTimes, final double[] normObsValues, final double[] amplitudes, int[] localGroups)
{
int tileIdx = context.globalIdx;
//testing kernel params
localGroups[0] = context.localGroupSizeX;
localGroups[1] = context.globalGroupSizeX;
localGroups[2] = context.localIdx;
localGroups[3] = context.globalIdx;
localGroups[4+context.globalIdx]=context.globalIdx;
//\testing kernel params
int startFreqIdx = tileIdx * tileSize;
int nObservations = normObsTimes.length;
final double omega = (infirstScanFrequency + startFreqIdx * frequencyStep) * 2 * Math.PI;
// starting frequency
final double deltaOmega = frequencyStep * 2 * Math.PI;
// frequency step
// final double[] sumSx = context.allocateDoubleLocalArray(constTileSize); this does not work, reported in another issue
final double[] sumSx = new double[constTileSize];
// sum of sin(obsTimes[i]*2pi*f)
final double[] sumCx = new double[constTileSize];
// sum of cos(obsTimes[i]*2pi*f)
final double[] sumSx2 = new double[constTileSize];
// sum of sin(obsTimes[i]*2pi*f)*sin(obsTimes[i]*2pi*f)
final double[] sumCx2 = new double[constTileSize];
// sum of cos(obsTimes[i]*2pi*f)*cos(obsTimes[i]*2pi*f)
final double[] sumSxCx = new double[constTileSize];
// sum of sin(obsTimes[i]*2pi*f)*cos(obsTimes[i]*2pi*f)
final double[] sumSxVal = new double[constTileSize];
// sum of sin(obsTimes[i]*2pi*f)*obsValue[i]
final double[] sumCxVal = new double[constTileSize];
// sum of cos(obsTimes[i]*2pi*f)*obsValue[i]
// for each set of observation data
for (int i = 0; i < nObservations; i++) {
final double obsTime = normObsTimes[i];
final double obsValue = normObsValues[i];
// calculate the starting phase and it's sine and cosine
final double phase = obsTime * omega;
double sPh = Math.sin(phase);
double cPh = Math.cos(phase);
// calculate the phase step and it's sine and cosine
final double dPhase = obsTime * deltaOmega;
final double sDPh = Math.sin(dPhase);
final double cDPh = Math.cos(dPhase);
// for each frequency to test, increment the phase with the phase step
for (int j = 0; j < tileSize; j++) {
sumSx[j] += sPh;
sumCx[j] += cPh;
sumSx2[j] += sPh*sPh;
sumCx2[j] += cPh*cPh;
sumSxCx[j] += cPh*sPh;
sumSxVal[j] += sPh*obsValue;
sumCxVal[j] += cPh*obsValue;
final double cT = cPh;
cPh = cT * cDPh - sPh * sDPh;
sPh = sPh * cDPh + cT * sDPh;
}
}
// calculate intermediate variables and results
for (int i = 0; i < tileSize; i++) {
final double d = nObservations * ( sumCx2[i] * sumSx2[i] - sumSxCx[i] * sumSxCx[i])
- sumCx[i] * sumCx[i] * sumSx2[i] - sumSx[i] * sumSx[i] * sumCx2[i]
+ 2 * sumSx[i] * sumCx[i] * sumSxCx[i];
if (d < deltaEpsilon) {
amplitudes[startFreqIdx + i] = MAXRANGE;
} else {
final double b = sumCx[i] * sumSxVal[i] - sumSx[i] * sumCxVal[i];
final double c1 = nObservations * ( sumCxVal[i] * sumSx2[i] - sumSxVal[i] * sumSxCx[i] ) + sumSx[i] * b;
final double c2 = nObservations * ( sumSxVal[i] * sumCx2[i] - sumCxVal[i] * sumSxCx[i] ) - sumCx[i] * b;
amplitudes[startFreqIdx + i] = ( c1 * sumCxVal[i] + c2 * sumSxVal[i] ) / d;
}
}
}
Invoked as
final double[] amplitudes = new double[this.nScanFrequencies];
// init context
int deviceNum = 2;
WorkerGrid workerGrid = new WorkerGrid1D(nScanFrequencies);
GridScheduler gridScheduler = new GridScheduler("PeriodSearch.t0", workerGrid);
KernelContext context = new KernelContext();
// Set the global work size as we slice by frequencies
workerGrid.setGlobalWork(4096, 1, 1); // single dimension
int tileSize = 128;
tileSize = amplitudes.length<tileSize?amplitudes.length:tileSize;
workerGrid.setLocalWork(tileSize,1,1);
//some debug info.
final int[] localGroup = new int[amplitudes.length+4];
IntStream.range(0, localGroup.length).parallel().forEach(idx -> {
localGroup[idx] = 0;
});
//\some debug info.
TaskSchedule task1 = new TaskSchedule("PeriodSearch") //
.streamIn(amplitudes) //does not change much here, input is relatively small
.task("t0", MethodLeastSquareGPU::computeKernel, context, tileSize , firstScanFrequency, frequencyStep, deltaEpsilon, normObsTimes, normObsValues, amplitudes, localGroup)
.streamOut(amplitudes,localGroup);
task1.execute(gridScheduler);
Provide a test-case and instructions of how to reproduce the issue.
Expected behavior
Correct code is generated for the call with the smaller input, with no OpenCL errors.
Computing system setup (please complete the following information):
- OS: macOS/Linux
- OpenCL Version 1.2,2,3
- TornadoVM 0.13-dev
Additional context
Issue Analytics
- State:
- Created 2 years ago
- Comments:11 (11 by maintainers)
Top GitHub Comments
For SPIRV backend the following exception appears:
Devices:
Could the existing -Dtornado.recover.bailout=False option be used for this?