question-mark
Stuck on an issue?

Lightrun Answers was designed to reduce the constant googling that comes with debugging 3rd party libraries. It collects links to all the places you might be looking at while hunting down a tough bug.

And, if you’re still stuck at the end, we’re happy to hop on a call to see how we can help out.

Kernel produced does not compile for different input of the same fn.

See original GitHub issue

Describe the bug: This one is a continuation of https://github.com/beehive-lab/TornadoVM/issues/153. We tried to port the code to Kernel semantics, but the results are puzzling: sometimes the kernel is not compiled at all between calls. This is clearly related to the different way arguments are passed when TornadoVM detects that the input is small (i.e. the length of the input array is ~10 vs. 1000).

Below, we can see the kernel for a larger array (which sometimes works, depending on the input, as described in a subsequent issue). Then TornadoVM generates code that seems to unfold the arrays passed and generates 100s of input params.

The second time the function is called, the generated code is broken and fails to compile.

2732 INFO  [main] gaia.cu7.algo.character.periodsearch.methods.test.MethodLeastSquareComputeComparison  [] - Sequential LSQ: 0.682237031
#pragma OPENCL EXTENSION cl_khr_fp64 : enable  
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable  
// Generated helper kernel (quoted verbatim from the TornadoVM dump): stores the heap
// base pointer, cast to ulong, in slot 0 of the frame that starts _frame_base bytes
// into _heap_base.
// _constant_region, _local_region and _atomics are not referenced in the body.
__kernel void lookupBufferAddress(__global uchar *_heap_base, uint _frame_base, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics)
{

  // View the bytes at _heap_base + _frame_base as an array of 64-bit slots.
  __global ulong *_frame = (__global ulong *) &_heap_base[_frame_base];


  // BLOCK 0
  _frame[0]  =  (ulong) _heap_base;
}  //  kernel

#pragma OPENCL EXTENSION cl_khr_fp64 : enable  
// Second dump of the same generated helper kernel; the body is identical to the first
// dump: it stores the heap base pointer, cast to ulong, in slot 0 of the frame that
// starts _frame_base bytes into _heap_base.
// _constant_region, _local_region and _atomics are not referenced in the body.
__kernel void lookupBufferAddress(__global uchar *_heap_base, uint _frame_base, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics)
{

  // View the bytes at _heap_base + _frame_base as an array of 64-bit slots.
  __global ulong *_frame = (__global ulong *) &_heap_base[_frame_base];


  // BLOCK 0
  _frame[0]  =  (ulong) _heap_base;
}  //  kernel

#pragma OPENCL EXTENSION cl_khr_fp64 : enable  
// Generated OpenCL for computeKernel as dumped for the first invocation in this report
// (quoted verbatim; the code is an exhibit of what the tool emitted).
//
// Shape of the computation, as visible in the body:
//   - frame slots 8..11 are read as four 32-bit values (ui_0..ui_3) that are then used
//     as base addresses of global buffers: ui_0 and ui_1 are read as doubles, ui_2 is
//     written as doubles, ui_3 is written as ints;
//   - each work-item handles 128 consecutive output elements starting at gid * 128
//     (i_23 = gid << 7, inner loops run to 128);
//   - an outer loop of 165 iterations accumulates seven running sums per element into
//     the private arrays ui_4..ui_10, then a second loop combines the sums and writes
//     one double per element to the ui_2 buffer.
// Presumably ui_0/ui_1/ui_2/ui_3 are normObsTimes/normObsValues/amplitudes/localGroups
// from the Java method shown later in the report, and the +24 byte offset skips an
// array header -- TODO confirm against the TornadoVM runtime.
// _constant_region, _local_region and _atomics are not referenced in the body.
__kernel void computeKernel(__global uchar *_heap_base, uint _frame_base, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics)
{
  uint ui_3, ui_2, ui_15, ui_1, ui_17, ui_0, ui_96, ui_11, ui_27, ui_13, ui_29, ui_22; 
  int i_16, i_18, i_12, i_14, i_24, i_23, i_55, i_26, i_25, i_20, i_19, i_21, i_64, i_95, i_65, i_97, i_92, i_94, i_93, i_40, i_131; 
  double d_49, d_50, d_51, d_52, d_53, d_54, d_56, d_41, d_42, d_43, d_44, d_45, d_46, d_47, d_48, d_66, d_67, d_68, d_69, d_70, d_71, d_72, d_57, d_58, d_59, d_60, d_61, d_62, d_63, d_81, d_82, d_83, d_84, d_85, d_86, d_87, d_88, d_73, d_74, d_75, d_76, d_77, d_78, d_79, d_80, d_98, d_99, d_100, d_
101, d_102, d_103, d_104, d_89, d_90, d_91, d_113, d_114, d_115, d_116, d_117, d_118, d_119, d_120, d_105, d_106, d_107, d_108, d_109, d_110, d_111, d_112, d_129, d_130, d_121, d_122, d_123, d_124, d_125, d_126, d_127, d_128, d_33, d_34, d_35, d_36, d_37, d_38, d_39, d_28, d_30, d_31, d_32; 

  __global ulong *_frame = (__global ulong *) &_heap_base[_frame_base];


  // BLOCK 0
  // The 64-bit frame entries are truncated to uint and later cast to __global pointers.
  ui_0  =  (uint) _frame[8];
  ui_1  =  (uint) _frame[9];
  ui_2  =  (uint) _frame[10];
  ui_3  =  (uint) _frame[11];
  // NOTE(review): these seven private arrays are read in BLOCK 4 (e.g. ui_4[i_40])
  // without any store that initialises them first being visible in this kernel.
  // Only indices 0..127 of the 512 elements are touched below.
  __private double ui_4[512];
  __private double ui_5[512];
  __private double ui_6[512];
  __private double ui_7[512];
  __private double ui_8[512];
  __private double ui_9[512];
  __private double ui_10[512];
  // Bookkeeping stores into the int buffer at ui_3: local size, global size, local id
  // and global id at byte offsets 24, 28, 32, 36, then the global id again at element
  // (4 + gid). This mirrors the localGroups[...] assignments in the Java source.
  ui_11  =  ui_3 + 24;
  i_12  =  get_local_size(0);
  *((__global int *) ui_11)  =  i_12;
  ui_13  =  ui_3 + 28;
  i_14  =  get_global_size(0);
  *((__global int *) ui_13)  =  i_14;
  ui_15  =  ui_3 + 32;
  i_16  =  get_local_id(0);
  *((__global int *) ui_15)  =  i_16;
  ui_17  =  ui_3 + 36;
  i_18  =  get_global_id(0);
  *((__global int *) ui_17)  =  i_18;
  i_19  =  i_18 + 4;
  i_20  =  i_19 << 2;
  i_21  =  i_20 + 24;
  ui_22  =  ui_3 + i_21;
  *((__global int *) ui_22)  =  i_18;
  // i_23 = gid * 128: index of the first element handled by this work-item.
  i_23  =  i_18 << 7;
  // BLOCK 1 MERGES [0 5 ]
  i_24  =  0;
  // Outer loop: 165 iterations, one input pair (d_28 from ui_0, d_30 from ui_1) each.
  for(;i_24 < 165;)
  {
    // BLOCK 2
    i_25  =  i_24 << 3;
    i_26  =  i_25 + 24;
    ui_27  =  ui_0 + i_26;
    d_28  =  *((__global double *) ui_27);
    ui_29  =  ui_1 + i_26;
    d_30  =  *((__global double *) ui_29);
    // Starting angle for this work-item: (0.06 + i_23 * 5.0E-5) * 2 * pi * d_28.
    d_31  =  (double) i_23;
    d_32  =  fma(d_31, 5.0E-5, 0.06);
    d_33  =  d_32 * 2.0;
    d_34  =  d_33 * 3.141592653589793;
    d_35  =  d_34 * d_28;
    d_36  =  cos(d_35);
    d_37  =  sin(d_35);
    // BLOCK 3 MERGES [2 4 ]
    // d_38 / d_39 carry the running sine / cosine through the inner loop.
    d_38  =  d_37;
    d_39  =  d_36;
    i_40  =  0;
    for(;i_40 < 128;)
    {
      // BLOCK 4
      // Accumulate sin, cos, sin^2, cos^2, sin*cos, sin*d_30 and cos*d_30 into
      // ui_4 .. ui_10 at index i_40.
      d_41  =  ui_4[i_40];
      d_42  =  d_38 + d_41;
      ui_4[i_40]  =  d_42;
      d_41  =  d_42;
      d_43  =  ui_5[i_40];
      d_44  =  d_39 + d_43;
      ui_5[i_40]  =  d_44;
      d_43  =  d_44;
      d_45  =  ui_6[i_40];
      d_46  =  fma(d_38, d_38, d_45);
      ui_6[i_40]  =  d_46;
      d_45  =  d_46;
      d_47  =  ui_7[i_40];
      d_48  =  fma(d_39, d_39, d_47);
      ui_7[i_40]  =  d_48;
      d_47  =  d_48;
      d_49  =  ui_8[i_40];
      d_50  =  fma(d_38, d_39, d_49);
      ui_8[i_40]  =  d_50;
      d_49  =  d_50;
      d_51  =  ui_9[i_40];
      d_52  =  fma(d_38, d_30, d_51);
      ui_9[i_40]  =  d_52;
      d_51  =  d_52;
      d_53  =  ui_10[i_40];
      d_54  =  fma(d_39, d_30, d_53);
      ui_10[i_40]  =  d_54;
      d_53  =  d_54;
      i_55  =  i_40 + 1;
      // Advance the running sine/cosine by the angle d_56 using the angle-addition
      // formulas: cos' = cos(d)*cos - sin(d)*sin, sin' = cos(d)*sin + sin(d)*cos.
      d_56  =  d_28 * 3.141592653589793E-4;
      d_57  =  cos(d_56);
      d_58  =  d_57 * d_39;
      d_59  =  sin(d_56);
      d_60  =  d_59 * d_38;
      d_61  =  d_58 - d_60;
      d_62  =  d_59 * d_39;
      d_63  =  fma(d_57, d_38, d_62);
      d_38  =  d_63;
      d_39  =  d_61;
      i_40  =  i_55;
    }  // B4
    // BLOCK 5
    i_64  =  i_24 + 1;
    i_24  =  i_64;
  }  // B5
  // BLOCK 6
  // Work-group barrier with a local-memory fence between accumulation and output.
  barrier(CLK_LOCAL_MEM_FENCE);
  // BLOCK 7 MERGES [6 11 ]
  i_65  =  0;
  // Output loop: one double per element is written to the ui_2 buffer at byte offset
  // 24 + 8 * (i_65 + i_23).
  for(;i_65 < 128;)
  {
    // BLOCK 8
    // d_91 is the denominator: 165*(ui_7*ui_6 - ui_8^2) - ui_5^2*ui_6 - ui_4^2*ui_7
    // + 2*ui_4*ui_5*ui_8, all taken at index i_65.
    d_66  =  ui_7[i_65];
    d_67  =  ui_6[i_65];
    d_68  =  ui_8[i_65];
    d_69  =  ui_8[i_65];
    d_70  =  ui_5[i_65];
    d_71  =  ui_5[i_65];
    d_72  =  ui_6[i_65];
    d_73  =  ui_4[i_65];
    d_74  =  ui_4[i_65];
    d_75  =  ui_7[i_65];
    d_76  =  ui_4[i_65];
    d_77  =  ui_5[i_65];
    d_78  =  ui_8[i_65];
    d_79  =  d_76 * 2.0;
    d_80  =  d_79 * d_77;
    d_81  =  d_66 * d_67;
    d_82  =  d_68 * d_69;
    d_83  =  d_81 - d_82;
    d_84  =  d_83 * 165.0;
    d_85  =  d_70 * d_71;
    d_86  =  d_85 * d_72;
    d_87  =  d_84 - d_86;
    d_88  =  d_73 * d_74;
    d_89  =  d_88 * d_75;
    d_90  =  d_87 - d_89;
    d_91  =  fma(d_80, d_78, d_90);
    i_92  =  i_65 + 1;
    i_93  =  i_65 + i_23;
    i_94  =  i_93 << 3;
    i_95  =  i_94 + 24;
    ui_96  =  ui_2 + i_95;
    // A negative denominator stores the constant 1.0; otherwise the quotient below.
    i_97  =  isless(d_91, 0.0);
    if(i_97 == 1)
    {
      // BLOCK 9
      *((__global double *) ui_96)  =  1.0;
    }  // B9
    else
    {
      // BLOCK 10
      d_98  =  ui_5[i_65];
      d_99  =  ui_9[i_65];
      d_100  =  ui_4[i_65];
      d_101  =  ui_10[i_65];
      d_102  =  ui_10[i_65];
      d_103  =  ui_6[i_65];
      d_104  =  ui_9[i_65];
      d_105  =  ui_8[i_65];
      d_106  =  ui_4[i_65];
      d_107  =  ui_9[i_65];
      d_108  =  ui_7[i_65];
      d_109  =  ui_10[i_65];
      d_110  =  ui_8[i_65];
      d_111  =  ui_5[i_65];
      d_112  =  ui_10[i_65];
      d_113  =  ui_9[i_65];
      d_114  =  d_98 * d_99;
      d_115  =  d_100 * d_101;
      d_116  =  d_114 - d_115;
      d_117  =  d_102 * d_103;
      d_118  =  d_104 * d_105;
      d_119  =  d_117 - d_118;
      d_120  =  d_119 * 165.0;
      d_121  =  fma(d_116, d_106, d_120);
      d_122  =  d_107 * d_108;
      d_123  =  d_109 * d_110;
      d_124  =  d_122 - d_123;
      d_125  =  d_124 * 165.0;
      d_126  =  d_111 * d_116;
      d_127  =  d_125 - d_126;
      d_128  =  d_127 * d_113;
      d_129  =  fma(d_121, d_112, d_128);
      d_130  =  d_129 / d_91;
      *((__global double *) ui_96)  =  d_130;
    }  // B10
    // BLOCK 11 MERGES [9 10 ]
    i_131  =  i_92;
    i_65  =  i_131;
  }  // B11
  // BLOCK 12
  return;
}  //  kernel

{
    "PeriodSearch": {
        "COPY_IN_TIME": "55480",
        "TOTAL_DISPATCH_DATA_TRANSFERS_TIME": "12397",
        "TOTAL_TASK_SCHEDULE_TIME": "432744662",
        "COPY_OUT_TIME": "510240",
        "TOTAL_KERNEL_TIME": "117545840",
        "TOTAL_DRIVER_COMPILE_TIME": "1394951",
        "TOTAL_GRAAL_COMPILE_TIME": "73815732",
        "TOTAL_DISPATCH_KERNEL_TIME": "17385",
        "TOTAL_BYTE_CODE_GENERATION": "7382591",
        "TOTAL_COPY_IN_SIZE_BYTES": "5376",
        "TOTAL_COPY_OUT_SIZE_BYTES": "3585676",
        "PeriodSearch.t0": {
            "METHOD": "MethodLeastSquareGPU.computeKernel",
            "DEVICE_ID": "0:2",
            "DEVICE": "AMD Radeon Pro 5500M Compute Engine",
            "TOTAL_COPY_IN_SIZE_BYTES": "96",
            "TASK_KERNEL_TIME": "117545840",
            "TASK_COMPILE_GRAAL_TIME": "73815732",
            "TASK_COMPILE_DRIVER_TIME": "1394951"
        }
    }
}

#pragma OPENCL EXTENSION cl_khr_fp64 : enable  
// Generated OpenCL for computeKernel as dumped for the second invocation in this report
// (quoted verbatim; this is the source the OpenCL driver fails to build).
//
// Differences from the first dump that are visible in the body:
//   - each work-item handles 7 consecutive output elements starting at gid * 7
//     (i_24 = (gid << 3) - gid);
//   - the per-element loops are gone: indices 0..6 of the private arrays ui_4..ui_10
//     are updated and consumed by straight-line code, one copy per index;
//   - the text ends with `}  // B24` followed by `}  //  kernel`; the second brace has
//     no matching opening brace, which is what the build log in this report rejects
//     ("extraneous closing brace").
// As in the first dump, frame slots 8..11 are read as 32-bit values and used as global
// buffer base addresses (ui_0/ui_1 read as doubles, ui_2 written as doubles, ui_3
// written as ints). Presumably the +24 byte offset skips an array header -- TODO confirm.
// _constant_region, _local_region and _atomics are not referenced in the body.
__kernel void computeKernel(__global uchar *_heap_base, uint _frame_base, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics)
{
  uint ui_15, ui_17, ui_11, ui_13, ui_200, ui_264, ui_328, ui_392, ui_456, ui_520, ui_584, ui_3, ui_2, ui_30, ui_1, ui_0, ui_28, ui_22; 
  int i_199, i_263, i_327, i_391, i_455, i_519, i_583, i_201, i_265, i_329, i_393, i_457, i_521, i_585, i_198, i_262, i_326, i_390, i_454, i_518, i_582, i_197, i_261, i_325, i_389, i_453, i_517, i_581, i_16, i_18, i_12, i_14, i_24, i_23, i_26, i_25, i_20, i_19, i_21, i_27, i_170; 
  double d_617, d_618, d_561, d_562, d_563, d_564, d_565, d_566, d_567, d_568, d_553, d_554, d_555, d_556, d_557, d_558, d_559, d_560, d_577, d_578, d_579, d_580, d_569, d_570, d_571, d_572, d_573, d_574, d_575, d_576, d_593, d_594, d_595, d_596, d_597, d_598, d_599, d_600, d_586, d_587, d_588, d_58
9, d_590, d_591, d_592, d_609, d_610, d_611, d_612, d_613, d_614, d_615, d_616, d_601, d_602, d_603, d_604, d_605, d_606, d_607, d_608, d_497, d_498, d_499, d_500, d_501, d_502, d_503, d_504, d_489, d_490, d_491, d_492, d_493, d_494, d_495, d_496, d_513, d_514, d_515, d_516, d_505, d_506, d_507, d_5
08, d_509, d_510, d_511, d_512, d_529, d_530, d_531, d_532, d_533, d_534, d_535, d_536, d_522, d_523, d_524, d_525, d_526, d_527, d_528, d_545, d_546, d_547, d_548, d_549, d_550, d_551, d_552, d_537, d_538, d_539, d_540, d_541, d_542, d_543, d_544, d_177, d_178, d_179, d_180, d_181, d_182, d_183, d_
184, d_169, d_171, d_172, d_173, d_174, d_175, d_176, d_193, d_194, d_195, d_196, d_185, d_186, d_187, d_188, d_189, d_190, d_191, d_192, d_209, d_210, d_211, d_212, d_213, d_214, d_215, d_216, d_202, d_203, d_204, d_205, d_206, d_207, d_208, d_225, d_226, d_227, d_228, d_229, d_230, d_231, d_232, d
_217, d_218, d_219, d_220, d_221, d_222, d_223, d_224, d_113, d_114, d_115, d_116, d_117, d_118, d_119, d_120, d_105, d_106, d_107, d_108, d_109, d_110, d_111, d_112, d_129, d_130, d_131, d_132, d_133, d_134, d_135, d_136, d_121, d_122, d_123, d_124, d_125, d_126, d_127, d_128, d_145, d_146, d_147, 
d_148, d_149, d_150, d_151, d_152, d_137, d_138, d_139, d_140, d_141, d_142, d_143, d_144, d_161, d_162, d_163, d_164, d_165, d_166, d_167, d_168, d_153, d_154, d_155, d_156, d_157, d_158, d_159, d_160, d_49, d_50, d_51, d_52, d_53, d_54, d_55, d_56, d_41, d_42, d_43, d_44, d_45, d_46, d_47, d_48, d
_65, d_66, d_67, d_68, d_69, d_70, d_71, d_72, d_57, d_58, d_59, d_60, d_61, d_62, d_63, d_64, d_81, d_82, d_83, d_84, d_85, d_86, d_87, d_88, d_73, d_74, d_75, d_76, d_77, d_78, d_79, d_80, d_97, d_98, d_99, d_100, d_101, d_102, d_103, d_104, d_89, d_90, d_91, d_92, d_93, d_94, d_95, d_96, d_33, d_
34, d_35, d_36, d_37, d_38, d_39, d_40, d_29, d_31, d_32, d_433, d_434, d_435, d_436, d_437, d_438, d_439, d_440, d_425, d_426, d_427, d_428, d_429, d_430, d_431, d_432, d_449, d_450, d_451, d_452, d_441, d_442, d_443, d_444, d_445, d_446, d_447, d_448, d_465, d_466, d_467, d_468, d_469, d_470, d_47
1, d_472, d_458, d_459, d_460, d_461, d_462, d_463, d_464, d_481, d_482, d_483, d_484, d_485, d_486, d_487, d_488, d_473, d_474, d_475, d_476, d_477, d_478, d_479, d_480, d_369, d_370, d_371, d_372, d_373, d_374, d_375, d_376, d_361, d_362, d_363, d_364, d_365, d_366, d_367, d_368, d_385, d_386, d_3
87, d_388, d_377, d_378, d_379, d_380, d_381, d_382, d_383, d_384, d_401, d_402, d_403, d_404, d_405, d_406, d_407, d_408, d_394, d_395, d_396, d_397, d_398, d_399, d_400, d_417, d_418, d_419, d_420, d_421, d_422, d_423, d_424, d_409, d_410, d_411, d_412, d_413, d_414, d_415, d_416, d_305, d_306, d_
307, d_308, d_309, d_310, d_311, d_312, d_297, d_298, d_299, d_300, d_301, d_302, d_303, d_304, d_321, d_322, d_323, d_324, d_313, d_314, d_315, d_316, d_317, d_318, d_319, d_320, d_337, d_338, d_339, d_340, d_341, d_342, d_343, d_344, d_330, d_331, d_332, d_333, d_334, d_335, d_336, d_353, d_354, d
_355, d_356, d_357, d_358, d_359, d_360, d_345, d_346, d_347, d_348, d_349, d_350, d_351, d_352, d_241, d_242, d_243, d_244, d_245, d_246, d_247, d_248, d_233, d_234, d_235, d_236, d_237, d_238, d_239, d_240, d_257, d_258, d_259, d_260, d_249, d_250, d_251, d_252, d_253, d_254, d_255, d_256, d_273, 
d_274, d_275, d_276, d_277, d_278, d_279, d_280, d_266, d_267, d_268, d_269, d_270, d_271, d_272, d_289, d_290, d_291, d_292, d_293, d_294, d_295, d_296, d_281, d_282, d_283, d_284, d_285, d_286, d_287, d_288; 

  __global ulong *_frame = (__global ulong *) &_heap_base[_frame_base];


  // BLOCK 0
  // The 64-bit frame entries are truncated to uint and later cast to __global pointers.
  ui_0  =  (uint) _frame[8];
  ui_1  =  (uint) _frame[9];
  ui_2  =  (uint) _frame[10];
  ui_3  =  (uint) _frame[11];
  // NOTE(review): these seven private arrays are read in BLOCK 2 (e.g. ui_4[0])
  // without any store that initialises them first being visible in this kernel.
  // Only indices 0..6 of the 512 elements are touched below.
  __private double ui_4[512];
  __private double ui_5[512];
  __private double ui_6[512];
  __private double ui_7[512];
  __private double ui_8[512];
  __private double ui_9[512];
  __private double ui_10[512];
  // Bookkeeping stores into the int buffer at ui_3: local size, global size, local id
  // and global id at byte offsets 24, 28, 32, 36, then the global id again at element
  // (4 + gid). This mirrors the localGroups[...] assignments in the Java source.
  ui_11  =  ui_3 + 24;
  i_12  =  get_local_size(0);
  *((__global int *) ui_11)  =  i_12;
  ui_13  =  ui_3 + 28;
  i_14  =  get_global_size(0);
  *((__global int *) ui_13)  =  i_14;
  ui_15  =  ui_3 + 32;
  i_16  =  get_local_id(0);
  *((__global int *) ui_15)  =  i_16;
  ui_17  =  ui_3 + 36;
  i_18  =  get_global_id(0);
  *((__global int *) ui_17)  =  i_18;
  i_19  =  i_18 + 4;
  i_20  =  i_19 << 2;
  i_21  =  i_20 + 24;
  ui_22  =  ui_3 + i_21;
  *((__global int *) ui_22)  =  i_18;
  // i_24 = 8*gid - gid = 7*gid: index of the first element handled by this work-item.
  i_23  =  i_18 << 3;
  i_24  =  i_23 - i_18;
  // BLOCK 1 MERGES [0 2 ]
  i_25  =  0;
  // Loop of 165 iterations, one input pair (d_29 from ui_0, d_31 from ui_1) each.
  // Every iteration updates indices 0..6 of ui_4..ui_10 in straight-line code.
  for(;i_25 < 165;)
  {
    // BLOCK 2
    i_26  =  i_25 << 3;
    i_27  =  i_26 + 24;
    ui_28  =  ui_0 + i_27;
    d_29  =  *((__global double *) ui_28);
    ui_30  =  ui_1 + i_27;
    d_31  =  *((__global double *) ui_30);
    d_32  =  ui_4[0];
    // Starting angle: (6.461485 + i_24 * 5.0E-6) * 2 * pi * d_29; d_38 is its sine and
    // d_41 (below) its cosine. Index 0 accumulates these values.
    d_33  =  (double) i_24;
    d_34  =  fma(d_33, 5.0E-6, 6.461485);
    d_35  =  d_34 * 2.0;
    d_36  =  d_35 * 3.141592653589793;
    d_37  =  d_36 * d_29;
    d_38  =  sin(d_37);
    d_39  =  d_38 + d_32;
    ui_4[0]  =  d_39;
    d_32  =  d_39;
    d_40  =  ui_5[0];
    d_41  =  cos(d_37);
    d_42  =  d_41 + d_40;
    ui_5[0]  =  d_42;
    d_40  =  d_42;
    d_43  =  ui_6[0];
    d_44  =  fma(d_38, d_38, d_43);
    ui_6[0]  =  d_44;
    d_43  =  d_44;
    d_45  =  ui_7[0];
    d_46  =  fma(d_41, d_41, d_45);
    ui_7[0]  =  d_46;
    d_45  =  d_46;
    d_47  =  ui_8[0];
    d_48  =  fma(d_38, d_41, d_47);
    ui_8[0]  =  d_48;
    d_47  =  d_48;
    d_49  =  ui_9[0];
    d_50  =  fma(d_38, d_31, d_49);
    ui_9[0]  =  d_50;
    d_49  =  d_50;
    d_51  =  ui_10[0];
    d_52  =  fma(d_41, d_31, d_51);
    ui_10[0]  =  d_52;
    d_51  =  d_52;
    d_53  =  ui_4[1];
    // d_55 / d_56 are the cosine / sine of the per-element angle step d_54. Each later
    // index derives its sine/cosine from the previous index's pair via the
    // angle-addition formulas (e.g. d_58 = sin*cos(d) + sin(d)*cos for index 1).
    d_54  =  d_29 * 3.1415926535897935E-5;
    d_55  =  cos(d_54);
    d_56  =  sin(d_54);
    d_57  =  d_56 * d_41;
    d_58  =  fma(d_38, d_55, d_57);
    d_59  =  d_58 + d_53;
    ui_4[1]  =  d_59;
    d_53  =  d_59;
    d_60  =  ui_5[1];
    d_61  =  d_41 * d_55;
    d_62  =  d_38 * d_56;
    d_63  =  d_61 - d_62;
    d_64  =  d_63 + d_60;
    ui_5[1]  =  d_64;
    d_60  =  d_64;
    d_65  =  ui_6[1];
    d_66  =  fma(d_58, d_58, d_65);
    ui_6[1]  =  d_66;
    d_65  =  d_66;
    d_67  =  ui_7[1];
    d_68  =  fma(d_63, d_63, d_67);
    ui_7[1]  =  d_68;
    d_67  =  d_68;
    d_69  =  ui_8[1];
    d_70  =  fma(d_63, d_58, d_69);
    ui_8[1]  =  d_70;
    d_69  =  d_70;
    d_71  =  ui_9[1];
    d_72  =  fma(d_58, d_31, d_71);
    ui_9[1]  =  d_72;
    d_71  =  d_72;
    d_73  =  ui_10[1];
    d_74  =  fma(d_63, d_31, d_73);
    ui_10[1]  =  d_74;
    d_73  =  d_74;
    d_75  =  ui_4[2];
    d_76  =  d_56 * d_63;
    d_77  =  fma(d_55, d_58, d_76);
    d_78  =  d_77 + d_75;
    ui_4[2]  =  d_78;
    d_75  =  d_78;
    d_79  =  ui_5[2];
    d_80  =  d_55 * d_63;
    d_81  =  d_56 * d_58;
    d_82  =  d_80 - d_81;
    d_83  =  d_82 + d_79;
    ui_5[2]  =  d_83;
    d_79  =  d_83;
    d_84  =  ui_6[2];
    d_85  =  fma(d_77, d_77, d_84);
    ui_6[2]  =  d_85;
    d_84  =  d_85;
    d_86  =  ui_7[2];
    d_87  =  fma(d_82, d_82, d_86);
    ui_7[2]  =  d_87;
    d_86  =  d_87;
    d_88  =  ui_8[2];
    d_89  =  fma(d_82, d_77, d_88);
    ui_8[2]  =  d_89;
    d_88  =  d_89;
    d_90  =  ui_9[2];
    d_91  =  fma(d_77, d_31, d_90);
    ui_9[2]  =  d_91;
    d_90  =  d_91;
    d_92  =  ui_10[2];
    d_93  =  fma(d_82, d_31, d_92);
    ui_10[2]  =  d_93;
    d_92  =  d_93;
    d_94  =  ui_4[3];
    d_95  =  d_56 * d_82;
    d_96  =  fma(d_55, d_77, d_95);
    d_97  =  d_96 + d_94;
    ui_4[3]  =  d_97;
    d_94  =  d_97;
    d_98  =  ui_5[3];
    d_99  =  d_55 * d_82;
    d_100  =  d_56 * d_77;
    d_101  =  d_99 - d_100;
    d_102  =  d_101 + d_98;
    ui_5[3]  =  d_102;
    d_98  =  d_102;
    d_103  =  ui_6[3];
    d_104  =  fma(d_96, d_96, d_103);
    ui_6[3]  =  d_104;
    d_103  =  d_104;
    d_105  =  ui_7[3];
    d_106  =  fma(d_101, d_101, d_105);
    ui_7[3]  =  d_106;
    d_105  =  d_106;
    d_107  =  ui_8[3];
    d_108  =  fma(d_101, d_96, d_107);
    ui_8[3]  =  d_108;
    d_107  =  d_108;
    d_109  =  ui_9[3];
    d_110  =  fma(d_96, d_31, d_109);
    ui_9[3]  =  d_110;
    d_109  =  d_110;
    d_111  =  ui_10[3];
    d_112  =  fma(d_101, d_31, d_111);
    ui_10[3]  =  d_112;
    d_111  =  d_112;
    d_113  =  ui_4[4];
    d_114  =  d_56 * d_101;
    d_115  =  fma(d_55, d_96, d_114);
    d_116  =  d_115 + d_113;
    ui_4[4]  =  d_116;
    d_113  =  d_116;
    d_117  =  ui_5[4];
    d_118  =  d_55 * d_101;
    d_119  =  d_56 * d_96;
    d_120  =  d_118 - d_119;
    d_121  =  d_120 + d_117;
    ui_5[4]  =  d_121;
    d_117  =  d_121;
    d_122  =  ui_6[4];
    d_123  =  fma(d_115, d_115, d_122);
    ui_6[4]  =  d_123;
    d_122  =  d_123;
    d_124  =  ui_7[4];
    d_125  =  fma(d_120, d_120, d_124);
    ui_7[4]  =  d_125;
    d_124  =  d_125;
    d_126  =  ui_8[4];
    d_127  =  fma(d_120, d_115, d_126);
    ui_8[4]  =  d_127;
    d_126  =  d_127;
    d_128  =  ui_9[4];
    d_129  =  fma(d_115, d_31, d_128);
    ui_9[4]  =  d_129;
    d_128  =  d_129;
    d_130  =  ui_10[4];
    d_131  =  fma(d_120, d_31, d_130);
    ui_10[4]  =  d_131;
    d_130  =  d_131;
    d_132  =  ui_4[5];
    d_133  =  d_56 * d_120;
    d_134  =  fma(d_55, d_115, d_133);
    d_135  =  d_134 + d_132;
    ui_4[5]  =  d_135;
    d_132  =  d_135;
    d_136  =  ui_5[5];
    d_137  =  d_55 * d_120;
    d_138  =  d_56 * d_115;
    d_139  =  d_137 - d_138;
    d_140  =  d_139 + d_136;
    ui_5[5]  =  d_140;
    d_136  =  d_140;
    d_141  =  ui_6[5];
    d_142  =  fma(d_134, d_134, d_141);
    ui_6[5]  =  d_142;
    d_141  =  d_142;
    d_143  =  ui_7[5];
    d_144  =  fma(d_139, d_139, d_143);
    ui_7[5]  =  d_144;
    d_143  =  d_144;
    d_145  =  ui_8[5];
    d_146  =  fma(d_139, d_134, d_145);
    ui_8[5]  =  d_146;
    d_145  =  d_146;
    d_147  =  ui_9[5];
    d_148  =  fma(d_134, d_31, d_147);
    ui_9[5]  =  d_148;
    d_147  =  d_148;
    d_149  =  ui_10[5];
    d_150  =  fma(d_139, d_31, d_149);
    ui_10[5]  =  d_150;
    d_149  =  d_150;
    d_151  =  ui_4[6];
    d_152  =  d_56 * d_139;
    d_153  =  fma(d_55, d_134, d_152);
    d_154  =  d_153 + d_151;
    ui_4[6]  =  d_154;
    d_151  =  d_154;
    d_155  =  ui_5[6];
    d_156  =  d_55 * d_139;
    d_157  =  d_56 * d_134;
    d_158  =  d_156 - d_157;
    d_159  =  d_158 + d_155;
    ui_5[6]  =  d_159;
    d_155  =  d_159;
    d_160  =  ui_6[6];
    d_161  =  fma(d_153, d_153, d_160);
    ui_6[6]  =  d_161;
    d_160  =  d_161;
    d_162  =  ui_7[6];
    d_163  =  fma(d_158, d_158, d_162);
    ui_7[6]  =  d_163;
    d_162  =  d_163;
    d_164  =  ui_8[6];
    d_165  =  fma(d_158, d_153, d_164);
    ui_8[6]  =  d_165;
    d_164  =  d_165;
    d_166  =  ui_9[6];
    d_167  =  fma(d_153, d_31, d_166);
    ui_9[6]  =  d_167;
    d_166  =  d_167;
    d_168  =  ui_10[6];
    d_169  =  fma(d_158, d_31, d_168);
    ui_10[6]  =  d_169;
    d_168  =  d_169;
    i_170  =  i_25 + 1;
    i_25  =  i_170;
  }  // B2
  // BLOCK 3
  // Work-group barrier with a local-memory fence between accumulation and output.
  barrier(CLK_LOCAL_MEM_FENCE);
  // Element 0: d_196 is the denominator 165*(ui_7*ui_6 - ui_8^2) - ui_5^2*ui_6
  // - ui_4^2*ui_7 + 2*ui_4*ui_5*ui_8 at index 0. The same pattern repeats for
  // indices 1..6 in the sections that follow.
  d_171  =  ui_7[0];
  d_172  =  ui_6[0];
  d_173  =  ui_8[0];
  d_174  =  ui_8[0];
  d_175  =  ui_5[0];
  d_176  =  ui_5[0];
  d_177  =  ui_6[0];
  d_178  =  ui_4[0];
  d_179  =  ui_4[0];
  d_180  =  ui_7[0];
  d_181  =  ui_4[0];
  d_182  =  ui_5[0];
  d_183  =  ui_8[0];
  d_184  =  d_181 * 2.0;
  d_185  =  d_184 * d_182;
  d_186  =  d_171 * d_172;
  d_187  =  d_173 * d_174;
  d_188  =  d_186 - d_187;
  d_189  =  d_188 * 165.0;
  d_190  =  d_175 * d_176;
  d_191  =  d_190 * d_177;
  d_192  =  d_189 - d_191;
  d_193  =  d_178 * d_179;
  d_194  =  d_193 * d_180;
  d_195  =  d_192 - d_194;
  d_196  =  fma(d_185, d_183, d_195);
  // Byte offset of element 0: (gid << 6) - (gid << 3) = 56 * gid = 8 * (7 * gid).
  i_197  =  i_18 << 6;
  i_198  =  i_197 - i_23;
  i_199  =  i_198 + 24;
  ui_200  =  ui_2 + i_199;
  // A negative denominator stores the constant 1.0; otherwise the quotient below.
  i_201  =  isless(d_196, 0.0);
  if(i_201 == 1)
  {
    // BLOCK 4
    *((__global double *) ui_200)  =  1.0;
  }  // B4
  else
  {
    // BLOCK 5
    d_202  =  ui_5[0];
    d_203  =  ui_9[0];
    d_204  =  ui_4[0];
    d_205  =  ui_10[0];
    d_206  =  ui_10[0];
    d_207  =  ui_6[0];
    d_208  =  ui_9[0];
    d_209  =  ui_8[0];
    d_210  =  ui_4[0];
    d_211  =  ui_9[0];
    d_212  =  ui_7[0];
    d_213  =  ui_10[0];
    d_214  =  ui_8[0];
    d_215  =  ui_5[0];
    d_216  =  ui_10[0];
    d_217  =  ui_9[0];
    d_218  =  d_202 * d_203;
    d_219  =  d_204 * d_205;
    d_220  =  d_218 - d_219;
    d_221  =  d_206 * d_207;
    d_222  =  d_208 * d_209;
    d_223  =  d_221 - d_222;
    d_224  =  d_223 * 165.0;
    d_225  =  fma(d_220, d_210, d_224);
    d_226  =  d_211 * d_212;
    d_227  =  d_213 * d_214;
    d_228  =  d_226 - d_227;
    d_229  =  d_228 * 165.0;
    d_230  =  d_215 * d_220;
    d_231  =  d_229 - d_230;
    d_232  =  d_231 * d_217;
    d_233  =  fma(d_225, d_216, d_232);
    d_234  =  d_233 / d_196;
    *((__global double *) ui_200)  =  d_234;
  }  // B5
  // BLOCK 6 MERGES [4 5 ]
  // Element 1 (output index i_24 + 1).
  d_235  =  ui_7[1];
  d_236  =  ui_6[1];
  d_237  =  ui_8[1];
  d_238  =  ui_8[1];
  d_239  =  ui_5[1];
  d_240  =  ui_5[1];
  d_241  =  ui_6[1];
  d_242  =  ui_4[1];
  d_243  =  ui_4[1];
  d_244  =  ui_7[1];
  d_245  =  ui_4[1];
  d_246  =  ui_5[1];
  d_247  =  ui_8[1];
  d_248  =  d_245 * 2.0;
  d_249  =  d_248 * d_246;
  d_250  =  d_235 * d_236;
  d_251  =  d_237 * d_238;
  d_252  =  d_250 - d_251;
  d_253  =  d_252 * 165.0;
  d_254  =  d_239 * d_240;
  d_255  =  d_254 * d_241;
  d_256  =  d_253 - d_255;
  d_257  =  d_242 * d_243;
  d_258  =  d_257 * d_244;
  d_259  =  d_256 - d_258;
  d_260  =  fma(d_249, d_247, d_259);
  i_261  =  i_24 + 1;
  i_262  =  i_261 << 3;
  i_263  =  i_262 + 24;
  ui_264  =  ui_2 + i_263;
  i_265  =  isless(d_260, 0.0);
  if(i_265 == 1)
  {
    // BLOCK 7
    *((__global double *) ui_264)  =  1.0;
  }  // B7
  else
  {
    // BLOCK 8
    d_266  =  ui_5[1];
    d_267  =  ui_9[1];
    d_268  =  ui_4[1];
    d_269  =  ui_10[1];
    d_270  =  ui_10[1];
    d_271  =  ui_6[1];
    d_272  =  ui_9[1];
    d_273  =  ui_8[1];
    d_274  =  ui_4[1];
    d_275  =  ui_9[1];
    d_276  =  ui_7[1];
    d_277  =  ui_10[1];
    d_278  =  ui_8[1];
    d_279  =  ui_5[1];
    d_280  =  ui_10[1];
    d_281  =  ui_9[1];
    d_282  =  d_266 * d_267;
    d_283  =  d_268 * d_269;
    d_284  =  d_282 - d_283;
    d_285  =  d_270 * d_271;
    d_286  =  d_272 * d_273;
    d_287  =  d_285 - d_286;
    d_288  =  d_287 * 165.0;
    d_289  =  fma(d_284, d_274, d_288);
    d_290  =  d_275 * d_276;
    d_291  =  d_277 * d_278;
    d_292  =  d_290 - d_291;
    d_293  =  d_292 * 165.0;
    d_294  =  d_279 * d_284;
    d_295  =  d_293 - d_294;
    d_296  =  d_295 * d_281;
    d_297  =  fma(d_289, d_280, d_296);
    d_298  =  d_297 / d_260;
    *((__global double *) ui_264)  =  d_298;
  }  // B8
  // BLOCK 9 MERGES [7 8 ]
  // Element 2 (output index i_24 + 2).
  d_299  =  ui_7[2];
  d_300  =  ui_6[2];
  d_301  =  ui_8[2];
  d_302  =  ui_8[2];
  d_303  =  ui_5[2];
  d_304  =  ui_5[2];
  d_305  =  ui_6[2];
  d_306  =  ui_4[2];
  d_307  =  ui_4[2];
  d_308  =  ui_7[2];
  d_309  =  ui_4[2];
  d_310  =  ui_5[2];
  d_311  =  ui_8[2];
  d_312  =  d_309 * 2.0;
  d_313  =  d_312 * d_310;
  d_314  =  d_299 * d_300;
  d_315  =  d_301 * d_302;
  d_316  =  d_314 - d_315;
  d_317  =  d_316 * 165.0;
  d_318  =  d_303 * d_304;
  d_319  =  d_318 * d_305;
  d_320  =  d_317 - d_319;
  d_321  =  d_306 * d_307;
  d_322  =  d_321 * d_308;
  d_323  =  d_320 - d_322;
  d_324  =  fma(d_313, d_311, d_323);
  i_325  =  i_24 + 2;
  i_326  =  i_325 << 3;
  i_327  =  i_326 + 24;
  ui_328  =  ui_2 + i_327;
  i_329  =  isless(d_324, 0.0);
  if(i_329 == 1)
  {
    // BLOCK 10
    *((__global double *) ui_328)  =  1.0;
  }  // B10
  else
  {
    // BLOCK 11
    d_330  =  ui_5[2];
    d_331  =  ui_9[2];
    d_332  =  ui_4[2];
    d_333  =  ui_10[2];
    d_334  =  ui_10[2];
    d_335  =  ui_6[2];
    d_336  =  ui_9[2];
    d_337  =  ui_8[2];
    d_338  =  ui_4[2];
    d_339  =  ui_9[2];
    d_340  =  ui_7[2];
    d_341  =  ui_10[2];
    d_342  =  ui_8[2];
    d_343  =  ui_5[2];
    d_344  =  ui_10[2];
    d_345  =  ui_9[2];
    d_346  =  d_330 * d_331;
    d_347  =  d_332 * d_333;
    d_348  =  d_346 - d_347;
    d_349  =  d_334 * d_335;
    d_350  =  d_336 * d_337;
    d_351  =  d_349 - d_350;
    d_352  =  d_351 * 165.0;
    d_353  =  fma(d_348, d_338, d_352);
    d_354  =  d_339 * d_340;
    d_355  =  d_341 * d_342;
    d_356  =  d_354 - d_355;
    d_357  =  d_356 * 165.0;
    d_358  =  d_343 * d_348;
    d_359  =  d_357 - d_358;
    d_360  =  d_359 * d_345;
    d_361  =  fma(d_353, d_344, d_360);
    d_362  =  d_361 / d_324;
    *((__global double *) ui_328)  =  d_362;
  }  // B11
  // BLOCK 12 MERGES [10 11 ]
  // Element 3 (output index i_24 + 3).
  d_363  =  ui_7[3];
  d_364  =  ui_6[3];
  d_365  =  ui_8[3];
  d_366  =  ui_8[3];
  d_367  =  ui_5[3];
  d_368  =  ui_5[3];
  d_369  =  ui_6[3];
  d_370  =  ui_4[3];
  d_371  =  ui_4[3];
  d_372  =  ui_7[3];
  d_373  =  ui_4[3];
  d_374  =  ui_5[3];
  d_375  =  ui_8[3];
  d_376  =  d_373 * 2.0;
  d_377  =  d_376 * d_374;
  d_378  =  d_363 * d_364;
  d_379  =  d_365 * d_366;
  d_380  =  d_378 - d_379;
  d_381  =  d_380 * 165.0;
  d_382  =  d_367 * d_368;
  d_383  =  d_382 * d_369;
  d_384  =  d_381 - d_383;
  d_385  =  d_370 * d_371;
  d_386  =  d_385 * d_372;
  d_387  =  d_384 - d_386;
  d_388  =  fma(d_377, d_375, d_387);
  i_389  =  i_24 + 3;
  i_390  =  i_389 << 3;
  i_391  =  i_390 + 24;
  ui_392  =  ui_2 + i_391;
  i_393  =  isless(d_388, 0.0);
  if(i_393 == 1)
  {
    // BLOCK 13
    *((__global double *) ui_392)  =  1.0;
  }  // B13
  else
  {
    // BLOCK 14
    d_394  =  ui_5[3];
    d_395  =  ui_9[3];
    d_396  =  ui_4[3];
    d_397  =  ui_10[3];
    d_398  =  ui_10[3];
    d_399  =  ui_6[3];
    d_400  =  ui_9[3];
    d_401  =  ui_8[3];
    d_402  =  ui_4[3];
    d_403  =  ui_9[3];
    d_404  =  ui_7[3];
    d_405  =  ui_10[3];
    d_406  =  ui_8[3];
    d_407  =  ui_5[3];
    d_408  =  ui_10[3];
    d_409  =  ui_9[3];
    d_410  =  d_394 * d_395;
    d_411  =  d_396 * d_397;
    d_412  =  d_410 - d_411;
    d_413  =  d_398 * d_399;
    d_414  =  d_400 * d_401;
    d_415  =  d_413 - d_414;
    d_416  =  d_415 * 165.0;
    d_417  =  fma(d_412, d_402, d_416);
    d_418  =  d_403 * d_404;
    d_419  =  d_405 * d_406;
    d_420  =  d_418 - d_419;
    d_421  =  d_420 * 165.0;
    d_422  =  d_407 * d_412;
    d_423  =  d_421 - d_422;
    d_424  =  d_423 * d_409;
    d_425  =  fma(d_417, d_408, d_424);
    d_426  =  d_425 / d_388;
    *((__global double *) ui_392)  =  d_426;
  }  // B14
  // BLOCK 15 MERGES [13 14 ]
  // Element 4 (output index i_24 + 4).
  d_427  =  ui_7[4];
  d_428  =  ui_6[4];
  d_429  =  ui_8[4];
  d_430  =  ui_8[4];
  d_431  =  ui_5[4];
  d_432  =  ui_5[4];
  d_433  =  ui_6[4];
  d_434  =  ui_4[4];
  d_435  =  ui_4[4];
  d_436  =  ui_7[4];
  d_437  =  ui_4[4];
  d_438  =  ui_5[4];
  d_439  =  ui_8[4];
  d_440  =  d_437 * 2.0;
  d_441  =  d_440 * d_438;
  d_442  =  d_427 * d_428;
  d_443  =  d_429 * d_430;
  d_444  =  d_442 - d_443;
  d_445  =  d_444 * 165.0;
  d_446  =  d_431 * d_432;
  d_447  =  d_446 * d_433;
  d_448  =  d_445 - d_447;
  d_449  =  d_434 * d_435;
  d_450  =  d_449 * d_436;
  d_451  =  d_448 - d_450;
  d_452  =  fma(d_441, d_439, d_451);
  i_453  =  i_24 + 4;
  i_454  =  i_453 << 3;
  i_455  =  i_454 + 24;
  ui_456  =  ui_2 + i_455;
  i_457  =  isless(d_452, 0.0);
  if(i_457 == 1)
  {
    // BLOCK 16
    *((__global double *) ui_456)  =  1.0;
  }  // B16
  else
  {
    // BLOCK 17
    d_458  =  ui_5[4];
    d_459  =  ui_9[4];
    d_460  =  ui_4[4];
    d_461  =  ui_10[4];
    d_462  =  ui_10[4];
    d_463  =  ui_6[4];
    d_464  =  ui_9[4];
    d_465  =  ui_8[4];
    d_466  =  ui_4[4];
    d_467  =  ui_9[4];
    d_468  =  ui_7[4];
    d_469  =  ui_10[4];
    d_470  =  ui_8[4];
    d_471  =  ui_5[4];
    d_472  =  ui_10[4];
    d_473  =  ui_9[4];
    d_474  =  d_458 * d_459;
    d_475  =  d_460 * d_461;
    d_476  =  d_474 - d_475;
    d_477  =  d_462 * d_463;
    d_478  =  d_464 * d_465;
    d_479  =  d_477 - d_478;
    d_480  =  d_479 * 165.0;
    d_481  =  fma(d_476, d_466, d_480);
    d_482  =  d_467 * d_468;
    d_483  =  d_469 * d_470;
    d_484  =  d_482 - d_483;
    d_485  =  d_484 * 165.0;
    d_486  =  d_471 * d_476;
    d_487  =  d_485 - d_486;
    d_488  =  d_487 * d_473;
    d_489  =  fma(d_481, d_472, d_488);
    d_490  =  d_489 / d_452;
    *((__global double *) ui_456)  =  d_490;
  }  // B17
  // BLOCK 18 MERGES [16 17 ]
  // Element 5 (output index i_24 + 5).
  d_491  =  ui_7[5];
  d_492  =  ui_6[5];
  d_493  =  ui_8[5];
  d_494  =  ui_8[5];
  d_495  =  ui_5[5];
  d_496  =  ui_5[5];
  d_497  =  ui_6[5];
  d_498  =  ui_4[5];
  d_499  =  ui_4[5];
  d_500  =  ui_7[5];
  d_501  =  ui_4[5];
  d_502  =  ui_5[5];
  d_503  =  ui_8[5];
  d_504  =  d_501 * 2.0;
  d_505  =  d_504 * d_502;
  d_506  =  d_491 * d_492;
  d_507  =  d_493 * d_494;
  d_508  =  d_506 - d_507;
  d_509  =  d_508 * 165.0;
  d_510  =  d_495 * d_496;
  d_511  =  d_510 * d_497;
  d_512  =  d_509 - d_511;
  d_513  =  d_498 * d_499;
  d_514  =  d_513 * d_500;
  d_515  =  d_512 - d_514;
  d_516  =  fma(d_505, d_503, d_515);
  i_517  =  i_24 + 5;
  i_518  =  i_517 << 3;
  i_519  =  i_518 + 24;
  ui_520  =  ui_2 + i_519;
  i_521  =  isless(d_516, 0.0);
  if(i_521 == 1)
  {
    // BLOCK 19
    *((__global double *) ui_520)  =  1.0;
  }  // B19
  else
  {
    // BLOCK 20
    d_522  =  ui_5[5];
    d_523  =  ui_9[5];
    d_524  =  ui_4[5];
    d_525  =  ui_10[5];
    d_526  =  ui_10[5];
    d_527  =  ui_6[5];
    d_528  =  ui_9[5];
    d_529  =  ui_8[5];
    d_530  =  ui_4[5];
    d_531  =  ui_9[5];
    d_532  =  ui_7[5];
    d_533  =  ui_10[5];
    d_534  =  ui_8[5];
    d_535  =  ui_5[5];
    d_536  =  ui_10[5];
    d_537  =  ui_9[5];
    d_538  =  d_522 * d_523;
    d_539  =  d_524 * d_525;
    d_540  =  d_538 - d_539;
    d_541  =  d_526 * d_527;
    d_542  =  d_528 * d_529;
    d_543  =  d_541 - d_542;
    d_544  =  d_543 * 165.0;
    d_545  =  fma(d_540, d_530, d_544);
    d_546  =  d_531 * d_532;
    d_547  =  d_533 * d_534;
    d_548  =  d_546 - d_547;
    d_549  =  d_548 * 165.0;
    d_550  =  d_535 * d_540;
    d_551  =  d_549 - d_550;
    d_552  =  d_551 * d_537;
    d_553  =  fma(d_545, d_536, d_552);
    d_554  =  d_553 / d_516;
    *((__global double *) ui_520)  =  d_554;
  }  // B20
  // BLOCK 21 MERGES [19 20 ]
  // Element 6 (output index i_24 + 6).
  d_555  =  ui_7[6];
  d_556  =  ui_6[6];
  d_557  =  ui_8[6];
  d_558  =  ui_8[6];
  d_559  =  ui_5[6];
  d_560  =  ui_5[6];
  d_561  =  ui_6[6];
  d_562  =  ui_4[6];
  d_563  =  ui_4[6];
  d_564  =  ui_7[6];
  d_565  =  ui_4[6];
  d_566  =  ui_5[6];
  d_567  =  ui_8[6];
  d_568  =  d_565 * 2.0;
  d_569  =  d_568 * d_566;
  d_570  =  d_555 * d_556;
  d_571  =  d_557 * d_558;
  d_572  =  d_570 - d_571;
  d_573  =  d_572 * 165.0;
  d_574  =  d_559 * d_560;
  d_575  =  d_574 * d_561;
  d_576  =  d_573 - d_575;
  d_577  =  d_562 * d_563;
  d_578  =  d_577 * d_564;
  d_579  =  d_576 - d_578;
  d_580  =  fma(d_569, d_567, d_579);
  i_581  =  i_24 + 6;
  i_582  =  i_581 << 3;
  i_583  =  i_582 + 24;
  ui_584  =  ui_2 + i_583;
  i_585  =  isless(d_580, 0.0);
  if(i_585 == 1)
  {
    // BLOCK 22
    *((__global double *) ui_584)  =  1.0;
  }  // B22
  else
  {
    // BLOCK 23
    d_586  =  ui_5[6];
    d_587  =  ui_9[6];
    d_588  =  ui_4[6];
    d_589  =  ui_10[6];
    d_590  =  ui_10[6];
    d_591  =  ui_6[6];
    d_592  =  ui_9[6];
    d_593  =  ui_8[6];
    d_594  =  ui_4[6];
    d_595  =  ui_9[6];
    d_596  =  ui_7[6];
    d_597  =  ui_10[6];
    d_598  =  ui_8[6];
    d_599  =  ui_5[6];
    d_600  =  ui_10[6];
    d_601  =  ui_9[6];
    d_602  =  d_586 * d_587;
    d_603  =  d_588 * d_589;
    d_604  =  d_602 - d_603;
    d_605  =  d_590 * d_591;
    d_606  =  d_592 * d_593;
    d_607  =  d_605 - d_606;
    d_608  =  d_607 * 165.0;
    d_609  =  fma(d_604, d_594, d_608);
    d_610  =  d_595 * d_596;
    d_611  =  d_597 * d_598;
    d_612  =  d_610 - d_611;
    d_613  =  d_612 * 165.0;
    d_614  =  d_599 * d_604;
    d_615  =  d_613 - d_614;
    d_616  =  d_615 * d_601;
    d_617  =  fma(d_609, d_600, d_616);
    d_618  =  d_617 / d_580;
    *((__global double *) ui_584)  =  d_618;
  }  // B23
  // BLOCK 24 MERGES [22 23 ]
  return;
  // The brace below already closes the kernel body (it pairs with the `{` after the
  // signature); the `}  //  kernel` after it is unmatched and is the brace the build
  // log reports as extraneous. Left as emitted because this dump is the evidence.
}  // B24
}  //  kernel

The second invocation finishes with the following error:

[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> [CL_DEVICE_NOT_AVAILABLE] : OpenCL Error : Error: Build Program driver returned (-2)
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> OpenCL Warning : clBuildProgram failed: could not build program for 0x1021e00 (AMD Radeon Pro 5500M Compute Engine) (err:-2)
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> [CL_BUILD_ERROR] : OpenCL Build Error : Compiler build log:
<program source>:823:1: error: extraneous closing brace ('}')
}  //  kernel
^

How To Reproduce

Java kernelized code:

/**
 * Least-squares period-search kernel written against the TornadoVM KernelContext API.
 *
 * Each invocation handles one "tile" of {@code tileSize} consecutive scan frequencies,
 * selected by {@code context.globalIdx}. For every frequency in the tile it accumulates
 * sine/cosine sums over all observations and then writes one value per frequency into
 * {@code amplitudes}.
 *
 * NOTE(review): {@code constTileSize} and {@code MAXRANGE} are defined outside this method.
 * The per-tile accumulators are sized with {@code constTileSize} but indexed up to
 * {@code tileSize}, so this presumably requires tileSize <= constTileSize -- confirm at the caller.
 *
 * @param context              kernel context supplying the thread/group indices read below
 * @param tileSize             number of consecutive frequencies processed by one invocation
 * @param infirstScanFrequency first scan frequency (converted to angular frequency via 2*pi)
 * @param frequencyStep        spacing between consecutive scan frequencies
 * @param deltaEpsilon         threshold below which the divisor {@code d} is treated as too small
 * @param normObsTimes         observation times; its length defines the number of observations
 * @param normObsValues        observation values, indexed in step with {@code normObsTimes}
 * @param amplitudes           output array, written at {@code tileIdx * tileSize + i}
 * @param localGroups          debug output: slots 0..3 receive group sizes/indices and
 *                             slot {@code 4 + globalIdx} receives the global index
 */
public static void computeKernel(KernelContext context,  int tileSize, final double infirstScanFrequency, final double frequencyStep, final double deltaEpsilon, final double[] normObsTimes, final double[] normObsValues, final double[] amplitudes, int[] localGroups) 
	{
		
		// tile handled by this invocation
		int tileIdx = context.globalIdx;

                //testing kernel params
		// slots 0..3 are written by every invocation; slot 4+globalIdx is unique per invocation
		localGroups[0] = context.localGroupSizeX;
		localGroups[1] = context.globalGroupSizeX;
		localGroups[2] = context.localIdx;
		localGroups[3] = context.globalIdx;
		localGroups[4+context.globalIdx]=context.globalIdx;
                //\testing kernel params		

		// index of the first frequency of this tile in the amplitudes array
		int startFreqIdx = tileIdx * tileSize;
		int nObservations = normObsTimes.length;		
		
		final double omega =  (infirstScanFrequency + startFreqIdx * frequencyStep) * 2 * Math.PI;		
		// starting frequency
		final double deltaOmega = frequencyStep * 2 * Math.PI;		
		// frequency step
//		final double[] sumSx = context.allocateDoubleLocalArray(constTileSize); this does not work, reported in another issue		
		final double[] sumSx = new double[constTileSize];
		// sum of sin(obsTimes[i]*2pi*f)
		final double[] sumCx = new double[constTileSize]; 		
		// sum of cos(obsTimes[i]*2pi*f)
		final double[] sumSx2  = new double[constTileSize]; 		
		// sum of sin(obsTimes[i]*2pi*f)*sin(obsTimes[i]*2pi*f)
		final double[] sumCx2 = new double[constTileSize]; 		
		// sum of cos(obsTimes[i]*2pi*f)*cos(obsTimes[i]*2pi*f)
		final double[] sumSxCx = new double[constTileSize]; 		
		// sum of sin(obsTimes[i]*2pi*f)*cos(obsTimes[i]*2pi*f)
		final double[] sumSxVal = new double[constTileSize]; 		
		// sum of sin(obsTimes[i]*2pi*f)*obsValue[i]
		final double[] sumCxVal = new double[constTileSize]; 		
		// sum of cos(obsTimes[i]*2pi*f)*obsValue[i]
		
		
		// for each set of observation data 
		for (int i = 0; i < nObservations; i++)   {
			final double obsTime = normObsTimes[i];
			final double obsValue = normObsValues[i];

			// calculate the starting phase and its sine and cosine
			final double phase = obsTime * omega;
			double sPh = Math.sin(phase);
			double cPh = Math.cos(phase);

			// calculate the phase step and its sine and cosine
			final double dPhase = obsTime * deltaOmega;
			final double sDPh = Math.sin(dPhase);
			final double cDPh = Math.cos(dPhase);
			
			// for each frequency to test, increment the phase with the phase step
			for (int j = 0; j < tileSize; j++) {
				sumSx[j] += sPh;
				sumCx[j] += cPh;
				sumSx2[j] += sPh*sPh;
				sumCx2[j] += cPh*cPh;
				sumSxCx[j] += cPh*sPh;
				sumSxVal[j] += sPh*obsValue;
				sumCxVal[j] += cPh*obsValue;

				// advance (sPh, cPh) by dPhase using the angle-addition identities,
				// so sin/cos are not recomputed for every frequency; cT keeps the old
				// cosine because cPh is overwritten before sPh is updated
				final double cT = cPh;
				cPh = cT * cDPh - sPh * sDPh;
				sPh = sPh * cDPh + cT * sDPh;
			}
		}


		// calculate intermediate variables and results
		for (int i = 0; i < tileSize; i++)    {
			// d is the divisor of the final amplitude expression below
			final double d = nObservations * ( sumCx2[i] * sumSx2[i] - sumSxCx[i] * sumSxCx[i]) 
					- sumCx[i] * sumCx[i] * sumSx2[i] - sumSx[i] * sumSx[i] * sumCx2[i]
					+ 2 * sumSx[i] * sumCx[i] * sumSxCx[i];  
			if (d < deltaEpsilon) {
				// divisor too small: store MAXRANGE instead of dividing
				amplitudes[startFreqIdx + i] = MAXRANGE;
			} else {
				final double b = sumCx[i] * sumSxVal[i] - sumSx[i] * sumCxVal[i];
				final double c1 = nObservations * ( sumCxVal[i] * sumSx2[i] - sumSxVal[i] * sumSxCx[i] ) + sumSx[i] * b;
				final double c2 = nObservations * ( sumSxVal[i] * sumCx2[i] - sumCxVal[i] * sumSxCx[i] ) - sumCx[i] * b;
				amplitudes[startFreqIdx + i] = ( c1 * sumCxVal[i] + c2 * sumSxVal[i] ) / d;
			}
		}
	}

Invoked as

		final double[] amplitudes = new double[this.nScanFrequencies];	
		
		// init context
		int deviceNum = 2;
        WorkerGrid workerGrid = new WorkerGrid1D(nScanFrequencies);
        GridScheduler gridScheduler = new GridScheduler("PeriodSearch.t0", workerGrid);
        KernelContext context = new KernelContext();
        // Set the global work size as we slice by frequencies
        workerGrid.setGlobalWork(4096, 1, 1); // single dimension
        int tileSize =  128;
        tileSize  = amplitudes.length<tileSize?amplitudes.length:tileSize;
        workerGrid.setLocalWork(tileSize,1,1);

	//some debug info.	
	final int[] localGroup  = new int[amplitudes.length+4];
        IntStream.range(0, localGroup.length).parallel().forEach(idx -> {
            localGroup[idx] = 0;
        });
	//\some debug info.	  

    	TaskSchedule task1 = new TaskSchedule("PeriodSearch") //
    			.streamIn(amplitudes) //does not change much here, input is relatively small
                .task("t0", MethodLeastSquareGPU::computeKernel, context, tileSize , firstScanFrequency, frequencyStep, deltaEpsilon, normObsTimes, normObsValues, amplitudes, localGroup) 
                .streamOut(amplitudes,localGroup);
    	
    	task1.execute(gridScheduler);

Provide a test-case and instructions of how to reproduce the issue.

Expected behavior

Proper code is generated on the call with the smaller input, with no OpenCL errors.

Computing system setup (please complete the following information):

  • OS: macOS/Linux
  • OpenCL Version 1.2,2,3
  • TornadoVM 0.13-dev

Additional context

Issue Analytics

  • State:closed
  • Created 2 years ago
  • Comments:11 (11 by maintainers)

github_iconTop GitHub Comments

1 reaction
yazun commented, Dec 7, 2021

For the SPIR-V backend, the following exception appears:

There was 1 failure:
1) testGPUGaiaSource4660664932119220224(gaia.cu7.algo.character.periodsearch.methods.test.MethodLeastSquareComputeComparison)
java.lang.NullPointerException: Cannot invoke "jdk.vm.ci.meta.Constant.toValueString()" because "value" is null
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.SPIRVBackend.buildLiteralContextNumber(SPIRVBackend.java:461)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.SPIRVBackend.emitPrologueForMainKernelEntry(SPIRVBackend.java:884)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.SPIRVBackend.emitPrologue(SPIRVBackend.java:982)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.SPIRVBackend.emitCode(SPIRVBackend.java:370)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.graal.compiler.SPIRVCompiler.emitCode(SPIRVCompiler.java:307)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.graal.compiler.SPIRVCompiler.emitBackEnd(SPIRVCompiler.java:231)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.graal.compiler.SPIRVCompiler.compile(SPIRVCompiler.java:174)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.graal.compiler.SPIRVCompiler$SPIRVCompilationRequest.execute(SPIRVCompiler.java:156)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.graal.compiler.SPIRVCompiler.compileSketchForDevice(SPIRVCompiler.java:412)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.runtime.SPIRVTornadoDevice.compileTask(SPIRVTornadoDevice.java:195)
        at tornado.drivers.spirv@0.13-dev/uk.ac.manchester.tornado.drivers.spirv.runtime.SPIRVTornadoDevice.installCode(SPIRVTornadoDevice.java:128)
        at tornado.runtime@0.13-dev/uk.ac.manchester.tornado.runtime.TornadoVM.compileTaskFromBytecodeToBinary(TornadoVM.java:467)
        at tornado.runtime@0.13-dev/uk.ac.manchester.tornado.runtime.TornadoVM.execute(TornadoVM.java:743)
        at tornado.runtime@0.13-dev/uk.ac.manchester.tornado.runtime.TornadoVM.execute(TornadoVM.java:227)
        at tornado.runtime@0.13-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskSchedule.scheduleInner(TornadoTaskSchedule.java:630)
        at tornado.runtime@0.13-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskSchedule.schedule(TornadoTaskSchedule.java:912)
        at tornado.runtime@0.13-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskSchedule.schedule(TornadoTaskSchedule.java:920)
        at tornado.api@0.13-dev/uk.ac.manchester.tornado.api.TaskSchedule.execute(TaskSchedule.java:305)
        at gaia.cu7.algo.character.periodsearch.methods.MethodLeastSquareGPU.computeMainKernel(MethodLeastSquareGPU.java:415)
...

Devices:

 tornado --devices
WARNING: Using incubator modules: jdk.incubator.vector, jdk.incubator.foreign

Number of Tornado drivers: 2
Driver: SPIRV
  Total number of SPIRV devices  : 1
  Tornado device=0:0
        SPIRV -- SPIRV LevelZero - Intel(R) Iris(TM) Pro Graphics P580 [0x193a]
                Global Memory Size: 50.2 GB
                Local Memory Size: 64.0 KB
                Workgroup Dimensions: 3
                Total Number of Block Threads: 256
                Max WorkGroup Configuration: [256, 256, 256]
                Device OpenCL C version:  (LEVEL ZERO) 1.1

Driver: OpenCL
  Total number of OpenCL devices  : 2
  Tornado device=1:0
        OpenCL --  [Intel(R) CPU Runtime for OpenCL(TM) Applications] -- Intel(R) Xeon(R) CPU E3-1585L v5 @ 3.00GHz
                Global Memory Size: 62.8 GB
                Local Memory Size: 32.0 KB
                Workgroup Dimensions: 3
                Total Number of Block Threads: 8192
                Max WorkGroup Configuration: [8192, 8192, 8192]
                Device OpenCL C version: OpenCL C 2.0

  Tornado device=1:1
        OpenCL --  [Intel(R) OpenCL HD Graphics] -- Intel(R) Iris(TM) Pro Graphics P580 [0x193a]
                Global Memory Size: 50.2 GB
                Local Memory Size: 64.0 KB
                Workgroup Dimensions: 3
                Total Number of Block Threads: 256
                Max WorkGroup Configuration: [256, 256, 256]
                Device OpenCL C version: OpenCL C 3.0

1 reaction
yazun commented, Dec 6, 2021

It seems to me to be an error during code generation for OpenCL. Have you tried another backend? For example, PTX or SPIR-V?

TornadoVM bails out and executes the sequential Java code, but only when using the Loop Parallel API (not the KernelContext API). We have plans to include support for bailout when using the KernelContext API, but it is not implemented at the moment.

Could the existing -Dtornado.recover.bailout=False option be used for this?

Read more comments on GitHub >

github_iconTop Results From Across the Web

A pair of Rust kernel modules - LWN.net
I do not want a Rust compiler on my computer, I do not want to need a Rust compiler on my computer to...
Read more >
Transpiling A Kernel Module to Rust: The Good, the Bad and ...
The basic steps we'll cover are: translating the C files into Rust,; compiling generated Rust, and; linking everything into a loadable kernel module....
Read more >
Is there a way to fail a Linux kernel build if prompted for config ...
It will list and describe the needed options asking for input from standard input. If nothing is missing, it will just output something...
Read more >
A Freestanding Rust Binary | Writing an OS in Rust
The first step in creating our own operating system kernel is to create a Rust executable that does not link the standard library....
Read more >
gcc(1) - Linux manual page - man7.org
Some options control the preprocessor and others the compiler itself. Yet other options control the assembler and linker; most of these are not...
Read more >

github_iconTop Related Medium Post

No results found

github_iconTop Related StackOverflow Question

No results found

github_iconTroubleshoot Live Code

Lightrun enables developers to add logs, metrics and snapshots to live code - no restarts or redeploys required.
Start Free

github_iconTop Related Reddit Thread

No results found

github_iconTop Related Hackernoon Post

No results found

github_iconTop Related Tweet

No results found

github_iconTop Related Dev.to Post

No results found

github_iconTop Related Hashnode Post

No results found