HenriquesLab · brunomsaraiva · Apr 9, 2025 · Mar 12, 2025 · Mar 12, 2025 · Mar 12, 2025
diff --git a/src/include/_c_sr_radial_gradient_convergence.c b/src/include/_c_sr_radial_gradient_convergence.c
@@ -14,36 +14,47 @@ double _c_calculate_dk(float Gx, float Gy, float dx, float dy, float distance) {
   return Dk;
 }
 
-float _c_calculate_rgc(int xM, int yM, float* imIntGx, float* imIntGy, int colsM, int rowsM, int magnification, float Gx_Gy_MAGNIFICATION, float fwhm, float tSO, float tSS, float sensitivity) {
+void _rotate_vector(float* Gx, float* Gy, float angle) {
+    float cos_angle = cos(angle);
+    float sin_angle = sin(angle);
+
+    float original_Gx = *Gx;
+    float original_Gy = *Gy;
+
+    *Gx = original_Gx * cos_angle - original_Gy * sin_angle;
+    *Gy = original_Gx * sin_angle + original_Gy * cos_angle;
+}
+
+float _c_calculate_rgc(int xM, int yM, float* imIntGx, float* imIntGy, int colsM, int rowsM, int magnification, float Gx_Gy_MAGNIFICATION, float fwhm, float tSO, float tSS, float sensitivity, float offset, float xyoffset, float angle) {
 
     float vx, vy, Gx, Gy, dx, dy, distance, distanceWeight, GdotR, Dk;
 
-    float xc = (xM + 0.5) / magnification;
-    float yc = (yM + 0.5) / magnification;
+    float xc = (float)xM / magnification + offset; // offset in non-magnified space
+    float yc = (float)yM / magnification + offset;
 
     float RGC = 0;
     float distanceWeightSum = 0;
 
-    int _start = -(int)(Gx_Gy_MAGNIFICATION * fwhm);
-    int _end = (int)(Gx_Gy_MAGNIFICATION * fwhm + 1);
+    int _start = -(int)(2 * fwhm);
+    int _end = (int)(2 * fwhm + 1);
 
     for (int j = _start; j < _end; j++) {
-        vy = (int)(Gx_Gy_MAGNIFICATION * yc) + j;
-        vy /= Gx_Gy_MAGNIFICATION;
+        vy = yc + j;
 
-        if (0 < vy && vy <= rowsM - 1) {
+        if (0 < vy && vy <= rowsM/magnification - 1) {
             for (int i = _start; i < _end; i++) {
-                vx = (int)(Gx_Gy_MAGNIFICATION * xc) + i;
-                vx /= Gx_Gy_MAGNIFICATION;
+                vx = xc + i;
 
-                if (0 < vx && vx <= colsM - 1) {
+                if (0 < vx && vx <= colsM/magnification - 1) {
                     dx = vx - xc;
                     dy = vy - yc;
                     distance = sqrt(dx * dx + dy * dy);
 
                     if (distance != 0 && distance <= tSO) {
-                        Gx = imIntGx[(int)(vy * magnification * Gx_Gy_MAGNIFICATION * colsM * Gx_Gy_MAGNIFICATION) + (int)(vx * magnification * Gx_Gy_MAGNIFICATION)];
-                        Gy = imIntGy[(int)(vy * magnification * Gx_Gy_MAGNIFICATION * colsM * Gx_Gy_MAGNIFICATION) + (int)(vx * magnification * Gx_Gy_MAGNIFICATION)];
+                        Gx = imIntGx[(int)((vy+xyoffset) * magnification * Gx_Gy_MAGNIFICATION * colsM * Gx_Gy_MAGNIFICATION) + (int)((vx+xyoffset) * magnification * Gx_Gy_MAGNIFICATION)];
+                        Gy = imIntGy[(int)((vy+xyoffset) * magnification * Gx_Gy_MAGNIFICATION * colsM * Gx_Gy_MAGNIFICATION) + (int)((vx+xyoffset) * magnification * Gx_Gy_MAGNIFICATION)];
+
+                        _rotate_vector(&Gx, &Gy, angle);
 
                         distanceWeight = _c_calculate_dw(distance, tSS);
                         distanceWeightSum += distanceWeight;

diff --git a/src/include/_c_sr_radial_gradient_convergence.h b/src/include/_c_sr_radial_gradient_convergence.h
@@ -7,7 +7,9 @@ double _c_calculate_dw(double distance, double tSS);
 
 double _c_calculate_dk(float Gx, float Gy, float dx, float dy, float distance);
 
-float _c_calculate_rgc(int xM, int yM, float* imIntGx, float* imIntGy, int colsM, int rowsM, int magnification, float Gx_Gy_MAGNIFICATION, float fwhm, float tSO, float tSS, float sensitivity);
+void _rotate_vector(float* Gx, float* Gy, float angle);
+
+float _c_calculate_rgc(int xM, int yM, float* imIntGx, float* imIntGy, int colsM, int rowsM, int magnification, float Gx_Gy_MAGNIFICATION, float fwhm, float tSO, float tSS, float sensitivity, float offset, float xyoffset, float angle);
 
 double _c_calculate_dw3D_isotropic(double distance, double tSS);
 
@@ -21,6 +23,7 @@ double _c_calculate_dk3D(float Gx, float Gy, float Gz, float dx, float dy, float
 
 float _c_calculate_rgc3D(int xM, int yM, int sliceM, float* imIntGx, float* imIntGy, float* imIntGz, int colsM, int rowsM, int slicesM, int magnification_xy, int magnification_z, float ratio_px, float Gx_Gy_MAGNIFICATION, float Gz_MAGNIFICATION, float fwhm, float fwhm_z, float tSO, float tSO_z, float tSS, float tSS_z, float sensitivity);
 
+
 float _c_calculate_rgc_3d(int cM, int rM, int sM, float* imIntGx, float* imIntGy, float* imIntGz, int colsM, int rowsM, int slicesM, int magnification_xy, int magnification_z, float Gx_Gy_MAGNIFICATION, float fwhm, float fwhm_z, float tSO, float tSS, float sensitivity);
 
 #endif
diff --git a/src/liquid_benchmarks/_le_convolution/Convolution.yml b/src/liquid_benchmarks/_le_convolution/Convolution.yml
@@ -1,5 +1,5 @@
 Cuda:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.0004861999999974387
   - 0.0004765000000048758
@@ -22,7 +22,7 @@ Cuda:
   - 0.00047459999998977764
   - 0.0004846000000213735
 Dask:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.00937319999999886
   - 0.00935059999999055
@@ -45,7 +45,7 @@ Dask:
   - 0.009444899999976997
   - 0.008983199999988756
 Numba:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.005503999999987741
   - 0.005404399999989096
@@ -68,7 +68,7 @@ Numba:
   - 0.006126800000004096
   - 0.00559069999999906
 OpenCL:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.09695740000000796
   - 0.0791657999999984
@@ -91,7 +91,7 @@ OpenCL:
   - 0.07781260000001566
   - 0.07502910000002316
 Python:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 2.4462038000000064
   - 2.4630003999999985
@@ -114,7 +114,7 @@ Python:
   - 2.3908779999999865
   - 2.423667199999983
 Threaded:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.000824199999996722
   - 0.0008181999999976597
@@ -137,7 +137,7 @@ Threaded:
   - 0.0007410999999990509
   - 0.00074050000000625
 Threaded_dynamic:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.0004450999999932037
   - 0.00044110000000330274
@@ -160,7 +160,7 @@ Threaded_dynamic:
   - 0.00042849999999816646
   - 0.00042849999999816646
 Threaded_guided:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.00045450000000357704
   - 0.0004828000000003385
@@ -183,7 +183,7 @@ Threaded_guided:
   - 0.0004787000000021635
   - 0.000453700000008439
 Threaded_static:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.0004264999999890051
   - 0.0006580000000013797
@@ -206,7 +206,7 @@ Threaded_static:
   - 0.00047909999997841624
   - 0.0004892999999981384
 Transonic:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.005411799999990308
   - 0.00537510000000907
@@ -229,7 +229,7 @@ Transonic:
   - 0.005397600000009106
   - 0.005529200000012224
 Unthreaded:
-  (['shape(100, 100)', 'shape(23, 23)'], {}):
+  (['shape(1, 100, 100)', 'shape(23, 23)'], {}):
   - 5290000
   - 0.006530499999996664
   - 0.0064456000000063796

diff --git a/src/mako_templates/nanopyx.core.analysis._le_channel_registration.pyx b/src/mako_templates/nanopyx.core.analysis._le_channel_registration.pyx
@@ -85,10 +85,9 @@ def _gaussian_filter(inpimg, _runtype, sigma):
     knl1 = np.exp(-0.5 / (sigma*sigma) * x1 ** 2)
     knl1 = knl1 / knl1.sum()
     knl1 = knl1.astype(np.float32)
-
     img1 = conv.run(inpimg,knl1.reshape((len(x1), 1)),run_type=_runtype)
+    img1 = np.expand_dims(img1, axis=0)
     img2 = conv.run(img1,knl1.reshape((1, len(x1))),run_type=_runtype)
-
     return img2
 
 class ChannelRegistrationEstimator(LiquidEngine):

diff --git a/src/mako_templates/nanopyx.core.transform._le_convolution.pyx b/src/mako_templates/nanopyx.core.transform._le_convolution.pyx
@@ -13,7 +13,7 @@ from cython.parallel import parallel, prange
 from libc.math cimport cos, sin
 
 from .__interpolation_tools__ import check_image, value2array
-from .convolution import check_array, convolution2D_cuda, convolution2D_dask, convolution2D_numba, convolution2D_python, convolution2D_transonic
+from .convolution import convolution2D_cuda, convolution2D_dask, convolution2D_numba, convolution2D_python, convolution2D_transonic
 from ...__liquid_engine__ import LiquidEngine
 from ...__opencl__ import cl, cl_array, _fastest_device
 
@@ -29,23 +29,25 @@ class Convolution(LiquidEngine):
             clear_benchmarks=clear_benchmarks, testing=testing, verbose=verbose)
 
     def run(self, image, kernel, run_type=None):
-        image = check_array(image)
+        image = check_image(image)
         return self._run(image, kernel, run_type=run_type)
 
     def benchmark(self, image, kernel):
+        image = check_image(image)
         return super().benchmark(image, kernel)
 
     % for sch in schedulers:
-    def _run_${sch}(self, float[:,:] image, float[:,:] kernel):
+    def _run_${sch}(self, float[:,:,:] image, float[:,:] kernel):
         """
         @cpu
         % if sch!='unthreaded':
         @threaded
         % endif
         @cython
         """
-        cdef int nRows = image.shape[0]
-        cdef int nCols = image.shape[1]
+        cdef int nFrames = image.shape[0]
+        cdef int nRows = image.shape[1]
+        cdef int nCols = image.shape[2]
 
         cdef int nRows_kernel = kernel.shape[0]
         cdef int nCols_kernel = kernel.shape[1]
@@ -61,29 +63,30 @@ class Convolution(LiquidEngine):
         cdef float acc = 0
 
 
-        conv_out = np.zeros((nRows, nCols), dtype=np.float32)
-        cdef float[:,:] _conv_out = conv_out
+        conv_out = np.zeros((nFrames, nRows, nCols), dtype=np.float32)
+        cdef float[:,:,:] _conv_out = conv_out
 
         with nogil:
-            % if sch=='unthreaded':
-            for r in range(nRows):
-                for c in range(nCols):
-            % elif sch=='threaded':
-            for r in prange(nRows):
-                for c in prange(nCols):
-            % else:
-            for r in prange(nRows,schedule="${sch.split('_')[1]}"):
-                for c in prange(nCols,schedule="${sch.split('_')[1]}"):
-            % endif
-                    acc = 0
-                    for kr in range(nRows_kernel):
-                        for kc in range(nCols_kernel):
-                            local_row = min(max(r+(kr-center_r),0),nRows-1)
-                            local_col = min(max(c+(kc-center_c),0),nCols-1)
-                            acc = acc + kernel[kr,kc] * image[local_row, local_col]
-                    _conv_out[r,c] = acc
-
-        return conv_out
+            for f in range(nFrames):
+                % if sch=='unthreaded':
+                for r in range(nRows):
+                    for c in range(nCols):
+                % elif sch=='threaded':
+                for r in prange(nRows):
+                    for c in prange(nCols):
+                % else:
+                for r in prange(nRows,schedule="${sch.split('_')[1]}"):
+                    for c in prange(nCols,schedule="${sch.split('_')[1]}"):
+                % endif
+                        acc = 0
+                        for kr in range(nRows_kernel):
+                            for kc in range(nCols_kernel):
+                                local_row = min(max(r+(kr-center_r),0),nRows-1)
+                                local_col = min(max(c+(kc-center_c),0),nCols-1)
+                                acc = acc + kernel[kr,kc] * image[f,local_row, local_col]
+                        _conv_out[f,r,c] = acc
+
+        return np.squeeze(conv_out)
 
     % endfor
 
@@ -99,60 +102,67 @@ class Convolution(LiquidEngine):
         dc = device['device']
         cl_queue = cl.CommandQueue(cl_ctx)
 
-        image_out = np.zeros((image.shape[0], image.shape[1]), dtype=np.float32)
+        image_out = np.zeros((image.shape[0], image.shape[1], image.shape[2]), dtype=np.float32)
         mf = cl.mem_flags
 
-        input_image = cl.image_from_array(cl_ctx, image, mode='r')
-        input_kernel = cl.image_from_array(cl_ctx, kernel, mode='r')
-        output_opencl = cl.image_from_array(cl_ctx, image_out, mode='w')
+        input_image = cl.Buffer(cl_ctx, mf.READ_ONLY, image.nbytes)
+        cl.enqueue_copy(cl_queue, input_image, image).wait()
+
+        input_kernel = cl.Buffer(cl_ctx, mf.READ_ONLY, kernel.nbytes)
+        cl.enqueue_copy(cl_queue, input_kernel, kernel).wait()
+
+        output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out.nbytes)
+
+        kernelsize = kernel.shape[0]
         cl_queue.finish()
 
         code = self._get_cl_code("_le_convolution.cl", device['DP'])
         prg = cl.Program(cl_ctx, code).build()
-        knl = prg.conv2d
+        knl = prg.conv2d_2
 
         knl(cl_queue,
-            (1,image.shape[0], image.shape[1]), 
-            self.get_work_group(device['device'],(1,image.shape[0], image.shape[1])),
+            (image.shape[0],image.shape[1],image.shape[2]), 
+            None,#self.get_work_group(device['device'],(image.shape[0], image.shape[1], image.shape[2])),
             input_image, 
             output_opencl, 
-            input_kernel).wait() 
+            input_kernel,
+            np.int32(kernelsize)).wait() 
 
-        cl.enqueue_copy(cl_queue, image_out, output_opencl,origin=(0,0), region=(image.shape[0], image.shape[1])).wait() 
+        cl.enqueue_copy(cl_queue, image_out, output_opencl).wait() 
 
         cl_queue.finish()
-        return image_out
+        return np.squeeze(image_out)
 
     def _run_python(self, image, kernel):
         """
         @cpu
         """
-        return convolution2D_python(image, kernel).astype(np.float32)
+        return np.squeeze(convolution2D_python(image, kernel))
 
     def _run_transonic(self, image, kernel):
         """
         @cpu
         @threaded
         """
-        return convolution2D_transonic(image, kernel).astype(np.float32)
+        return np.squeeze(convolution2D_transonic(image, kernel))
 
     def _run_dask(self, image, kernel):
         """
         @cpu
         @threaded
         """
-        return convolution2D_dask(image, kernel).astype(np.float32)
+        return np.squeeze(convolution2D_dask(image, kernel))
 
     def _run_cuda(self, image, kernel):
         """
         @gpu
         """
-        return convolution2D_cuda(image, kernel).astype(np.float32)
+        return np.squeeze(convolution2D_cuda(image, kernel))
 
     def _run_numba(self, image, kernel):
         """
         @cpu
         @threaded
         @numba
         """
-        return convolution2D_numba(image, kernel).astype(np.float32)
+        return np.squeeze(convolution2D_numba(image, kernel))