weixin_39611666
weixin_39611666
2020-11-27 18:12

Speedup for raw CA correction

I made a new speedup for raw CA correction. It's in the 20% range, measured on my old holiday laptop with 2 cores using Sabayon/64bit. I post it as a patch instead of a branch because I have to use a different proxy with proxy authentication each day, which complicates usage of git push.

I managed to vectorize another loop, but the larger part of the speedup was achieved by using only one half of memory to store the values for the rgb[0] and rgb[2] arrays.

I checked for differences before/after. There are some, because I also optimized some calculations, but imho the differences are very very small and the result should be more precise than the before patch result.

patch
diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index aea20d8c..ee25c16c 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -27,6 +27,8 @@
 #include "rawimagesource.h"
 #include "rt_math.h"
 #include "median.h"
+#define BENCHMARK
+#include "StopWatch.h"

 namespace {

@@ -113,6 +115,7 @@ using namespace rtengine;

 void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData)
 {
+    BENCHFUN
 // multithreaded and partly vectorized by Ingo Weyrich
     constexpr int ts = 128;
     constexpr int tsh = ts / 2;
@@ -185,7 +188,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
         float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};

         // assign working space
-        constexpr int buffersize = 3 * sizeof(float) * ts * ts + 6 * sizeof(float) * ts * tsh + 8 * 64 + 63;
+        constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
         char *buffer = (char *) malloc(buffersize);
         char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);

@@ -194,21 +197,21 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
         //rgb data in a tile
         float* rgb[3];
         rgb[0]         = (float (*)) data;
-        rgb[1]         = (float (*)) (data + 1 * sizeof(float) * ts * ts + 1 * 64);
-        rgb[2]         = (float (*)) (data + 2 * sizeof(float) * ts * ts + 2 * 64);
+        rgb[1]         = (float (*)) (data + sizeof(float) * ts * tsh + 1 * 64);
+        rgb[2]         = (float (*)) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);

         //high pass filter for R/B in vertical direction
-        float *rbhpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 3 * 64);
+        float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
         //high pass filter for R/B in horizontal direction
-        float *rbhpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
+        float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
         //low pass filter for R/B in horizontal direction
-        float *rblpfh  = (float (*)) (data + 4 * sizeof(float) * ts * ts + 5 * 64);
+        float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
         //low pass filter for R/B in vertical direction
-        float *rblpfv  = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
+        float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
         //low pass filter for colour differences in horizontal direction
-        float *grblpfh = (float (*)) (data + 5 * sizeof(float) * ts * ts + 7 * 64);
+        float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
         //low pass filter for colour differences in vertical direction
-        float *grblpfv = (float (*)) (data + 5 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
+        float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
         //colour differences
         float *grbdiff = rbhpfh; // there is no overlap in buffer usage => share
         //green interpolated to optical sample points for R/B
@@ -241,7 +244,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             int col = cc + left;
                             int c = FC(rr, cc);
                             int indx1 = rr * ts + cc;
-                            rgb[c][indx1] = (rawData[row][col]) / 65535.0f;
+                            rgb[c][indx1 >> ((c & 1) ^ 1)] = (rawData[row][col]) / 65535.0f;
                         }

                     // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -250,7 +253,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = ccmin; cc < ccmax; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][rr * ts + cc] = rgb[c][(border2 - rr) * ts + cc];
+                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                             }
                     }

@@ -258,7 +261,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = ccmin; cc < ccmax; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rrmax + rr)*ts + cc] = (rawData[(height - rr - 2)][left + cc]) / 65535.0f;
+                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][left + cc]) / 65535.0f;
                             }
                     }

@@ -266,7 +269,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = rrmin; rr < rrmax; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][rr * ts + cc] = rgb[c][rr * ts + border2 - cc];
+                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
                             }
                     }

@@ -274,7 +277,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = rrmin; rr < rrmax; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][rr * ts + ccmax + cc] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.0f;
+                                rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.0f;
                             }
                     }

@@ -283,7 +286,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rr)*ts + cc] = (rawData[border2 - rr][border2 - cc]) / 65535.0f;
+                                rgb[c][((rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[border2 - rr][border2 - cc]) / 65535.0f;
                             }
                     }

@@ -291,7 +294,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rrmax + rr)*ts + ccmax + cc] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.0f;
+                                rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.0f;
                             }
                     }

@@ -299,7 +302,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rr)*ts + ccmax + cc] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.0f;
+                                rgb[c][((rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.0f;
                             }
                     }

@@ -307,7 +310,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rrmax + rr)*ts + cc] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.0f;
+                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.0f;
                             }
                     }

@@ -328,22 +331,29 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
 #ifdef __SSE2__
                         for (; cc < cc1 - 9; cc+=8, indx+=8) {
                             //compute directional weights using image gradients
-                            vfloat wtuv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx - v1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx - v2])) + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx - v3])));
-                            vfloat wtdv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx + v1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx + v2])) + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx + v3])));
-                            vfloat wtlv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx - 2])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
-                            vfloat wtrv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx + 1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx + 2])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));
+                            vfloat cinv = LVFU(rgb[c][indx >> 1]);
+                            vfloat cinpv1v = LC2VFU(rgb[1][indx + v1]);
+                            vfloat cinmv1v = LC2VFU(rgb[1][indx - v1]);
+                            vfloat temp1 = epsv + vabsf(cinpv1v - cinmv1v);
+                            vfloat wtuv = onev / SQRV(temp1 + vabsf(cinv - LVFU(rgb[c][(indx - v2) >> 1])) + vabsf(cinmv1v - LC2VFU(rgb[1][indx - v3])));
+                            vfloat wtdv = onev / SQRV(temp1 + vabsf(cinv - LVFU(rgb[c][(indx + v2) >> 1])) + vabsf(cinpv1v - LC2VFU(rgb[1][indx + v3])));
+                            vfloat cinp1v = LC2VFU(rgb[1][indx + 1]);
+                            vfloat cinm1v = LC2VFU(rgb[1][indx - 1]);
+                            vfloat temp2 = epsv + vabsf(cinm1v - cinp1v);
+                            vfloat wtlv = onev / SQRV(temp2 + vabsf(cinv - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(cinm1v - LC2VFU(rgb[1][indx - 3])));
+                            vfloat wtrv = onev / SQRV(temp2 + vabsf(cinv - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(cinp1v - LC2VFU(rgb[1][indx + 3])));

                             //store in rgb array the interpolated G value at R/B grid points using directional weighted average
-                            STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
+                            STC2VFU(rgb[1][indx], (wtuv * cinmv1v + wtdv * cinpv1v + wtlv * cinm1v + wtrv * cinpv1v) / (wtuv + wtdv + wtlv + wtrv));
                         }

 #endif
                         for (; cc < cc1 - 3; cc+=2, indx+=2) {
                             //compute directional weights using image gradients
-                            float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx] - rgb[c][indx - v2]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
-                            float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx] - rgb[c][indx + v2]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
-                            float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx] - rgb[c][indx - 2]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
-                            float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx] - rgb[c][indx + 2]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));
+                            float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - v2) >> 1]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
+                            float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + v2) >> 1]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
+                            float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - 2) >> 1]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
+                            float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + 2) >> 1]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));

                             //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                             rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
@@ -365,43 +375,43 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
 #ifdef __SSE2__
                         for (; cc < cc1 - 10; cc += 8, indx += 8) {
                             vfloat rgb1v = LC2VFU(rgb[1][indx]);
-                            vfloat rgbcv = LC2VFU(rgb[c][indx]);
-                            vfloat temp1v = vabsf(vabsf((rgb1v - rgbcv) - (LC2VFU(rgb[1][indx + v4]) - LC2VFU(rgb[c][indx + v4]))) +
-                                                      vabsf(LC2VFU(rgb[1][indx - v4]) - LC2VFU(rgb[c][indx - v4]) - rgb1v + rgbcv) -
-                                                      vabsf(LC2VFU(rgb[1][indx - v4]) - LC2VFU(rgb[c][indx - v4]) - LC2VFU(rgb[1][indx + v4]) + LC2VFU(rgb[c][indx + v4])));
+                            vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
+                            vfloat temp1v = vabsf(vabsf((rgb1v - rgbcv) - (LC2VFU(rgb[1][indx + v4]) - LVFU(rgb[c][(indx + v4) >> 1]))) +
+                                                      vabsf(LC2VFU(rgb[1][indx - v4]) - LVFU(rgb[c][(indx - v4) >> 1]) - rgb1v + rgbcv) -
+                                                      vabsf(LC2VFU(rgb[1][indx - v4]) - LVFU(rgb[c][(indx - v4) >> 1]) - LC2VFU(rgb[1][indx + v4]) + LVFU(rgb[c][(indx + v4) >> 1])));
                             STVFU(rbhpfv[indx >> 1], temp1v);
-                            vfloat temp2v = vabsf(vabsf((rgb1v - rgbcv) - (LC2VFU(rgb[1][indx + 4]) - LC2VFU(rgb[c][indx + 4]))) +
-                                                      vabsf(LC2VFU(rgb[1][indx - 4]) - LC2VFU(rgb[c][indx - 4]) - rgb1v + rgbcv) -
-                                                      vabsf(LC2VFU(rgb[1][indx - 4]) - LC2VFU(rgb[c][indx - 4]) - LC2VFU(rgb[1][indx + 4]) + LC2VFU(rgb[c][indx + 4])));
+                            vfloat temp2v = vabsf(vabsf((rgb1v - rgbcv) - (LC2VFU(rgb[1][indx + 4]) - LVFU(rgb[c][(indx + 4) >> 1]))) +
+                                                      vabsf(LC2VFU(rgb[1][indx - 4]) - LVFU(rgb[c][(indx - 4) >> 1]) - rgb1v + rgbcv) -
+                                                      vabsf(LC2VFU(rgb[1][indx - 4]) - LVFU(rgb[c][(indx - 4) >> 1]) - LC2VFU(rgb[1][indx + 4]) + LVFU(rgb[c][(indx + 4) >> 1])));
                             STVFU(rbhpfh[indx >> 1], temp2v);

                             //low and high pass 1D filters of G in vertical/horizontal directions
                             rgb1v = vmul2f(rgb1v);
-                            vfloat glpfvv = zd25v * (rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]));
-                            vfloat glpfhv = zd25v * (rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]));
+                            vfloat glpfvv = rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]);
+                            vfloat glpfhv = rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]);
                             rgbcv = vmul2f(rgbcv);
-                            STVFU(rblpfv[indx >> 1], epsv + vabsf(glpfvv - zd25v * (rgbcv + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2]))));
-                            STVFU(rblpfh[indx >> 1], epsv + vabsf(glpfhv - zd25v * (rgbcv + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2]))));
-                            STVFU(grblpfv[indx >> 1], glpfvv + zd25v * (rgbcv + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2])));
-                            STVFU(grblpfh[indx >> 1], glpfhv + zd25v * (rgbcv + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2])));
+                            STVFU(rblpfv[indx >> 1], epsv + zd25v * vabsf(glpfvv - (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
+                            STVFU(rblpfh[indx >> 1], epsv + zd25v * vabsf(glpfhv - (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
+                            STVFU(grblpfv[indx >> 1], zd25v * (glpfvv + (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
+                            STVFU(grblpfh[indx >> 1], zd25v * (glpfhv + (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
                         }

 #endif
                         for (; cc < cc1 - 4; cc += 2, indx += 2) {
-                            rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx]) - (rgb[1][indx + v4] - rgb[c][indx + v4])) +
-                                                      fabsf((rgb[1][indx - v4] - rgb[c][indx - v4]) - (rgb[1][indx] - rgb[c][indx])) -
-                                                      fabsf((rgb[1][indx - v4] - rgb[c][indx - v4]) - (rgb[1][indx + v4] - rgb[c][indx + v4])));
-                            rbhpfh[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx]) - (rgb[1][indx + 4] - rgb[c][indx + 4])) +
-                                                      fabsf((rgb[1][indx - 4] - rgb[c][indx - 4]) - (rgb[1][indx] - rgb[c][indx])) -
-                                                      fabsf((rgb[1][indx - 4] - rgb[c][indx - 4]) - (rgb[1][indx + 4] - rgb[c][indx + 4])));
+                            rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])) +
+                                                      fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
+                                                      fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])));
+                            rbhpfh[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])) +
+                                                      fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
+                                                      fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])));

                             //low and high pass 1D filters of G in vertical/horizontal directions
                             float glpfv = 0.25f * (2.f * rgb[1][indx] + rgb[1][indx + v2] + rgb[1][indx - v2]);
                             float glpfh = 0.25f * (2.f * rgb[1][indx] + rgb[1][indx + 2] + rgb[1][indx - 2]);
-                            rblpfv[indx >> 1] = eps + fabsf(glpfv - 0.25f * (2.f * rgb[c][indx] + rgb[c][indx + v2] + rgb[c][indx - v2]));
-                            rblpfh[indx >> 1] = eps + fabsf(glpfh - 0.25f * (2.f * rgb[c][indx] + rgb[c][indx + 2] + rgb[c][indx - 2]));
-                            grblpfv[indx >> 1] = glpfv + 0.25f * (2.f * rgb[c][indx] + rgb[c][indx + v2] + rgb[c][indx - v2]);
-                            grblpfh[indx >> 1] = glpfh + 0.25f * (2.f * rgb[c][indx] + rgb[c][indx + 2] + rgb[c][indx - 2]);
+                            rblpfv[indx >> 1] = eps + fabsf(glpfv - 0.25f * (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]));
+                            rblpfh[indx >> 1] = eps + fabsf(glpfh - 0.25f * (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]));
+                            grblpfv[indx >> 1] = glpfv + 0.25f * (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]);
+                            grblpfh[indx >> 1] = glpfh + 0.25f * (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]);
                         }
                     }

@@ -440,9 +450,9 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

                             //vertical
                             vfloat gdiffv = zd3125v * (LC2VFU(rgb[1][indx + ts]) - LC2VFU(rgb[1][indx - ts])) + zd09375v * (LC2VFU(rgb[1][indx + ts + 1]) - LC2VFU(rgb[1][indx - ts + 1]) + LC2VFU(rgb[1][indx + ts - 1]) - LC2VFU(rgb[1][indx - ts - 1]));
-                            vfloat deltgrbv = LC2VFU(rgb[c][indx]) - LC2VFU(rgb[1][indx]);
+                            vfloat deltgrbv = LVFU(rgb[c][indx >> 1]) - LC2VFU(rgb[1][indx]);

-                            vfloat gradwtv = vabsf(zd25v * LVFU(rbhpfv[indx >> 1]) + zd125v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1])) ) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) + LVFU(rblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) + v1]));
+                            vfloat gradwtv = (zd25v * LVFU(rbhpfv[indx >> 1]) + zd125v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1])) ) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) + LVFU(rblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) + v1]));

                             coeff00v += gradwtv * deltgrbv * deltgrbv;
                             coeff01v += gradwtv * gdiffv * deltgrbv;
@@ -451,7 +461,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             //horizontal
                             gdiffv = zd3125v * (LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + zd09375v * (LC2VFU(rgb[1][indx + 1 + ts]) - LC2VFU(rgb[1][indx - 1 + ts]) + LC2VFU(rgb[1][indx + 1 - ts]) - LC2VFU(rgb[1][indx - 1 - ts]));

-                            gradwtv = vabsf(zd25v * LVFU(rbhpfh[indx >> 1]) + zd125v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1])) ) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) + LVFU(rblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) + 1]));
+                            gradwtv = (zd25v * LVFU(rbhpfh[indx >> 1]) + zd125v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1])) ) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) + LVFU(rblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) + 1]));

                             coeff10v += gradwtv * deltgrbv * deltgrbv;
                             coeff11v += gradwtv * gdiffv * deltgrbv;
@@ -477,7 +487,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

                             //vertical
                             float gdiff = 0.3125f * (rgb[1][indx + ts] - rgb[1][indx - ts]) + 0.09375f * (rgb[1][indx + ts + 1] - rgb[1][indx - ts + 1] + rgb[1][indx + ts - 1] - rgb[1][indx - ts - 1]);
-                            float deltgrb = (rgb[c][indx] - rgb[1][indx]);
+                            float deltgrb = (rgb[c][indx >> 1] - rgb[1][indx]);

                             float gradwt = fabsf(0.25f * rbhpfv[indx >> 1] + 0.125f * (rbhpfv[(indx >> 1) + 1] + rbhpfv[(indx >> 1) - 1]) ) * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) / (eps + 0.1f * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) + rblpfv[(indx >> 1) - v1] + rblpfv[(indx >> 1) + v1]);

@@ -719,26 +729,29 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                     // rgb values should be floating point number between 0 and 1
                     // after white balance multipliers are applied

-                    for (int rr = rrmin; rr < rrmax; rr++)
-                        for (int row = rr + top, cc = ccmin; cc < ccmax; cc++) {
+                    for (int rr = rrmin; rr < rrmax; rr++) {
+                        int c = FC(rr, ccmin);
+                        const int c2x = c + FC(rr, ccmin + 1);
+                        const int row = rr + top;
+                        for (int cc = ccmin; cc < ccmax; cc++, c ^= c2x) {
                             int col = cc + left;
-                            int c = FC(rr, cc);
+//                            int c = FC(rr, cc);
                             int indx = row * width + col;
                             int indx1 = rr * ts + cc;
-                            rgb[c][indx1] = (rawData[row][col]) / 65535.0f;
+                            rgb[c][indx1 >> ((c & 1) ^ 1)] = rawData[row][col] / 65535.f;

                             if ((c & 1) == 0) {
                                 rgb[1][indx1] = Gtmp[indx];
                             }
                         }
-
+                    }
                     // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                     //fill borders
                     if (rrmin > 0) {
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = ccmin; cc < ccmax; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][rr * ts + cc] = rgb[c][(border2 - rr) * ts + cc];
+                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                                 rgb[1][rr * ts + cc] = rgb[1][(border2 - rr) * ts + cc];
                             }
                     }
@@ -747,7 +760,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = ccmin; cc < ccmax; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rrmax + rr)*ts + cc] = (rawData[(height - rr - 2)][left + cc]) / 65535.0f;
+                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][left + cc]) / 65535.0f;
                                 rgb[1][(rrmax + rr)*ts + cc] = Gtmp[(height - rr - 2) * width + left + cc];
                             }
                     }
@@ -756,7 +769,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = rrmin; rr < rrmax; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][rr * ts + cc] = rgb[c][rr * ts + border2 - cc];
+                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
                                 rgb[1][rr * ts + cc] = rgb[1][rr * ts + border2 - cc];
                             }
                     }
@@ -765,7 +778,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = rrmin; rr < rrmax; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][rr * ts + ccmax + cc] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.0f;
+                                rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.0f;
                                 rgb[1][rr * ts + ccmax + cc] = Gtmp[(top + rr) * width + (width - cc - 2)];
                             }
                     }
@@ -775,7 +788,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rr)*ts + cc] = (rawData[border2 - rr][border2 - cc]) / 65535.0f;
+                                rgb[c][((rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[border2 - rr][border2 - cc]) / 65535.0f;
                                 rgb[1][(rr)*ts + cc] = Gtmp[(border2 - rr) * width + border2 - cc];
                             }
                     }
@@ -784,7 +797,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rrmax + rr)*ts + ccmax + cc] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.0f;
+                                rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.0f;
                                 rgb[1][(rrmax + rr)*ts + ccmax + cc] = Gtmp[(height - rr - 2) * width + (width - cc - 2)];
                             }
                     }
@@ -793,7 +806,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rr)*ts + ccmax + cc] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.0f;
+                                rgb[c][((rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.0f;
                                 rgb[1][(rr)*ts + ccmax + cc] = Gtmp[(border2 - rr) * width + (width - cc - 2)];
                             }
                     }
@@ -802,7 +815,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         for (int rr = 0; rr < border; rr++)
                             for (int cc = 0; cc < border; cc++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rrmax + rr)*ts + cc] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.0f;
+                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.0f;
                                 rgb[1][(rrmax + rr)*ts + cc] = Gtmp[(height - rr - 2) * width + (border2 - cc)];
                             }
                     }
@@ -819,10 +832,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

                                 if (c != 1) {
                                     //compute directional weights using image gradients
-                                    float wtu = 1.0 / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr) * ts + cc] - rgb[c][(rr - 2) * ts + cc]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
-                                    float wtd = 1.0 / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr) * ts + cc] - rgb[c][(rr + 2) * ts + cc]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
-                                    float wtl = 1.0 / SQR(eps + fabsf(rgb[1][(rr) * ts + cc + 1] - rgb[1][(rr) * ts + cc - 1]) + fabsf(rgb[c][(rr) * ts + cc] - rgb[c][(rr) * ts + cc - 2]) + fabsf(rgb[1][(rr) * ts + cc - 1] - rgb[1][(rr) * ts + cc - 3]));
-                                    float wtr = 1.0 / SQR(eps + fabsf(rgb[1][(rr) * ts + cc - 1] - rgb[1][(rr) * ts + cc + 1]) + fabsf(rgb[c][(rr) * ts + cc] - rgb[c][(rr) * ts + cc + 2]) + fabsf(rgb[1][(rr) * ts + cc + 1] - rgb[1][(rr) * ts + cc + 3]));
+                                    float wtu = 1.0 / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][((rr) * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
+                                    float wtd = 1.0 / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][((rr) * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
+                                    float wtl = 1.0 / SQR(eps + fabsf(rgb[1][(rr) * ts + cc + 1] - rgb[1][(rr) * ts + cc - 1]) + fabsf(rgb[c][((rr) * ts + cc) >> 1] - rgb[c][((rr) * ts + cc - 2) >> 1]) + fabsf(rgb[1][(rr) * ts + cc - 1] - rgb[1][(rr) * ts + cc - 3]));
+                                    float wtr = 1.0 / SQR(eps + fabsf(rgb[1][(rr) * ts + cc - 1] - rgb[1][(rr) * ts + cc + 1]) + fabsf(rgb[c][((rr) * ts + cc) >> 1] - rgb[c][((rr) * ts + cc + 2) >> 1]) + fabsf(rgb[1][(rr) * ts + cc + 1] - rgb[1][(rr) * ts + cc + 3]));

                                     //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                     rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
@@ -896,7 +909,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

                             //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
                             //but first we need to interpolate G-R/G-B to grid points...
-                            STVFU(grbdiff[((rr)*ts + cc) >> 1], Gintv - LC2VFU(rgb[c][(rr) * ts + cc]));
+                            STVFU(grbdiff[((rr)*ts + cc) >> 1], Gintv - LVFU(rgb[c][((rr) * ts + cc) >> 1]));
                             STVFU(gshift[((rr)*ts + cc) >> 1], Gintv);
                         }

@@ -910,7 +923,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

                             //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
                             //but first we need to interpolate G-R/G-B to grid points...
-                            grbdiff[((rr)*ts + cc) >> 1] = Gint - rgb[c][(rr) * ts + cc];
+                            grbdiff[((rr)*ts + cc) >> 1] = Gint - rgb[c][((rr) * ts + cc) >> 1];
                             gshift[((rr)*ts + cc) >> 1] = Gint;
                         }
                     }
@@ -920,11 +933,49 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                     shiftvfrac[0] /= 2.f;
                     shiftvfrac[2] /= 2.f;

-                    // this loop does not deserve vectorization in mainly because the most expensive part with the divisions does not happen often (less than 1/10 in my tests)
-                    for (int rr = 8; rr < rr1 - 8; rr++)
-                        for (int cc = 8 + (FC(rr, 2) & 1), c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
+#ifdef __SSE2__
+                    vfloat zd25v = F2V(0.25f);
+                    vfloat onev = F2V(1.f);
+                    vfloat zd5v = F2V(0.5f);
+                    vfloat epsv = F2V(eps);
+#endif
+                    for (int rr = 8; rr < rr1 - 8; rr++) {
+                        int cc = 8 + (FC(rr, 2) & 1);
+#ifdef __SSE2__
+                        for (int c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 14; cc += 8, indx += 8) {
+                            vfloat cinv = LVFU(rgb[c][indx >> 1]);
+                            vfloat rinv = LC2VFU(rgb[1][indx]);
+                            vfloat grbdiffold = rinv - cinv;

-                            float grbdiffold = rgb[1][indx] - rgb[c][indx];
+                            //interpolate colour difference from optical R/B locations to grid locations
+                            vfloat grbdiffinthfloor = vintpf(F2V(shifthfrac[c]), LVFU(grbdiff[(indx - GRBdir[1][c]) >> 1]), LVFU(grbdiff[indx >> 1]));
+                            vfloat grbdiffinthceil = vintpf(F2V(shifthfrac[c]), LVFU(grbdiff[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1]), LVFU(grbdiff[((rr - GRBdir[0][c]) * ts + cc) >> 1]));
+                            //grbdiffint is bilinear interpolation of G-R/G-B at grid point
+                            vfloat grbdiffint = vintpf(F2V(shiftvfrac[c]), grbdiffinthceil, grbdiffinthfloor);
+
+                            //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
+                            vfloat RBint = rinv - grbdiffint;
+                            vmask cmask = vmaskf_ge(vabsf(RBint - cinv), zd25v * (RBint + cinv));
+                            if(_mm_movemask_ps((vfloat)cmask)) {
+                                //gradient weights using difference from G at CA shift points and G at grid points
+                                vfloat p0 = onev / (epsv + vabsf(rinv - LVFU(gshift[indx >> 1])));
+                                vfloat p1 = onev / (epsv + vabsf(rinv - LVFU(gshift[(indx - GRBdir[1][c]) >> 1])));
+                                vfloat p2 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir[0][c]) * ts + cc) >> 1])));
+                                vfloat p3 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1])));
+
+                                grbdiffint = vself(cmask, (p0 * LVFU(grbdiff[indx >> 1]) + p1 * LVFU(grbdiff[(indx - GRBdir[1][c]) >> 1]) +
+                                              p2 * LVFU(grbdiff[((rr - GRBdir[0][c]) * ts + cc) >> 1]) + p3 * LVFU(grbdiff[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1])) / (p0 + p1 + p2 + p3), grbdiffint);
+
+                            }
+                            RBint = rinv - grbdiffint;
+                            RBint = vself(vmaskf_gt(vabsf(grbdiffold), vabsf(grbdiffint)), RBint, cinv);
+                            RBint = vself(vmaskf_lt(grbdiffold * grbdiffint, ZEROV), rinv - zd5v * (grbdiffold + grbdiffint), RBint);
+                            STVFU(rgb[c][indx >> 1], RBint);
+                        }
+#endif
+                        for (int c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
+
+                            float grbdiffold = rgb[1][indx] - rgb[c][indx >> 1];

                             //interpolate colour difference from optical R/B locations to grid locations
                             float grbdiffinthfloor = intp(shifthfrac[c], grbdiff[(indx - GRBdir[1][c]) >> 1], grbdiff[indx >> 1]);
@@ -935,9 +986,9 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
                             float RBint = rgb[1][indx] - grbdiffint;

-                            if (fabsf(RBint - rgb[c][indx]) < 0.25f * (RBint + rgb[c][indx])) {
+                            if (fabsf(RBint - rgb[c][indx >> 1]) < 0.25f * (RBint + rgb[c][indx >> 1])) {
                                 if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
-                                    rgb[c][indx] = RBint;
+                                    rgb[c][indx >> 1] = RBint;
                                 }
                             } else {

@@ -952,22 +1003,23 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

                                 //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
                                 if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
-                                    rgb[c][indx] = rgb[1][indx] - grbdiffint;
+                                    rgb[c][indx >> 1] = rgb[1][indx] - grbdiffint;
                                 }
                             }

                             //if colour difference interpolation overshot the correction, just desaturate
                             if (grbdiffold * grbdiffint < 0) {
-                                rgb[c][indx] = rgb[1][indx] - 0.5f * (grbdiffold + grbdiffint);
+                                rgb[c][indx >> 1] = rgb[1][indx] - 0.5f * (grbdiffold + grbdiffint);
                             }
                         }
+                    }

                     // copy CA corrected results to temporary image matrix
                     for (int rr = border; rr < rr1 - border; rr++) {
                         int c = FC(rr + top, left + border + (FC(rr + top, 2) & 1));

                         for (int row = rr + top, cc = border + (FC(rr, 2) & 1), indx = (row * width + cc + left) >> 1; cc < cc1 - border; cc += 2, indx++) {
-                            RawDataTmp[indx] = 65535.0f * rgb[c][(rr) * ts + cc];
+                            RawDataTmp[indx] = 65535.0f * rgb[c][((rr) * ts + cc) >> 1];
                         }
                     }

</float>

该提问来源于开源项目:Beep6581/RawTherapee

  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享
  • 邀请回答

5条回答

  • weixin_39611666 weixin_39611666 5月前

    Update:

    The patch from above reduced memory usage of raw ca correction by 128 x 128 x 4 x c bytes (where c is the number of cores), was 20% faster than dev in my test and had a bug, which led to differences in output.

    The following patch reduces memory usage of raw ca correction by 128 x 128 x 4 x c + width * height * 2 bytes, is also 20% faster than dev in my test and has no differences in output:

    patch
    diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
    index aea20d8c..a7b02ceb 100644
    --- a/rtengine/CA_correct_RT.cc
    +++ b/rtengine/CA_correct_RT.cc
    @@ -27,6 +27,8 @@
     #include "rawimagesource.h"
     #include "rt_math.h"
     #include "median.h"
    +#define BENCHMARK
    +#include "StopWatch.h"
    
     namespace {
    
    @@ -113,6 +115,7 @@ using namespace rtengine;
    
     void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData)
     {
    +    BENCHFUN
     // multithreaded and partly vectorized by Ingo Weyrich
         constexpr int ts = 128;
         constexpr int tsh = ts / 2;
    @@ -136,7 +139,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
         // local variables
         const int width = W, height = H;
         //temporary array to store simple interpolation of G
    -    float *Gtmp = (float (*)) calloc ((height) * (width), sizeof * Gtmp);
    +    float *Gtmp = (float (*)) malloc ((height * width + ((height * width) & 1)) / 2 * sizeof * Gtmp);
    
         // temporary array to avoid race conflicts, only every second pixel needs to be saved here
         float *RawDataTmp = (float*) malloc( (height * width + ((height * width) & 1)) * sizeof(float) / 2);
    @@ -185,7 +188,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
             float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
    
             // assign working space
    -        constexpr int buffersize = 3 * sizeof(float) * ts * ts + 6 * sizeof(float) * ts * tsh + 8 * 64 + 63;
    +        constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
             char *buffer = (char *) malloc(buffersize);
             char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
    
    @@ -194,22 +197,21 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
             //rgb data in a tile
             float* rgb[3];
             rgb[0]         = (float (*)) data;
    -        rgb[1]         = (float (*)) (data + 1 * sizeof(float) * ts * ts + 1 * 64);
    -        rgb[2]         = (float (*)) (data + 2 * sizeof(float) * ts * ts + 2 * 64);
    +        rgb[1]         = (float (*)) (data + sizeof(float) * ts * tsh + 1 * 64);
    +        rgb[2]         = (float (*)) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);
    
             //high pass filter for R/B in vertical direction
    -        float *rbhpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 3 * 64);
    +        float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
             //high pass filter for R/B in horizontal direction
    -        float *rbhpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
    +        float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
             //low pass filter for R/B in horizontal direction
    -        float *rblpfh  = (float (*)) (data + 4 * sizeof(float) * ts * ts + 5 * 64);
    +        float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
             //low pass filter for R/B in vertical direction
    -        float *rblpfv  = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
    +        float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
             //low pass filter for colour differences in horizontal direction
    -        float *grblpfh = (float (*)) (data + 5 * sizeof(float) * ts * ts + 7 * 64);
    +        float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
             //low pass filter for colour differences in vertical direction
    -        float *grblpfv = (float (*)) (data + 5 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
    -        //colour differences
    +        float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
             float *grbdiff = rbhpfh; // there is no overlap in buffer usage => share
             //green interpolated to optical sample points for R/B
             float *gshift  = rbhpfv; // there is no overlap in buffer usage => share
    @@ -241,7 +243,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                                 int col = cc + left;
                                 int c = FC(rr, cc);
                                 int indx1 = rr * ts + cc;
    -                            rgb[c][indx1] = (rawData[row][col]) / 65535.0f;
    +                            rgb[c][indx1 >> ((c & 1) ^ 1)] = (rawData[row][col]) / 65535.0f;
                             }
    
                         // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    @@ -250,7 +252,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = ccmin; cc < ccmax; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][rr * ts + cc] = rgb[c][(border2 - rr) * ts + cc];
    +                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                                 }
                         }
    
    @@ -258,7 +260,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = ccmin; cc < ccmax; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rrmax + rr)*ts + cc] = (rawData[(height - rr - 2)][left + cc]) / 65535.0f;
    +                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][left + cc]) / 65535.0f;
                                 }
                         }
    
    @@ -266,7 +268,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = rrmin; rr < rrmax; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][rr * ts + cc] = rgb[c][rr * ts + border2 - cc];
    +                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
                                 }
                         }
    
    @@ -274,7 +276,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = rrmin; rr < rrmax; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][rr * ts + ccmax + cc] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.0f;
    +                                rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.0f;
                                 }
                         }
    
    @@ -283,7 +285,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rr)*ts + cc] = (rawData[border2 - rr][border2 - cc]) / 65535.0f;
    +                                rgb[c][((rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[border2 - rr][border2 - cc]) / 65535.0f;
                                 }
                         }
    
    @@ -291,7 +293,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rrmax + rr)*ts + ccmax + cc] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.0f;
    +                                rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.0f;
                                 }
                         }
    
    @@ -299,7 +301,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rr)*ts + ccmax + cc] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.0f;
    +                                rgb[c][((rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.0f;
                                 }
                         }
    
    @@ -307,7 +309,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rrmax + rr)*ts + cc] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.0f;
    +                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.0f;
                                 }
                         }
    
    @@ -328,10 +330,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
     #ifdef __SSE2__
                             for (; cc < cc1 - 9; cc+=8, indx+=8) {
                                 //compute directional weights using image gradients
    -                            vfloat wtuv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx - v1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx - v2])) + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx - v3])));
    -                            vfloat wtdv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx + v1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx + v2])) + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx + v3])));
    -                            vfloat wtlv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx - 2])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
    -                            vfloat wtrv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx + 1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx + 2])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));
    +                            vfloat wtuv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx - v1])) + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx - v2) >> 1])) + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx - v3])));
    +                            vfloat wtdv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx + v1])) + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx + v2) >> 1])) + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx + v3])));
    +                            vfloat wtlv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
    +                            vfloat wtrv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx + 1])) + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));
    
                                 //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                 STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
    @@ -340,18 +342,26 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
     #endif
                             for (; cc < cc1 - 3; cc+=2, indx+=2) {
                                 //compute directional weights using image gradients
    -                            float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx] - rgb[c][indx - v2]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
    -                            float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx] - rgb[c][indx + v2]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
    -                            float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx] - rgb[c][indx - 2]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
    -                            float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx] - rgb[c][indx + 2]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));
    +                            float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - v2) >> 1]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
    +                            float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + v2) >> 1]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
    +                            float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - 2) >> 1]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
    +                            float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + 2) >> 1]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));
    
                                 //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                 rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
                             }
    
                             if (row > -1 && row < height) {
    -                            for(int col = max(left + 3, 0), indx = rr * ts + 3 - (left < 0 ? (left+3) : 0); col < min(cc1 + left - 3, width); col++, indx++) {
    -                                Gtmp[row * width + col] = rgb[1][indx];
    +                            int offset = (FC(row,max(left + 3, 0)) & 1);
    +                            int col = max(left + 3, 0) + offset;
    +                            int indx = rr * ts + 3 - (left < 0 ? (left+3) : 0) + offset;
    +#ifdef __SSE2__
    +                            for(; col < min(cc1 + left - 3, width) - 7; col+=8, indx+=8) {
    +                                STVFU(Gtmp[(row * width + col) >> 1], LC2VFU(rgb[1][indx]));
    +                            }
    +#endif
    +                            for(; col < min(cc1 + left - 3, width); col+=2, indx+=2) {
    +                                Gtmp[(row * width + col) >> 1] = rgb[1][indx];
                                 }
                             }
    
    @@ -365,43 +375,43 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
     #ifdef __SSE2__
                             for (; cc < cc1 - 10; cc += 8, indx += 8) {
                                 vfloat rgb1v = LC2VFU(rgb[1][indx]);
    -                            vfloat rgbcv = LC2VFU(rgb[c][indx]);
    -                            vfloat temp1v = vabsf(vabsf((rgb1v - rgbcv) - (LC2VFU(rgb[1][indx + v4]) - LC2VFU(rgb[c][indx + v4]))) +
    -                                                      vabsf(LC2VFU(rgb[1][indx - v4]) - LC2VFU(rgb[c][indx - v4]) - rgb1v + rgbcv) -
    -                                                      vabsf(LC2VFU(rgb[1][indx - v4]) - LC2VFU(rgb[c][indx - v4]) - LC2VFU(rgb[1][indx + v4]) + LC2VFU(rgb[c][indx + v4])));
    +                            vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
    +                            vfloat temp1v = vabsf(vabsf((rgb1v - rgbcv) - (LC2VFU(rgb[1][indx + v4]) - LVFU(rgb[c][(indx + v4) >> 1]))) +
    +                                                      vabsf(LC2VFU(rgb[1][indx - v4]) - LVFU(rgb[c][(indx - v4) >> 1]) - rgb1v + rgbcv) -
    +                                                      vabsf(LC2VFU(rgb[1][indx - v4]) - LVFU(rgb[c][(indx - v4) >> 1]) - LC2VFU(rgb[1][indx + v4]) + LVFU(rgb[c][(indx + v4) >> 1])));
                                 STVFU(rbhpfv[indx >> 1], temp1v);
    -                            vfloat temp2v = vabsf(vabsf((rgb1v - rgbcv) - (LC2VFU(rgb[1][indx + 4]) - LC2VFU(rgb[c][indx + 4]))) +
    -                                                      vabsf(LC2VFU(rgb[1][indx - 4]) - LC2VFU(rgb[c][indx - 4]) - rgb1v + rgbcv) -
    -                                                      vabsf(LC2VFU(rgb[1][indx - 4]) - LC2VFU(rgb[c][indx - 4]) - LC2VFU(rgb[1][indx + 4]) + LC2VFU(rgb[c][indx + 4])));
    +                            vfloat temp2v = vabsf(vabsf((rgb1v - rgbcv) - (LC2VFU(rgb[1][indx + 4]) - LVFU(rgb[c][(indx + 4) >> 1]))) +
    +                                                      vabsf(LC2VFU(rgb[1][indx - 4]) - LVFU(rgb[c][(indx - 4) >> 1]) - rgb1v + rgbcv) -
    +                                                      vabsf(LC2VFU(rgb[1][indx - 4]) - LVFU(rgb[c][(indx - 4) >> 1]) - LC2VFU(rgb[1][indx + 4]) + LVFU(rgb[c][(indx + 4) >> 1])));
                                 STVFU(rbhpfh[indx >> 1], temp2v);
    
                                 //low and high pass 1D filters of G in vertical/horizontal directions
                                 rgb1v = vmul2f(rgb1v);
    -                            vfloat glpfvv = zd25v * (rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]));
    -                            vfloat glpfhv = zd25v * (rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]));
    +                            vfloat glpfvv = (rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]));
    +                            vfloat glpfhv = (rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]));
                                 rgbcv = vmul2f(rgbcv);
    -                            STVFU(rblpfv[indx >> 1], epsv + vabsf(glpfvv - zd25v * (rgbcv + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2]))));
    -                            STVFU(rblpfh[indx >> 1], epsv + vabsf(glpfhv - zd25v * (rgbcv + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2]))));
    -                            STVFU(grblpfv[indx >> 1], glpfvv + zd25v * (rgbcv + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2])));
    -                            STVFU(grblpfh[indx >> 1], glpfhv + zd25v * (rgbcv + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2])));
    +                            STVFU(rblpfv[indx >> 1], epsv + zd25v * vabsf(glpfvv - (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
    +                            STVFU(rblpfh[indx >> 1], epsv + zd25v * vabsf(glpfhv - (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
    +                            STVFU(grblpfv[indx >> 1], zd25v * (glpfvv + (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
    +                            STVFU(grblpfh[indx >> 1], zd25v * (glpfhv + (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
                             }
    
     #endif
                             for (; cc < cc1 - 4; cc += 2, indx += 2) {
    -                            rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx]) - (rgb[1][indx + v4] - rgb[c][indx + v4])) +
    -                                                      fabsf((rgb[1][indx - v4] - rgb[c][indx - v4]) - (rgb[1][indx] - rgb[c][indx])) -
    -                                                      fabsf((rgb[1][indx - v4] - rgb[c][indx - v4]) - (rgb[1][indx + v4] - rgb[c][indx + v4])));
    -                            rbhpfh[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx]) - (rgb[1][indx + 4] - rgb[c][indx + 4])) +
    -                                                      fabsf((rgb[1][indx - 4] - rgb[c][indx - 4]) - (rgb[1][indx] - rgb[c][indx])) -
    -                                                      fabsf((rgb[1][indx - 4] - rgb[c][indx - 4]) - (rgb[1][indx + 4] - rgb[c][indx + 4])));
    +                            rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])) +
    +                                                      fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
    +                                                      fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])));
    +                            rbhpfh[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])) +
    +                                                      fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
    +                                                      fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])));
    
                                 //low and high pass 1D filters of G in vertical/horizontal directions
                                 float glpfv = 0.25f * (2.f * rgb[1][indx] + rgb[1][indx + v2] + rgb[1][indx - v2]);
                                 float glpfh = 0.25f * (2.f * rgb[1][indx] + rgb[1][indx + 2] + rgb[1][indx - 2]);
    -                            rblpfv[indx >> 1] = eps + fabsf(glpfv - 0.25f * (2.f * rgb[c][indx] + rgb[c][indx + v2] + rgb[c][indx - v2]));
    -                            rblpfh[indx >> 1] = eps + fabsf(glpfh - 0.25f * (2.f * rgb[c][indx] + rgb[c][indx + 2] + rgb[c][indx - 2]));
    -                            grblpfv[indx >> 1] = glpfv + 0.25f * (2.f * rgb[c][indx] + rgb[c][indx + v2] + rgb[c][indx - v2]);
    -                            grblpfh[indx >> 1] = glpfh + 0.25f * (2.f * rgb[c][indx] + rgb[c][indx + 2] + rgb[c][indx - 2]);
    +                            rblpfv[indx >> 1] = eps + fabsf(glpfv - 0.25f * (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]));
    +                            rblpfh[indx >> 1] = eps + fabsf(glpfh - 0.25f * (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]));
    +                            grblpfv[indx >> 1] = glpfv + 0.25f * (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]);
    +                            grblpfh[indx >> 1] = glpfh + 0.25f * (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]);
                             }
                         }
    
    @@ -439,23 +449,23 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                                 //solve for the interpolation position that minimizes colour difference variance over the tile
    
                                 //vertical
    -                            vfloat gdiffv = zd3125v * (LC2VFU(rgb[1][indx + ts]) - LC2VFU(rgb[1][indx - ts])) + zd09375v * (LC2VFU(rgb[1][indx + ts + 1]) - LC2VFU(rgb[1][indx - ts + 1]) + LC2VFU(rgb[1][indx + ts - 1]) - LC2VFU(rgb[1][indx - ts - 1]));
    -                            vfloat deltgrbv = LC2VFU(rgb[c][indx]) - LC2VFU(rgb[1][indx]);
    +                            vfloat gdiffvv = zd3125v * (LC2VFU(rgb[1][indx + ts]) - LC2VFU(rgb[1][indx - ts])) + zd09375v * (LC2VFU(rgb[1][indx + ts + 1]) - LC2VFU(rgb[1][indx - ts + 1]) + LC2VFU(rgb[1][indx + ts - 1]) - LC2VFU(rgb[1][indx - ts - 1]));
    +                            vfloat deltgrbv = LVFU(rgb[c][indx >> 1]) - LC2VFU(rgb[1][indx]);
    
    -                            vfloat gradwtv = vabsf(zd25v * LVFU(rbhpfv[indx >> 1]) + zd125v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1])) ) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) + LVFU(rblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) + v1]));
    +                            vfloat gradwtv = (zd25v * LVFU(rbhpfv[indx >> 1]) + zd125v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1])) ) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) + LVFU(rblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) + v1]));
    
                                 coeff00v += gradwtv * deltgrbv * deltgrbv;
    -                            coeff01v += gradwtv * gdiffv * deltgrbv;
    -                            coeff02v += gradwtv * gdiffv * gdiffv;
    +                            coeff01v += gradwtv * gdiffvv * deltgrbv;
    +                            coeff02v += gradwtv * gdiffvv * gdiffvv;
    
                                 //horizontal
    -                            gdiffv = zd3125v * (LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + zd09375v * (LC2VFU(rgb[1][indx + 1 + ts]) - LC2VFU(rgb[1][indx - 1 + ts]) + LC2VFU(rgb[1][indx + 1 - ts]) - LC2VFU(rgb[1][indx - 1 - ts]));
    +                            vfloat gdiffhv = zd3125v * (LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + zd09375v * (LC2VFU(rgb[1][indx + 1 + ts]) - LC2VFU(rgb[1][indx - 1 + ts]) + LC2VFU(rgb[1][indx + 1 - ts]) - LC2VFU(rgb[1][indx - 1 - ts]));
    
    -                            gradwtv = vabsf(zd25v * LVFU(rbhpfh[indx >> 1]) + zd125v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1])) ) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) + LVFU(rblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) + 1]));
    +                            gradwtv = (zd25v * LVFU(rbhpfh[indx >> 1]) + zd125v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1])) ) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) + LVFU(rblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) + 1]));
    
                                 coeff10v += gradwtv * deltgrbv * deltgrbv;
    -                            coeff11v += gradwtv * gdiffv * deltgrbv;
    -                            coeff12v += gradwtv * gdiffv * gdiffv;
    +                            coeff11v += gradwtv * gdiffhv * deltgrbv;
    +                            coeff12v += gradwtv * gdiffhv * gdiffhv;
    
                                 //  In Mathematica,
                                 //  f[x_]=Expand[Total[Flatten[
    @@ -477,7 +487,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
    
                                 //vertical
                                 float gdiff = 0.3125f * (rgb[1][indx + ts] - rgb[1][indx - ts]) + 0.09375f * (rgb[1][indx + ts + 1] - rgb[1][indx - ts + 1] + rgb[1][indx + ts - 1] - rgb[1][indx - ts - 1]);
    -                            float deltgrb = (rgb[c][indx] - rgb[1][indx]);
    +                            float deltgrb = (rgb[c][indx >> 1] - rgb[1][indx]);
    
                                 float gradwt = fabsf(0.25f * rbhpfv[indx >> 1] + 0.125f * (rbhpfv[(indx >> 1) + 1] + rbhpfv[(indx >> 1) - 1]) ) * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) / (eps + 0.1f * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) + rblpfv[(indx >> 1) - v1] + rblpfv[(indx >> 1) + v1]);
    
    @@ -725,10 +735,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                                 int c = FC(rr, cc);
                                 int indx = row * width + col;
                                 int indx1 = rr * ts + cc;
    -                            rgb[c][indx1] = (rawData[row][col]) / 65535.0f;
    +                            rgb[c][indx1 >> ((c & 1) ^ 1)] = (rawData[row][col]) / 65535.0f;
    
                                 if ((c & 1) == 0) {
    -                                rgb[1][indx1] = Gtmp[indx];
    +                                rgb[1][indx1] = Gtmp[indx >> 1];
                                 }
                             }
    
    @@ -738,7 +748,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = ccmin; cc < ccmax; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][rr * ts + cc] = rgb[c][(border2 - rr) * ts + cc];
    +                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                                     rgb[1][rr * ts + cc] = rgb[1][(border2 - rr) * ts + cc];
                                 }
                         }
    @@ -747,8 +757,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = ccmin; cc < ccmax; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rrmax + rr)*ts + cc] = (rawData[(height - rr - 2)][left + cc]) / 65535.0f;
    -                                rgb[1][(rrmax + rr)*ts + cc] = Gtmp[(height - rr - 2) * width + left + cc];
    +                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][left + cc]) / 65535.0f;
    +                                if ((c & 1) == 0) {
    +                                    rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + left + cc) >> 1];
    +                                }
                                 }
                         }
    
    @@ -756,7 +768,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = rrmin; rr < rrmax; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][rr * ts + cc] = rgb[c][rr * ts + border2 - cc];
    +                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
                                     rgb[1][rr * ts + cc] = rgb[1][rr * ts + border2 - cc];
                                 }
                         }
    @@ -765,8 +777,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = rrmin; rr < rrmax; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][rr * ts + ccmax + cc] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.0f;
    -                                rgb[1][rr * ts + ccmax + cc] = Gtmp[(top + rr) * width + (width - cc - 2)];
    +                                rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.0f;
    +                                if ((c & 1) == 0) {
    +                                    rgb[1][rr * ts + ccmax + cc] = Gtmp[((top + rr) * width + (width - cc - 2)) >> 1];
    +                                }
                                 }
                         }
    
    @@ -775,8 +789,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rr)*ts + cc] = (rawData[border2 - rr][border2 - cc]) / 65535.0f;
    -                                rgb[1][(rr)*ts + cc] = Gtmp[(border2 - rr) * width + border2 - cc];
    +                                rgb[c][((rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[border2 - rr][border2 - cc]) / 65535.0f;
    +                                if ((c & 1) == 0) {
    +                                    rgb[1][(rr)*ts + cc] = Gtmp[((border2 - rr) * width + border2 - cc) >> 1];
    +                                }
                                 }
                         }
    
    @@ -784,8 +800,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rrmax + rr)*ts + ccmax + cc] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.0f;
    -                                rgb[1][(rrmax + rr)*ts + ccmax + cc] = Gtmp[(height - rr - 2) * width + (width - cc - 2)];
    +                                rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.0f;
    +                                if ((c & 1) == 0) {
    +                                    rgb[1][(rrmax + rr)*ts + ccmax + cc] = Gtmp[((height - rr - 2) * width + (width - cc - 2)) >> 1];
    +                                }
                                 }
                         }
    
    @@ -793,8 +811,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rr)*ts + ccmax + cc] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.0f;
    -                                rgb[1][(rr)*ts + ccmax + cc] = Gtmp[(border2 - rr) * width + (width - cc - 2)];
    +                                rgb[c][((rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.0f;
    +                                if ((c & 1) == 0) {
    +                                    rgb[1][(rr)*ts + ccmax + cc] = Gtmp[((border2 - rr) * width + (width - cc - 2)) >> 1];
    +                                }
                                 }
                         }
    
    @@ -802,8 +822,10 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                             for (int rr = 0; rr < border; rr++)
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
    -                                rgb[c][(rrmax + rr)*ts + cc] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.0f;
    -                                rgb[1][(rrmax + rr)*ts + cc] = Gtmp[(height - rr - 2) * width + (border2 - cc)];
    +                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.0f;
    +                                if ((c & 1) == 0) {
    +                                    rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + (border2 - cc)) >> 1];
    +                                }
                                 }
                         }
    
    @@ -819,18 +841,18 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
    
                                     if (c != 1) {
                                         //compute directional weights using image gradients
    -                                    float wtu = 1.0 / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr) * ts + cc] - rgb[c][(rr - 2) * ts + cc]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
    -                                    float wtd = 1.0 / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr) * ts + cc] - rgb[c][(rr + 2) * ts + cc]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
    -                                    float wtl = 1.0 / SQR(eps + fabsf(rgb[1][(rr) * ts + cc + 1] - rgb[1][(rr) * ts + cc - 1]) + fabsf(rgb[c][(rr) * ts + cc] - rgb[c][(rr) * ts + cc - 2]) + fabsf(rgb[1][(rr) * ts + cc - 1] - rgb[1][(rr) * ts + cc - 3]));
    -                                    float wtr = 1.0 / SQR(eps + fabsf(rgb[1][(rr) * ts + cc - 1] - rgb[1][(rr) * ts + cc + 1]) + fabsf(rgb[c][(rr) * ts + cc] - rgb[c][(rr) * ts + cc + 2]) + fabsf(rgb[1][(rr) * ts + cc + 1] - rgb[1][(rr) * ts + cc + 3]));
    +                                    float wtu = 1.0 / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][((rr) * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
    +                                    float wtd = 1.0 / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][((rr) * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
    +                                    float wtl = 1.0 / SQR(eps + fabsf(rgb[1][(rr) * ts + cc + 1] - rgb[1][(rr) * ts + cc - 1]) + fabsf(rgb[c][((rr) * ts + cc) >> 1] - rgb[c][((rr) * ts + cc - 2) >> 1]) + fabsf(rgb[1][(rr) * ts + cc - 1] - rgb[1][(rr) * ts + cc - 3]));
    +                                    float wtr = 1.0 / SQR(eps + fabsf(rgb[1][(rr) * ts + cc - 1] - rgb[1][(rr) * ts + cc + 1]) + fabsf(rgb[c][((rr) * ts + cc) >> 1] - rgb[c][((rr) * ts + cc + 2) >> 1]) + fabsf(rgb[1][(rr) * ts + cc + 1] - rgb[1][(rr) * ts + cc + 3]));
    
                                         //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                         rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
    +                                    if (row > -1 && row < height && col > -1 && col < width) {
    +                                        Gtmp[(row * width + col) >> 1] = rgb[1][indx];
    +                                    }
                                     }
    
    -                                if (row > -1 && row < height && col > -1 && col < width) {
    -                                    Gtmp[row * width + col] = rgb[1][indx];
    -                                }
                                 }
    
                             float hfrac = -((float)(hblock - 0.5) / (hblsz - 2) - 0.5);
    @@ -896,7 +918,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
    
                                 //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
                                 //but first we need to interpolate G-R/G-B to grid points...
    -                            STVFU(grbdiff[((rr)*ts + cc) >> 1], Gintv - LC2VFU(rgb[c][(rr) * ts + cc]));
    +                            STVFU(grbdiff[((rr)*ts + cc) >> 1], Gintv - LVFU(rgb[c][((rr) * ts + cc) >> 1]));
                                 STVFU(gshift[((rr)*ts + cc) >> 1], Gintv);
                             }
    
    @@ -910,7 +932,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
    
                                 //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
                                 //but first we need to interpolate G-R/G-B to grid points...
    -                            grbdiff[((rr)*ts + cc) >> 1] = Gint - rgb[c][(rr) * ts + cc];
    +                            grbdiff[((rr)*ts + cc) >> 1] = Gint - rgb[c][((rr) * ts + cc) >> 1];
                                 gshift[((rr)*ts + cc) >> 1] = Gint;
                             }
                         }
    @@ -920,11 +942,48 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         shiftvfrac[0] /= 2.f;
                         shiftvfrac[2] /= 2.f;
    
    -                    // this loop does not deserve vectorization in mainly because the most expensive part with the divisions does not happen often (less than 1/10 in my tests)
    -                    for (int rr = 8; rr < rr1 - 8; rr++)
    -                        for (int cc = 8 + (FC(rr, 2) & 1), c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
    +#ifdef __SSE2__
    +                    vfloat zd25v = F2V(0.25f);
    +                    vfloat onev = F2V(1.f);
    +                    vfloat zd5v = F2V(0.5f);
    +                    vfloat epsv = F2V(eps);
    +#endif
    +                    for (int rr = 8; rr < rr1 - 8; rr++) {
    +                        int cc = 8 + (FC(rr, 2) & 1);
    +#ifdef __SSE2__
    +                        for (int c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 14; cc += 8, indx += 8) {
    +                            vfloat cinv = LVFU(rgb[c][indx >> 1]);
    +                            vfloat rinv = LC2VFU(rgb[1][indx]);
    +                            vfloat grbdiffold = rinv - cinv;
    +
    +                            //interpolate colour difference from optical R/B locations to grid locations
    +                            vfloat grbdiffinthfloor = vintpf(F2V(shifthfrac[c]), LVFU(grbdiff[(indx - GRBdir[1][c]) >> 1]), LVFU(grbdiff[indx >> 1]));
    +                            vfloat grbdiffinthceil = vintpf(F2V(shifthfrac[c]), LVFU(grbdiff[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1]), LVFU(grbdiff[((rr - GRBdir[0][c]) * ts + cc) >> 1]));
    +                            //grbdiffint is bilinear interpolation of G-R/G-B at grid point
    +                            vfloat grbdiffint = vintpf(F2V(shiftvfrac[c]), grbdiffinthceil, grbdiffinthfloor);
    +
    +                            //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
    +                            vfloat RBint = rinv - grbdiffint;
    +                            vmask cmask = vmaskf_ge(vabsf(RBint - cinv), zd25v * (RBint + cinv));
    +                            if(_mm_movemask_ps((vfloat)cmask)) {
    +                                //gradient weights using difference from G at CA shift points and G at grid points
    +                                vfloat p0 = onev / (epsv + vabsf(rinv - LVFU(gshift[indx >> 1])));
    +                                vfloat p1 = onev / (epsv + vabsf(rinv - LVFU(gshift[(indx - GRBdir[1][c]) >> 1])));
    +                                vfloat p2 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir[0][c]) * ts + cc) >> 1])));
    +                                vfloat p3 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1])));
    
    -                            float grbdiffold = rgb[1][indx] - rgb[c][indx];
    +                                grbdiffint = vself(cmask, (p0 * LVFU(grbdiff[indx >> 1]) + p1 * LVFU(grbdiff[(indx - GRBdir[1][c]) >> 1]) +
    +                                              p2 * LVFU(grbdiff[((rr - GRBdir[0][c]) * ts + cc) >> 1]) + p3 * LVFU(grbdiff[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1])) / (p0 + p1 + p2 + p3), grbdiffint);
    +
    +                            }
    +                            RBint = rinv - grbdiffint;
    +                            RBint = vself(vmaskf_gt(vabsf(grbdiffold), vabsf(grbdiffint)), RBint, cinv);
    +                            RBint = vself(vmaskf_lt(grbdiffold * grbdiffint, ZEROV), rinv - zd5v * (grbdiffold + grbdiffint), RBint);
    +                            STVFU(rgb[c][indx >> 1], RBint);
    +                        }
    +#endif
    +                        for (int c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
    +                            float grbdiffold = rgb[1][indx] - rgb[c][indx >> 1];
    
                                 //interpolate colour difference from optical R/B locations to grid locations
                                 float grbdiffinthfloor = intp(shifthfrac[c], grbdiff[(indx - GRBdir[1][c]) >> 1], grbdiff[indx >> 1]);
    @@ -935,9 +994,9 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                                 //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
                                 float RBint = rgb[1][indx] - grbdiffint;
    
    -                            if (fabsf(RBint - rgb[c][indx]) < 0.25f * (RBint + rgb[c][indx])) {
    +                            if (fabsf(RBint - rgb[c][indx >> 1]) < 0.25f * (RBint + rgb[c][indx >> 1])) {
                                     if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
    -                                    rgb[c][indx] = RBint;
    +                                    rgb[c][indx >> 1] = RBint;
                                     }
                                 } else {
    
    @@ -952,22 +1011,23 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
    
                                     //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
                                     if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
    -                                    rgb[c][indx] = rgb[1][indx] - grbdiffint;
    +                                    rgb[c][indx >> 1] = rgb[1][indx] - grbdiffint;
                                     }
                                 }
    
                                 //if colour difference interpolation overshot the correction, just desaturate
                                 if (grbdiffold * grbdiffint < 0) {
    -                                rgb[c][indx] = rgb[1][indx] - 0.5f * (grbdiffold + grbdiffint);
    +                                rgb[c][indx >> 1] = rgb[1][indx] - 0.5f * (grbdiffold + grbdiffint);
                                 }
                             }
    +                    }
    
                         // copy CA corrected results to temporary image matrix
                         for (int rr = border; rr < rr1 - border; rr++) {
                             int c = FC(rr + top, left + border + (FC(rr + top, 2) & 1));
    
                             for (int row = rr + top, cc = border + (FC(rr, 2) & 1), indx = (row * width + cc + left) >> 1; cc < cc1 - border; cc += 2, indx++) {
    -                            RawDataTmp[indx] = 65535.0f * rgb[c][(rr) * ts + cc];
    +                            RawDataTmp[indx] = 65535.0f * rgb[c][((rr) * ts + cc) >> ((c & 1) ^ 1)];
                             }
                         }
    
    </float>
    点赞 评论 复制链接分享
  • weixin_39611666 weixin_39611666 5月前

    I created cacorrect_speedup branch with some more optimizations. I tested with blue_horse.nef. There are very very small differences in output before/after which are not visible by eyeball. I measured on my old (Processor: Pentium(R)\ Dual-Core\ CPU\ T4500\ @\ 2.30GHz) laptop using Sabayon 64bit and gcc 4.9.3 before: ~1180 ms after: ~850 ms

    点赞 评论 复制链接分享
  • weixin_39710951 weixin_39710951 5月前

    Hola! Without the patch ~500ms, with 779f4dd12e0109e2c1790ff5df9785a35add0deb ~300ms.

    点赞 评论 复制链接分享
  • weixin_39929138 weixin_39929138 5月前

    I just tested with and whithout patch. No differences, or very very small. Very good job :)

    Jacques

    点赞 评论 复制链接分享
  • weixin_39611666 weixin_39611666 5月前

    Jacques, thanks for testing merged

    点赞 评论 复制链接分享

相关推荐