diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
index a3d4e3e..4ec7f38 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
@@ -65,29 +65,31 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     // 41 x 41, if halfsearchrange=20
     cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream);
 
-    // Estimation of statistics
-    // Extraction of correlation surface around the peak
+    // estimate variance
+    cuEstimateVariance(r_referenceBatchRaw->size, r_corrBatchRaw, offsetInit, r_maxval, r_covValue, stream);
+
+    // estimate SNR
+    // step1: extraction of correlation surface around the peak
     cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchRawZoomIn, i_corrBatchZoomInValid, offsetInit, stream);
 
-    // Summation of correlation and data point values
+    // step2: summation of correlation and data point values
     cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
 
 #ifdef CUAMPCOR_DEBUG
+    r_maxval->outputToFile("r_maxval", stream);
+    r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream);
     i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid", stream);
     r_corrBatchSum->outputToFile("r_corrBatchSum", stream);
+    i_corrBatchValidCount->outputToFile("i_corrBatchValidCount", stream);
 #endif
 
-    // SNR
+    // step3: divide the peak value by the mean of surrounding values
     cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream);
 
-    // Variance
-    cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_covValue, stream);
-
 #ifdef CUAMPCOR_DEBUG
     offsetInit->outputToFile("i_offsetInit", stream);
-    r_maxval->outputToFile("r_maxval", stream);
-    r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream);
-    i_corrBatchZoomInValid->outputToFile("i_corrBatchStatZoomInValid", stream);
+    r_snrValue->outputToFile("r_snrValue", stream);
+    r_covValue->outputToFile("r_covValue", stream);
 #endif
 
     // Using the approximate estimation to adjust secondary image (half search window size becomes only 4 pixels)
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
index 752bc88..bdfbaa1 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
@@ -91,7 +91,7 @@ void cuArraysSumCorr(cuArrays<float> *images, cuArrays<int> *imagesValid, cuArra
 void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream);
 
 // implemented in cuEstimateStats.cu
-void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
+void cuEstimateVariance(int winSize, cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
 
 #endif
 
diff --git a/contrib/PyCuAmpcor/src/cuEstimateStats.cu b/contrib/PyCuAmpcor/src/cuEstimateStats.cu
index 7973f7b..0a24420 100644
--- a/contrib/PyCuAmpcor/src/cuEstimateStats.cu
+++ b/contrib/PyCuAmpcor/src/cuEstimateStats.cu
@@ -46,7 +46,7 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
 }
 
 // cuda kernel for cuEstimateVariance
-__global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY,
+__global__ void cudaKernel_estimateVar(const int winSize, const float* corrBatchRaw, const int NX, const int NY,
     const int2* maxloc, const float* maxval, float3* covValue, const int size)
 {
 
@@ -78,14 +78,12 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX,
         int idx21 = offset + (px + 1) * NY + py    ;
         int idx22 = offset + (px + 1) * NY + py + 1;
 
-        float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2*corrBatchRaw[idx11] ) * 0.5;
-        float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2*corrBatchRaw[idx11] ) * 0.5;
-        float dxy = - ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25;
+        float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2*corrBatchRaw[idx11] ) * 1.0;
+        float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2*corrBatchRaw[idx11] ) * 1.0;
+        float dxy = ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25;
 
         float n2 = fmaxf(1 - peak, 0.0);
 
-        int winSize = NX*NY;
-
         dxx = dxx * winSize;
         dyy = dyy * winSize;
         dxy = dxy * winSize;
@@ -113,18 +111,19 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX,
 
 /**
  * Estimate the variance of the correlation surface
+ * @param[in] winSize size of reference chip
  * @param[in] corrBatchRaw correlation surface
  * @param[in] maxloc maximum location
  * @param[in] maxval maximum value
  * @param[out] covValue variance value
  * @param[in] stream cuda stream
  */
-void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream)
+void cuEstimateVariance(int winSize, cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream)
 {
     int size = corrBatchRaw->count;
     // One dimensional launching parameters to loop over every correlation surface.
     cudaKernel_estimateVar<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>
-        (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, covValue->devData, size);
+        (winSize, corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, covValue->devData, size);
     getLastCudaError("cudaKernel_estimateVar error\n");
 }
-//end of file
\ No newline at end of file
+//end of file