From f154c6e2e54c0b4e552d0f302e79b65545b24d47 Mon Sep 17 00:00:00 2001
From: jaimefrio <jaime.frio@gmail.com>
Date: Sun, 6 Apr 2014 00:50:14 -0700
Subject: [PATCH] ENH: Use Welford's method in stats.moments.rolling_var

This PR implements a modified version of Welford's method to compute
the rolling variance. Instead of keeping track of the sum and sum of
the squares of the items in the window, it tracks the mean and the sum
of squared differences from the mean. This turns out to be (much) more
numerically stable.

The formulas to update these two variables when adding or removing an
item from the sequence are well known, see e.g.

http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance

The formulas used when both adding one and removing one item I have
not seen explicitly worked out anywhere, but are not too hard to come
up with if you put pen to (a lot of) paper.
---
 doc/source/release.rst             |  1 +
 pandas/algos.pyx                   | 74 ++++++++++++++++++++----------
 pandas/stats/moments.py            |  4 +-
 pandas/stats/tests/test_moments.py | 18 ++++++--
 4 files changed, 67 insertions(+), 30 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 7824a69c92561..c494db2ae91c4 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -276,6 +276,7 @@ Improvements to existing features
 - Add option to turn off escaping in ``DataFrame.to_latex`` (:issue:`6472`)
 - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max,
   :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`)
+- ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index 27e25c3954dad..bba6b46c52e37 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -1122,7 +1122,10 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
 # Rolling variance
 
 def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
-    cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0
+    """
+    Numerically stable implementation using Welford's method.
+    """
+    cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
     cdef Py_ssize_t i
     cdef Py_ssize_t N = len(input)
 
@@ -1130,48 +1133,71 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
 
     minp = _check_minp(win, minp, N)
 
-    for i from 0 <= i < minp - 1:
+    for i from 0 <= i < win:
         val = input[i]
 
         # Not NaN
         if val == val:
             nobs += 1
-            sum_x += val
-            sum_xx += val * val
+            delta = (val - mean_x)
+            mean_x += delta / nobs
+            ssqdm_x += delta * (val - mean_x)
 
-        output[i] = NaN
+        if nobs >= minp:
+            #pathological case
+            if nobs == 1:
+                val = 0
+            else:
+                val = ssqdm_x / (nobs - ddof)
+                if val < 0:
+                    val = 0
+        else:
+            val = NaN
 
-    for i from minp - 1 <= i < N:
+        output[i] = val
+
+    for i from win <= i < N:
         val = input[i]
+        prev = input[i - win]
 
         if val == val:
-            nobs += 1
-            sum_x += val
-            sum_xx += val * val
-
-        if i > win - 1:
-            prev = input[i - win]
             if prev == prev:
-                sum_x -= prev
-                sum_xx -= prev * prev
-                nobs -= 1
+                delta = val - prev
+                prev -= mean_x
+                mean_x += delta / nobs
+                val -= mean_x
+                ssqdm_x += (val + prev) * delta
+            else:
+                nobs += 1
+                delta = (val - mean_x)
+                mean_x += delta / nobs
+                ssqdm_x += delta * (val - mean_x)
+        elif prev == prev:
+            nobs -= 1
+            if nobs:
+                delta = (prev - mean_x)
+                mean_x -= delta  / nobs
+                ssqdm_x -= delta * (prev - mean_x)
+            else:
+                mean_x = 0
+                ssqdm_x = 0
 
         if nobs >= minp:
-            # pathological case
+            #pathological case
             if nobs == 1:
-                output[i] = 0
-                continue
-
-            val = (nobs * sum_xx - sum_x * sum_x) / (nobs * (nobs - ddof))
-            if val < 0:
                 val = 0
-
-            output[i] = val
+            else:
+                val = ssqdm_x / (nobs - ddof)
+                if val < 0:
+                    val = 0
         else:
-            output[i] = NaN
+            val = NaN
+
+        output[i] = val
 
     return output
 
+
 #-------------------------------------------------------------------------------
 # Rolling skewness
 
diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py
index 246037c7d7009..42da19f1a241d 100644
--- a/pandas/stats/moments.py
+++ b/pandas/stats/moments.py
@@ -751,7 +751,7 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None,
     * ``gaussian`` (needs std)
     * ``general_gaussian`` (needs power, width)
     * ``slepian`` (needs width).
-    
+
     By default, the result is set to the right edge of the window. This can be
     changed to the center of the window by setting ``center=True``.
 
@@ -978,7 +978,7 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False,
     Returns
     -------
     y : type of input argument
-    
+
     Notes
     -----
     The `freq` keyword is used to conform time series data to a specified
diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py
index 8c9eb080cfc61..06e7484bbd536 100644
--- a/pandas/stats/tests/test_moments.py
+++ b/pandas/stats/tests/test_moments.py
@@ -295,7 +295,8 @@ def test_rolling_std_neg_sqrt(self):
 
     def test_rolling_var(self):
         self._check_moment_func(mom.rolling_var,
-                                lambda x: np.var(x, ddof=1))
+                                lambda x: np.var(x, ddof=1),
+                                test_stable=True)
         self._check_moment_func(functools.partial(mom.rolling_var, ddof=0),
                                 lambda x: np.var(x, ddof=0))
 
@@ -349,13 +350,15 @@ def _check_moment_func(self, func, static_comp, window=50,
                            has_center=True,
                            has_time_rule=True,
                            preserve_nan=True,
-                           fill_value=None):
+                           fill_value=None,
+                           test_stable=False):
 
         self._check_ndarray(func, static_comp, window=window,
                             has_min_periods=has_min_periods,
                             preserve_nan=preserve_nan,
                             has_center=has_center,
-                            fill_value=fill_value)
+                            fill_value=fill_value,
+                            test_stable=test_stable)
 
         self._check_structures(func, static_comp,
                                has_min_periods=has_min_periods,
@@ -367,7 +370,8 @@ def _check_ndarray(self, func, static_comp, window=50,
                        has_min_periods=True,
                        preserve_nan=True,
                        has_center=True,
-                       fill_value=None):
+                       fill_value=None,
+                       test_stable=False):
 
         result = func(self.arr, window)
         assert_almost_equal(result[-1],
@@ -425,6 +429,12 @@ def _check_ndarray(self, func, static_comp, window=50,
                 self.assert_(np.isnan(expected[-5]))
                 self.assert_(np.isnan(result[-14]))
 
+        if test_stable:
+            result = func(self.arr + 1e9, window)
+            assert_almost_equal(result[-1],
+                                static_comp(self.arr[-50:] + 1e9))
+
+
     def _check_structures(self, func, static_comp,
                           has_min_periods=True, has_time_rule=True,
                           has_center=True,