From f154c6e2e54c0b4e552d0f302e79b65545b24d47 Mon Sep 17 00:00:00 2001 From: jaimefrio Date: Sun, 6 Apr 2014 00:50:14 -0700 Subject: [PATCH] ENH: Use Welford's method in stats.moments.rolling_var This PR implements a modified version of Welford's method to compute the rolling variance. Instead of keeping track of the sum and sum of the squares of the items in the window, it tracks the mean and the sum of squared differences from the mean. This turns out to be (much) more numerically stable. The formulas to update these two variables when adding or removing an item from the sequence are well known, see e.g. http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance The formulas used when both adding one and removing one item I have not seen explicitly worked out anywhere, but are not too hard to come up with if you put pen to (a lot of) paper. --- doc/source/release.rst | 1 + pandas/algos.pyx | 74 ++++++++++++++++++++---------- pandas/stats/moments.py | 4 +- pandas/stats/tests/test_moments.py | 18 ++++++-- 4 files changed, 67 insertions(+), 30 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 7824a69c92561..c494db2ae91c4 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -276,6 +276,7 @@ Improvements to existing features - Add option to turn off escaping in ``DataFrame.to_latex`` (:issue:`6472`) - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max, :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`) +- ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`) .. _release.bug_fixes-0.14.0: diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 27e25c3954dad..bba6b46c52e37 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1122,7 +1122,10 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): # Rolling variance def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): - cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0 + """ + Numerically stable implementation using Welford's method. + """ + cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta cdef Py_ssize_t i cdef Py_ssize_t N = len(input) @@ -1130,48 +1133,71 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): minp = _check_minp(win, minp, N) - for i from 0 <= i < minp - 1: + for i from 0 <= i < win: val = input[i] # Not NaN if val == val: nobs += 1 - sum_x += val - sum_xx += val * val + delta = (val - mean_x) + mean_x += delta / nobs + ssqdm_x += delta * (val - mean_x) - output[i] = NaN + if nobs >= minp: + #pathological case + if nobs == 1: + val = 0 + else: + val = ssqdm_x / (nobs - ddof) + if val < 0: + val = 0 + else: + val = NaN - for i from minp - 1 <= i < N: + output[i] = val + + for i from win <= i < N: val = input[i] + prev = input[i - win] if val == val: - nobs += 1 - sum_x += val - sum_xx += val * val - - if i > win - 1: - prev = input[i - win] if prev == prev: - sum_x -= prev - sum_xx -= prev * prev - nobs -= 1 + delta = val - prev + prev -= mean_x + mean_x += delta / nobs + val -= mean_x + ssqdm_x += (val + prev) * delta + else: + nobs += 1 + delta = (val - mean_x) + mean_x += delta / nobs + ssqdm_x += delta * (val - mean_x) + elif prev == prev: + nobs -= 1 + if nobs: + delta = (prev - mean_x) + mean_x -= delta / nobs + ssqdm_x -= delta * (prev - mean_x) + else: + mean_x = 0 + ssqdm_x = 0 if nobs >= minp: - # pathological case + #pathological case if nobs == 1: - output[i] = 0 - continue - - val = (nobs * sum_xx - sum_x * sum_x) / (nobs * (nobs - ddof)) - if val < 0: val = 0 - - output[i] = val + else: + val = ssqdm_x / (nobs - ddof) + if val < 0: + val = 0 else: - output[i] = NaN + val = NaN + + output[i] = val return output + #------------------------------------------------------------------------------- # Rolling skewness diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 246037c7d7009..42da19f1a241d 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -751,7 +751,7 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, * ``gaussian`` (needs std) * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). - + By default, the result is set to the right edge of the window. This can be changed to the center of the window by setting ``center=True``. @@ -978,7 +978,7 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False, Returns ------- y : type of input argument - + Notes ----- The `freq` keyword is used to conform time series data to a specified diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index 8c9eb080cfc61..06e7484bbd536 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -295,7 +295,8 @@ def test_rolling_std_neg_sqrt(self): def test_rolling_var(self): self._check_moment_func(mom.rolling_var, - lambda x: np.var(x, ddof=1)) + lambda x: np.var(x, ddof=1), + test_stable=True) self._check_moment_func(functools.partial(mom.rolling_var, ddof=0), lambda x: np.var(x, ddof=0)) @@ -349,13 +350,15 @@ def _check_moment_func(self, func, static_comp, window=50, has_center=True, has_time_rule=True, preserve_nan=True, - fill_value=None): + fill_value=None, + test_stable=False): self._check_ndarray(func, static_comp, window=window, has_min_periods=has_min_periods, preserve_nan=preserve_nan, has_center=has_center, - fill_value=fill_value) + fill_value=fill_value, + test_stable=test_stable) self._check_structures(func, static_comp, has_min_periods=has_min_periods, @@ -367,7 +370,8 @@ def _check_ndarray(self, func, static_comp, window=50, has_min_periods=True, preserve_nan=True, has_center=True, - fill_value=None): + fill_value=None, + test_stable=False): result = func(self.arr, window) assert_almost_equal(result[-1], @@ -425,6 +429,12 @@ def _check_ndarray(self, func, static_comp, window=50, self.assert_(np.isnan(expected[-5])) self.assert_(np.isnan(result[-14])) + if test_stable: + result = func(self.arr + 1e9, window) + assert_almost_equal(result[-1], + static_comp(self.arr[-50:] + 1e9)) + + def _check_structures(self, func, static_comp, has_min_periods=True, has_time_rule=True, has_center=True,