Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1263,7 +1263,8 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
- Bug in :meth:`Rolling.skew` incorrectly computing skewness for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`)
- Bug in :meth:`Rolling.skew` and in :meth:`Rolling.kurt` incorrectly computing skewness and kurtosis, respectively, for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`, :issue:`61416`)
- Bug in :meth:`Rolling.skew` and in :meth:`Rolling.kurt` where results varied with input length despite identical data and window contents (:issue:`54380`)
- Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
- Bug in :meth:`Series.resample` raising error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`)
- Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` computing incorrect results due to numerical instability. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`)
Expand Down
220 changes: 102 additions & 118 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from libc.math cimport (
fabs,
round,
signbit,
sqrt,
)
Expand Down Expand Up @@ -683,30 +682,22 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,


cdef float64_t calc_kurt(int64_t minp, int64_t nobs,
float64_t x, float64_t xx,
float64_t xxx, float64_t xxxx,
float64_t m2, float64_t m4,
int64_t num_consecutive_same_value,
) noexcept nogil:
cdef:
float64_t result, dnobs
float64_t A, B, C, D, R, K
float64_t result, dnobs, term1, term2, inner, correction
float64_t moments_ratio

if nobs >= minp:
if nobs < 4:
result = NaN
# GH 42064 46431
# uniform case, force result to be -3.
# degenerate case, force result to be -3.
elif num_consecutive_same_value >= nobs:
result = -3.
else:
dnobs = <float64_t>nobs
A = x / dnobs
R = A * A
B = xx / dnobs - R
R = R * A
C = xxx / dnobs - R - 3 * A * B
R = R * A
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A

# #18044: with uniform distribution, floating issue will
# cause B != 0. and cause the result is a very
Expand All @@ -716,52 +707,61 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs,
# _zero_out_fperr(m2) to fix floating error.
# if the variance is less than 1e-14, it could be
# treat as zero, here we follow the original
# skew/kurt behaviour to check B <= 1e-14
if B <= 1e-14:
# skew/kurt behaviour to check m2 <= n * 1e-14
if m2 <= dnobs * 1e-14:
result = NaN
else:
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
result = K / ((dnobs - 2.) * (dnobs - 3.))
moments_ratio = m4 / (m2 * m2)
term1 = dnobs * (dnobs + 1.0) * moments_ratio
term2 = 3.0 * (dnobs - 1.0)
inner = term1 - term2

correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0))
result = correction * inner
else:
result = NaN

return result


cdef void add_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx,
float64_t *mean, float64_t *m2,
float64_t *m3, float64_t *m4,
bint *numerically_unstable,
int64_t *num_consecutive_same_value,
float64_t *prev_value
) noexcept nogil:
""" add a value from the kurotic calc """
cdef:
float64_t y, t
float64_t n, delta, delta_n, term1, m4_update, new_m4

# Formulas adapted from
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics

# Not NaN
if val == val:
nobs[0] = nobs[0] + 1
nobs[0] += 1
n = <float64_t>(nobs[0])
delta = val - mean[0]
delta_n = delta / n
term1 = delta * delta_n * (n - 1.0)

m4_update = delta_n * (
-4.0 * m3[0]
+ delta_n * (
6 * m2[0] + term1 * (n * n - 3.0 * n + 3.0)
)
)
new_m4 = m4[0] + m4_update

if (fabs(m4_update) + fabs(m4[0])) * InvCondTol > fabs(new_m4):
# possible catastrophic cancellation
numerically_unstable[0] = True

y = val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t
m4[0] = new_m4
m3[0] += delta_n * (term1 * (n - 2.0) - 3.0 * m2[0])
m2[0] += term1
mean[0] += delta_n

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
Expand All @@ -773,125 +773,109 @@ cdef void add_kurt(float64_t val, int64_t *nobs,


cdef void remove_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) noexcept nogil:
float64_t *mean, float64_t *m2,
float64_t *m3, float64_t *m4,
bint *numerically_unstable,
) noexcept nogil:
""" remove a value from the kurotic calc """
cdef:
float64_t y, t
float64_t n, delta, delta_n, term1, m4_update, new_m4

# Not NaN
if val == val:
nobs[0] = nobs[0] - 1
nobs[0] -= 1
n = <float64_t>(nobs[0])
delta = val - mean[0]
delta_n = delta / n
term1 = delta_n * delta * (n + 1.0)

m4_update = delta_n * (
4.0 * m3[0]
+ delta_n * (
6.0 * m2[0]
- term1 * (n * n + 3.0 * n + 3.0)
)
)
new_m4 = m4[0] + m4_update

if (fabs(m4_update) + fabs(m4[0])) * InvCondTol > fabs(new_m4):
# possible catastrophic cancellation
numerically_unstable[0] = True

y = - val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = - val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = - val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = - val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t


def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
m4[0] = new_m4
m3[0] -= delta_n * (term1 * (n + 2.0) - 3.0 * m2[0])
m2[0] -= term1
mean[0] -= delta_n


def roll_kurt(const float64_t[:] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
cdef:
Py_ssize_t i, j
float64_t val, mean_val, min_val, sum_val = 0
float64_t compensation_xxxx_add, compensation_xxxx_remove
float64_t compensation_xxx_remove, compensation_xxx_add
float64_t compensation_xx_remove, compensation_xx_add
float64_t compensation_x_remove, compensation_x_add
float64_t x, xx, xxx, xxxx
float64_t mean, m2, m3, m4
float64_t prev_value
int64_t nobs, s, e, num_consecutive_same_value
int64_t N = len(start), V = len(values), nobs_mean = 0
ndarray[float64_t] output, values_copy
int64_t N = len(start)
ndarray[float64_t] output
bint is_monotonic_increasing_bounds
bint requires_recompute, numerically_unstable = False

minp = max(minp, 4)
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
start, end
)
output = np.empty(N, dtype=np.float64)
values_copy = np.copy(values)
min_val = np.nanmin(values)

with nogil:
for i in range(0, V):
val = values_copy[i]
if val == val:
nobs_mean += 1
sum_val += val
mean_val = sum_val / nobs_mean
# Other cases would lead to imprecision for smallest values
if min_val - mean_val > -1e4:
mean_val = round(mean_val)
for i in range(0, V):
values_copy[i] = values_copy[i] - mean_val

for i in range(0, N):

s = start[i]
e = end[i]

# Over the first window, observations can only be added
# never removed
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

prev_value = values[s]
num_consecutive_same_value = 0

compensation_xxxx_add = compensation_xxxx_remove = 0
compensation_xxx_remove = compensation_xxx_add = 0
compensation_xx_remove = compensation_xx_add = 0
compensation_x_remove = compensation_x_add = 0
x = xx = xxx = xxxx = 0
nobs = 0
for j in range(s, e):
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add,
&num_consecutive_same_value, &prev_value)
requires_recompute = (
i == 0
or not is_monotonic_increasing_bounds
or s >= end[i - 1]
)

else:
if not requires_recompute:

# After the first window, observations can both be added
# and removed
# calculate deletes
for j in range(start[i - 1], s):
remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_remove, &compensation_xx_remove,
&compensation_xxx_remove, &compensation_xxxx_remove)
remove_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
&numerically_unstable)

# calculate adds
for j in range(end[i - 1], e):
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add,
add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
&numerically_unstable,
&num_consecutive_same_value, &prev_value)

if requires_recompute or numerically_unstable:

prev_value = values[s]
num_consecutive_same_value = 0

mean = m2 = m3 = m4 = 0.0
nobs = 0
for j in range(s, e):
add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
&numerically_unstable,
&num_consecutive_same_value, &prev_value)

output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx,
output[i] = calc_kurt(minp, nobs, m2, m4,
num_consecutive_same_value)

if not is_monotonic_increasing_bounds:
nobs = 0
x = 0.0
xx = 0.0
xxx = 0.0
xxxx = 0.0
mean = 0.0
m2 = 0.0
m3 = 0.0
m4 = 0.0

return output

Expand Down
27 changes: 26 additions & 1 deletion pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1521,16 +1521,41 @@ def test_rolling_skew_kurt_numerical_stability(method):
[3000000, 1, 1, 2, 3, 4, 999],
[np.nan] * 3 + [4.0, -1.289256, -1.2, 3.999946],
),
(
"kurt",
[1e6, -1e6, 1, 2, 3, 4, 5, 6],
[np.nan] * 3 + [1.5, 4.0, -1.2, -1.2, -1.2],
),
],
)
def test_rolling_skew_kurt_large_value_range(method, data, values):
# GH: 37557, 47461
# GH: 37557, 47461, 61416
s = Series(data)
result = getattr(s.rolling(4), method)()
expected = Series(values)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_same_result_with_different_lengths(method):
# GH-54380
len_smaller = 10
len_bigger = 12
window_size = 8

rng = np.random.default_rng(2)
data = rng.normal(loc=0.0, scale=1e3, size=len_bigger)
window_smaller = Series(data[:len_smaller]).rolling(window_size)
window_bigger = Series(data).rolling(window_size)

result_smaller = getattr(window_smaller, method)()
result_bigger = getattr(window_bigger, method)()

result_bigger_trimmed = result_bigger[:len_smaller]

tm.assert_series_equal(result_smaller, result_bigger_trimmed, check_exact=True)


def test_invalid_method():
with pytest.raises(ValueError, match="method must be 'table' or 'single"):
Series(range(1)).rolling(1, method="foo")
Expand Down
Loading