Skip to content

Commit f6d0a37

Browse files
committed
Replaced sort with np.partition, np.clip percentages
1 parent 01e867c commit f6d0a37

File tree

8 files changed

+50
-54
lines changed

8 files changed

+50
-54
lines changed

stumpy/aampdist.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def _aampdist_vect(
4141
4242
custom_func : object, default None
4343
A custom user defined function for selecting the desired value from the
44-
sorted `P_ABBA` array. This function may need to leverage `functools.partial`
44+
unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
4545
and should take `P_ABBA` as its only input parameter and return a single
4646
`MPdist` value. The `percentage` and `k` parameters are ignored when
4747
`custom_func` is not None.
@@ -64,10 +64,10 @@ def aampdist(T_A, T_B, m, percentage=0.05, k=None):
6464
6565
The MPdist distance measure considers two time series to be similar if they share
6666
many subsequences, regardless of the order of matching subsequences. MPdist
67-
concatenates and sorts the output of an AB-join and a BA-join and returns the value
68-
of the `k`th smallest number as the reported distance. Note that MPdist is a
69-
measure and not a metric. Therefore, it does not obey the triangular inequality but
70-
the method is highly scalable.
67+
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
68+
value as the reported distance. Note that MPdist is a measure and not a metric.
69+
Therefore, it does not obey the triangular inequality but the method is highly
70+
scalable.
7171
7272
Parameters
7373
----------
@@ -111,10 +111,10 @@ def aampdisted(dask_client, T_A, T_B, m, percentage=0.05, k=None):
111111
112112
The MPdist distance measure considers two time series to be similar if they share
113113
many subsequences, regardless of the order of matching subsequences. MPdist
114-
concatenates and sorts the output of an AB-join and a BA-join and returns the value
115-
of the `k`th smallest number as the reported distance. Note that MPdist is a
116-
measure and not a metric. Therefore, it does not obey the triangular inequality but
117-
the method is highly scalable.
114+
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
115+
value as the reported distance. Note that MPdist is a measure and not a metric.
116+
Therefore, it does not obey the triangular inequality but the method is highly
117+
scalable.
118118
119119
Parameters
120120
----------

stumpy/aampdist_snippets.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,7 @@ def _get_all_aampdist_profiles(
9191
if s is not None:
9292
s = min(int(s), m)
9393
else:
94-
percentage = min(percentage, 1.0)
95-
percentage = max(percentage, 0.0)
94+
percentage = np.clip(percentage, 0.0, 1.0)
9695
s = min(math.ceil(percentage * m), m)
9796

9897
# Iterate over non-overlapping subsequences, see Definition 3

stumpy/mpdist.py

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ def _compute_P_ABBA(
2020
2121
The MPdist distance measure considers two time series to be similar if they share
2222
many subsequences, regardless of the order of matching subsequences. MPdist
23-
concatenates and sorts the output of an AB-join and a BA-join and returns the value
24-
of the `k`th smallest number as the reported distance. Note that MPdist is a
25-
measure and not a metric. Therefore, it does not obey the triangular inequality but
26-
the method is highly scalable.
23+
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
24+
value as the reported distance. Note that MPdist is a measure and not a metric.
25+
Therefore, it does not obey the triangular inequality but the method is highly
26+
scalable.
2727
2828
Parameters
2929
----------
@@ -81,15 +81,15 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
8181
8282
The MPdist distance measure considers two time series to be similar if they share
8383
many subsequences, regardless of the order of matching subsequences. MPdist
84-
concatenates and sorts the output of an AB-join and a BA-join and returns the value
85-
of the `k`th smallest number as the reported distance. Note that MPdist is a
86-
measure and not a metric. Therefore, it does not obey the triangular inequality but
87-
the method is highly scalable.
84+
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
85+
value as the reported distance. Note that MPdist is a measure and not a metric.
86+
Therefore, it does not obey the triangular inequality but the method is highly
87+
scalable.
8888
8989
Parameters
9090
----------
9191
P_ABBA : ndarray
92-
A pre-sorted array resulting from the concatenation of the outputs from an
92+
An unsorted array resulting from the concatenation of the outputs from an
9393
AB-join and BA-join for two time series, `T_A` and `T_B`
9494
9595
k : int
@@ -98,7 +98,7 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
9898
9999
custom_func : object, default None
100100
A custom user defined function for selecting the desired value from the
101-
sorted `P_ABBA` array. This function may need to leverage `functools.partial`
101+
unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
102102
and should take `P_ABBA` as its only input parameter and return a single
103103
`MPdist` value. The `percentage` and `k` parameters are ignored when
104104
`custom_func` is not None.
@@ -112,10 +112,12 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
112112
if custom_func is not None:
113113
MPdist = custom_func(P_ABBA)
114114
else:
115-
MPdist = P_ABBA[k]
115+
partition = np.partition(P_ABBA, k)
116+
MPdist = partition[k]
116117
if ~np.isfinite(MPdist):
117-
k = max(0, np.count_nonzero(np.isfinite(P_ABBA[:k])) - 1)
118-
MPdist = P_ABBA[k]
118+
partition[:k].sort()
119+
k = max(0, np.count_nonzero(np.isfinite(partition[:k])) - 1)
120+
MPdist = partition[k]
119121

120122
return MPdist
121123

@@ -137,10 +139,10 @@ def _mpdist(
137139
138140
The MPdist distance measure considers two time series to be similar if they share
139141
many subsequences, regardless of the order of matching subsequences. MPdist
140-
concatenates and sorts the output of an AB-join and a BA-join and returns the value
141-
of the `k`th smallest number as the reported distance. Note that MPdist is a
142-
measure and not a metric. Therefore, it does not obey the triangular inequality but
143-
the method is highly scalable.
142+
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
143+
value as the reported distance. Note that MPdist is a measure and not a metric.
144+
Therefore, it does not obey the triangular inequality but the method is highly
145+
scalable.
144146
145147
Parameters
146148
----------
@@ -180,7 +182,7 @@ def _mpdist(
180182
181183
custom_func : object, default None
182184
A custom user defined function for selecting the desired value from the
183-
sorted `P_ABBA` array. This function may need to leverage `functools.partial`
185+
unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
184186
and should take `P_ABBA` as its only input parameter and return a single
185187
`MPdist` value. The `percentage` and `k` parameters are ignored when
186188
`custom_func` is not None.
@@ -202,13 +204,11 @@ def _mpdist(
202204
P_ABBA = np.empty(n_A - m + 1 + n_B - m + 1, dtype=np.float64)
203205

204206
_compute_P_ABBA(T_A, T_B, m, P_ABBA, dask_client, device_id, mp_func)
205-
P_ABBA.sort()
206207

207208
if k is not None:
208209
k = min(int(k), P_ABBA.shape[0] - 1)
209210
else:
210-
percentage = min(percentage, 1.0)
211-
percentage = max(percentage, 0.0)
211+
percentage = np.clip(percentage, 0.0, 1.0)
212212
k = min(math.ceil(percentage * (n_A + n_B)), n_A - m + 1 + n_B - m + 1 - 1)
213213

214214
MPdist = _select_P_ABBA_value(P_ABBA, k, custom_func)
@@ -252,7 +252,7 @@ def _mpdist_vect(
252252
253253
custom_func : object, default None
254254
A custom user defined function for selecting the desired value from the
255-
sorted `P_ABBA` array. This function may need to leverage `functools.partial`
255+
unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
256256
and should take `P_ABBA` as its only input parameter and return a single
257257
`MPdist` value. The `percentage` and `k` parameters are ignored when
258258
`custom_func` is not None.
@@ -267,8 +267,7 @@ def _mpdist_vect(
267267
P_ABBA = np.empty(2 * j)
268268

269269
if k is None:
270-
percentage = min(percentage, 1.0)
271-
percentage = max(percentage, 0.0)
270+
percentage = np.clip(percentage, 0.0, 1.0)
272271
k = min(math.ceil(percentage * (2 * Q.shape[0])), 2 * j - 1)
273272

274273
k = min(int(k), P_ABBA.shape[0] - 1)
@@ -281,7 +280,6 @@ def _mpdist_vect(
281280
for i in range(MPdist_vect.shape[0]):
282281
P_ABBA[:j] = rolling_row_min[:, i]
283282
P_ABBA[j:] = col_min[i : i + j]
284-
P_ABBA.sort()
285283
MPdist_vect[i] = _select_P_ABBA_value(P_ABBA, k, custom_func)
286284

287285
return MPdist_vect
@@ -295,10 +293,10 @@ def mpdist(T_A, T_B, m, percentage=0.05, k=None, normalize=True):
295293
296294
The MPdist distance measure considers two time series to be similar if they share
297295
many subsequences, regardless of the order of matching subsequences. MPdist
298-
concatenates and sorts the output of an AB-join and a BA-join and returns the value
299-
of the `k`th smallest number as the reported distance. Note that MPdist is a
300-
measure and not a metric. Therefore, it does not obey the triangular inequality but
301-
the method is highly scalable.
296+
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
297+
value as the reported distance. Note that MPdist is a measure and not a metric.
298+
Therefore, it does not obey the triangular inequality but the method is highly
299+
scalable.
302300
303301
Parameters
304302
----------
@@ -349,10 +347,10 @@ def mpdisted(dask_client, T_A, T_B, m, percentage=0.05, k=None, normalize=True):
349347
350348
The MPdist distance measure considers two time series to be similar if they share
351349
many subsequences, regardless of the order of matching subsequences. MPdist
352-
concatenates and sorts the output of an AB-join and a BA-join and returns the value
353-
of the `k`th smallest number as the reported distance. Note that MPdist is a
354-
measure and not a metric. Therefore, it does not obey the triangular inequality but
355-
the method is highly scalable.
350+
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
351+
value as the reported distance. Note that MPdist is a measure and not a metric.
352+
Therefore, it does not obey the triangular inequality but the method is highly
353+
scalable.
356354
357355
Parameters
358356
----------

stumpy/scraamp.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -437,8 +437,7 @@ def __init__(
437437
)
438438

439439
self._n_threads = config.NUMBA_NUM_THREADS
440-
self._percentage = min(percentage, 1.0)
441-
self._percentage = max(percentage, 0.0)
440+
self._percentage = np.clip(percentage, 0.0, 1.0)
442441
self._n_chunks = int(np.ceil(1.0 / percentage))
443442
self._ndist_counts = core._count_diagonal_ndist(
444443
self._diags, self._m, self._n_A, self._n_B

stumpy/scrump.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -459,8 +459,7 @@ def __init__(
459459
)
460460

461461
self._n_threads = config.NUMBA_NUM_THREADS
462-
self._percentage = min(percentage, 1.0)
463-
self._percentage = max(percentage, 0.0)
462+
self._percentage = np.clip(percentage, 0.0, 1.0)
464463
self._n_chunks = int(np.ceil(1.0 / percentage))
465464
self._ndist_counts = core._count_diagonal_ndist(
466465
self._diags, self._m, self._n_A, self._n_B

stumpy/snippets.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ def _get_all_profiles(
9393
if s is not None:
9494
s = min(int(s), m)
9595
else:
96-
percentage = min(percentage, 1.0)
97-
percentage = max(percentage, 0.0)
96+
percentage = np.clip(percentage, 0.0, 1.0)
9897
s = min(math.ceil(percentage * m), m)
9998

10099
# Iterate over non-overlapping subsequences, see Definition 3

stumpy/stimp.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,7 @@ def __init__(
110110
self._bfs_indices = _bfs_indices(M.shape[0])
111111
self._M = M[self._bfs_indices]
112112
self._idx = 0
113-
percentage = min(1.0, percentage)
114-
percentage = max(0.0, percentage)
113+
percentage = np.clip(percentage, 0.0, 1.0)
115114
self._percentage = percentage
116115
self._pre_scrump = pre_scrump
117116
self._normalize = normalize

tests/test_mpdist.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def some_func(P_ABBA, m, percentage, n_A, n_B):
1818
percentage = min(percentage, 1.0)
1919
percentage = max(percentage, 0.0)
2020
k = min(math.ceil(percentage * (n_A + n_B)), n_A - m + 1 + n_B - m + 1 - 1)
21+
P_ABBA.sort()
2122
MPdist = P_ABBA[k]
2223
if ~np.isfinite(MPdist):
2324
k = np.count_nonzero(np.isfinite(P_ABBA[:k])) - 1
@@ -124,10 +125,12 @@ def test_mpdist_k(T_A, T_B, k):
124125
def test_select_P_ABBA_val_inf():
125126
P_ABBA = np.random.rand(10)
126127
k = 2
127-
P_ABBA[k] = np.inf
128+
P_ABBA[k:] = np.inf
129+
p_abba = P_ABBA.copy()
128130

129-
ref = P_ABBA[k - 1]
130131
comp = _select_P_ABBA_value(P_ABBA, k=k)
132+
p_abba.sort()
133+
ref = p_abba[k - 1]
131134
npt.assert_almost_equal(ref, comp)
132135

133136

0 commit comments

Comments
 (0)