Replaced sort with np.partition, np.clip percentages

seanlaw · seanlaw · commit f6d0a37d228f · 2021-05-16T20:11:17.000-04:00
diff --git a/stumpy/aampdist.py b/stumpy/aampdist.py
@@ -41,7 +41,7 @@ def _aampdist_vect(
 
     custom_func : object, default None
         A custom user defined function for selecting the desired value from the
-        sorted `P_ABBA` array. This function may need to leverage `functools.partial`
+        unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
         and should take `P_ABBA` as its only input parameter and return a single
         `MPdist` value. The `percentage` and `k` parameters are ignored when
         `custom_func` is not None.
@@ -64,10 +64,10 @@ def aampdist(T_A, T_B, m, percentage=0.05, k=None):
 
     The MPdist distance measure considers two time series to be similar if they share
     many subsequences, regardless of the order of matching subsequences. MPdist
-    concatenates and sorts the output of an AB-join and a BA-join and returns the value
-    of the `k`th smallest number as the reported distance. Note that MPdist is a
-    measure and not a metric. Therefore, it does not obey the triangular inequality but
-    the method is highly scalable.
+    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
+    value as the reported distance. Note that MPdist is a measure and not a metric.
+    Therefore, it does not obey the triangular inequality but the method is highly
+    scalable.
 
     Parameters
     ----------
@@ -111,10 +111,10 @@ def aampdisted(dask_client, T_A, T_B, m, percentage=0.05, k=None):
 
     The MPdist distance measure considers two time series to be similar if they share
     many subsequences, regardless of the order of matching subsequences. MPdist
-    concatenates and sorts the output of an AB-join and a BA-join and returns the value
-    of the `k`th smallest number as the reported distance. Note that MPdist is a
-    measure and not a metric. Therefore, it does not obey the triangular inequality but
-    the method is highly scalable.
+    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
+    value as the reported distance. Note that MPdist is a measure and not a metric.
+    Therefore, it does not obey the triangular inequality but the method is highly
+    scalable.
 
     Parameters
     ----------
diff --git a/stumpy/aampdist_snippets.py b/stumpy/aampdist_snippets.py
@@ -91,8 +91,7 @@ def _get_all_aampdist_profiles(
     if s is not None:
         s = min(int(s), m)
     else:
-        percentage = min(percentage, 1.0)
-        percentage = max(percentage, 0.0)
+        percentage = np.clip(percentage, 0.0, 1.0)
         s = min(math.ceil(percentage * m), m)
 
     # Iterate over non-overlapping subsequences, see Definition 3
diff --git a/stumpy/mpdist.py b/stumpy/mpdist.py
@@ -20,10 +20,10 @@ def _compute_P_ABBA(
 
     The MPdist distance measure considers two time series to be similar if they share
     many subsequences, regardless of the order of matching subsequences. MPdist
-    concatenates and sorts the output of an AB-join and a BA-join and returns the value
-    of the `k`th smallest number as the reported distance. Note that MPdist is a
-    measure and not a metric. Therefore, it does not obey the triangular inequality but
-    the method is highly scalable.
+    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
+    value as the reported distance. Note that MPdist is a measure and not a metric.
+    Therefore, it does not obey the triangular inequality but the method is highly
+    scalable.
 
     Parameters
     ----------
@@ -81,15 +81,15 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
 
     The MPdist distance measure considers two time series to be similar if they share
     many subsequences, regardless of the order of matching subsequences. MPdist
-    concatenates and sorts the output of an AB-join and a BA-join and returns the value
-    of the `k`th smallest number as the reported distance. Note that MPdist is a
-    measure and not a metric. Therefore, it does not obey the triangular inequality but
-    the method is highly scalable.
+    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
+    value as the reported distance. Note that MPdist is a measure and not a metric.
+    Therefore, it does not obey the triangular inequality but the method is highly
+    scalable.
 
     Parameters
     ----------
     P_ABBA : ndarray
-        A pre-sorted array resulting from the concatenation of the outputs from an
+        An unsorted array resulting from the concatenation of the outputs from an
         AB-join and BA-join for two time series, `T_A` and `T_B`
 
     k : int
@@ -98,7 +98,7 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
 
     custom_func : object, default None
         A custom user defined function for selecting the desired value from the
-        sorted `P_ABBA` array. This function may need to leverage `functools.partial`
+        unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
         and should take `P_ABBA` as its only input parameter and return a single
         `MPdist` value. The `percentage` and `k` parameters are ignored when
         `custom_func` is not None.
@@ -112,10 +112,12 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
     if custom_func is not None:
         MPdist = custom_func(P_ABBA)
     else:
-        MPdist = P_ABBA[k]
+        partition = np.partition(P_ABBA, k)
+        MPdist = partition[k]
         if ~np.isfinite(MPdist):
-            k = max(0, np.count_nonzero(np.isfinite(P_ABBA[:k])) - 1)
-            MPdist = P_ABBA[k]
+            partition[:k].sort()
+            k = max(0, np.count_nonzero(np.isfinite(partition[:k])) - 1)
+            MPdist = partition[k]
 
     return MPdist
 
@@ -137,10 +139,10 @@ def _mpdist(
 
     The MPdist distance measure considers two time series to be similar if they share
     many subsequences, regardless of the order of matching subsequences. MPdist
-    concatenates and sorts the output of an AB-join and a BA-join and returns the value
-    of the `k`th smallest number as the reported distance. Note that MPdist is a
-    measure and not a metric. Therefore, it does not obey the triangular inequality but
-    the method is highly scalable.
+    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
+    value as the reported distance. Note that MPdist is a measure and not a metric.
+    Therefore, it does not obey the triangular inequality but the method is highly
+    scalable.
 
     Parameters
     ----------
@@ -180,7 +182,7 @@ def _mpdist(
 
     custom_func : object, default None
         A custom user defined function for selecting the desired value from the
-        sorted `P_ABBA` array. This function may need to leverage `functools.partial`
+        unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
         and should take `P_ABBA` as its only input parameter and return a single
         `MPdist` value. The `percentage` and `k` parameters are ignored when
         `custom_func` is not None.
@@ -202,13 +204,11 @@ def _mpdist(
     P_ABBA = np.empty(n_A - m + 1 + n_B - m + 1, dtype=np.float64)
 
     _compute_P_ABBA(T_A, T_B, m, P_ABBA, dask_client, device_id, mp_func)
-    P_ABBA.sort()
 
     if k is not None:
         k = min(int(k), P_ABBA.shape[0] - 1)
     else:
-        percentage = min(percentage, 1.0)
-        percentage = max(percentage, 0.0)
+        percentage = np.clip(percentage, 0.0, 1.0)
         k = min(math.ceil(percentage * (n_A + n_B)), n_A - m + 1 + n_B - m + 1 - 1)
 
     MPdist = _select_P_ABBA_value(P_ABBA, k, custom_func)
@@ -252,7 +252,7 @@ def _mpdist_vect(
 
     custom_func : object, default None
         A custom user defined function for selecting the desired value from the
-        sorted `P_ABBA` array. This function may need to leverage `functools.partial`
+        unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
         and should take `P_ABBA` as its only input parameter and return a single
         `MPdist` value. The `percentage` and `k` parameters are ignored when
         `custom_func` is not None.
@@ -267,8 +267,7 @@ def _mpdist_vect(
     P_ABBA = np.empty(2 * j)
 
     if k is None:
-        percentage = min(percentage, 1.0)
-        percentage = max(percentage, 0.0)
+        percentage = np.clip(percentage, 0.0, 1.0)
         k = min(math.ceil(percentage * (2 * Q.shape[0])), 2 * j - 1)
 
     k = min(int(k), P_ABBA.shape[0] - 1)
@@ -281,7 +280,6 @@ def _mpdist_vect(
     for i in range(MPdist_vect.shape[0]):
         P_ABBA[:j] = rolling_row_min[:, i]
         P_ABBA[j:] = col_min[i : i + j]
-        P_ABBA.sort()
         MPdist_vect[i] = _select_P_ABBA_value(P_ABBA, k, custom_func)
 
     return MPdist_vect
@@ -295,10 +293,10 @@ def mpdist(T_A, T_B, m, percentage=0.05, k=None, normalize=True):
 
     The MPdist distance measure considers two time series to be similar if they share
     many subsequences, regardless of the order of matching subsequences. MPdist
-    concatenates and sorts the output of an AB-join and a BA-join and returns the value
-    of the `k`th smallest number as the reported distance. Note that MPdist is a
-    measure and not a metric. Therefore, it does not obey the triangular inequality but
-    the method is highly scalable.
+    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
+    value as the reported distance. Note that MPdist is a measure and not a metric.
+    Therefore, it does not obey the triangular inequality but the method is highly
+    scalable.
 
     Parameters
     ----------
@@ -349,10 +347,10 @@ def mpdisted(dask_client, T_A, T_B, m, percentage=0.05, k=None, normalize=True):
 
     The MPdist distance measure considers two time series to be similar if they share
     many subsequences, regardless of the order of matching subsequences. MPdist
-    concatenates and sorts the output of an AB-join and a BA-join and returns the value
-    of the `k`th smallest number as the reported distance. Note that MPdist is a
-    measure and not a metric. Therefore, it does not obey the triangular inequality but
-    the method is highly scalable.
+    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
+    value as the reported distance. Note that MPdist is a measure and not a metric.
+    Therefore, it does not obey the triangular inequality but the method is highly
+    scalable.
 
     Parameters
     ----------
diff --git a/stumpy/scraamp.py b/stumpy/scraamp.py
@@ -437,8 +437,7 @@ def __init__(
             )
 
         self._n_threads = config.NUMBA_NUM_THREADS
-        self._percentage = min(percentage, 1.0)
-        self._percentage = max(percentage, 0.0)
+        self._percentage = np.clip(percentage, 0.0, 1.0)
         self._n_chunks = int(np.ceil(1.0 / percentage))
         self._ndist_counts = core._count_diagonal_ndist(
             self._diags, self._m, self._n_A, self._n_B
diff --git a/stumpy/scrump.py b/stumpy/scrump.py
@@ -459,8 +459,7 @@ def __init__(
             )
 
         self._n_threads = config.NUMBA_NUM_THREADS
-        self._percentage = min(percentage, 1.0)
-        self._percentage = max(percentage, 0.0)
+        self._percentage = np.clip(percentage, 0.0, 1.0)
         self._n_chunks = int(np.ceil(1.0 / percentage))
         self._ndist_counts = core._count_diagonal_ndist(
             self._diags, self._m, self._n_A, self._n_B
diff --git a/stumpy/snippets.py b/stumpy/snippets.py
@@ -93,8 +93,7 @@ def _get_all_profiles(
     if s is not None:
         s = min(int(s), m)
     else:
-        percentage = min(percentage, 1.0)
-        percentage = max(percentage, 0.0)
+        percentage = np.clip(percentage, 0.0, 1.0)
         s = min(math.ceil(percentage * m), m)
 
     # Iterate over non-overlapping subsequences, see Definition 3
diff --git a/stumpy/stimp.py b/stumpy/stimp.py
@@ -110,8 +110,7 @@ def __init__(
         self._bfs_indices = _bfs_indices(M.shape[0])
         self._M = M[self._bfs_indices]
         self._idx = 0
-        percentage = min(1.0, percentage)
-        percentage = max(0.0, percentage)
+        percentage = np.clip(percentage, 0.0, 1.0)
         self._percentage = percentage
         self._pre_scrump = pre_scrump
         self._normalize = normalize
diff --git a/tests/test_mpdist.py b/tests/test_mpdist.py
@@ -18,6 +18,7 @@ def some_func(P_ABBA, m, percentage, n_A, n_B):
     percentage = min(percentage, 1.0)
     percentage = max(percentage, 0.0)
     k = min(math.ceil(percentage * (n_A + n_B)), n_A - m + 1 + n_B - m + 1 - 1)
+    P_ABBA.sort()
     MPdist = P_ABBA[k]
     if ~np.isfinite(MPdist):
         k = np.count_nonzero(np.isfinite(P_ABBA[:k])) - 1
@@ -124,10 +125,12 @@ def test_mpdist_k(T_A, T_B, k):
 def test_select_P_ABBA_val_inf():
     P_ABBA = np.random.rand(10)
     k = 2
-    P_ABBA[k] = np.inf
+    P_ABBA[k:] = np.inf
+    p_abba = P_ABBA.copy()
 
-    ref = P_ABBA[k - 1]
     comp = _select_P_ABBA_value(P_ABBA, k=k)
+    p_abba.sort()
+    ref = p_abba[k - 1]
     npt.assert_almost_equal(ref, comp)
 
 

Original file line number	Diff line number	Diff line change
`@@ -437,8 +437,7 @@ def __init__(`
`437`	`437`	`)`
`438`	`438`
`439`	`439`	`self._n_threads = config.NUMBA_NUM_THREADS`
`440`		`- self._percentage = min(percentage, 1.0)`
`441`		`- self._percentage = max(percentage, 0.0)`
	`440`	`+ self._percentage = np.clip(percentage, 0.0, 1.0)`
`442`	`441`	`self._n_chunks = int(np.ceil(1.0 / percentage))`
`443`	`442`	`self._ndist_counts = core._count_diagonal_ndist(`
`444`	`443`	`self._diags, self._m, self._n_A, self._n_B`
Original file line number	Diff line number	Diff line change
`@@ -459,8 +459,7 @@ def __init__(`
`459`	`459`	`)`
`460`	`460`
`461`	`461`	`self._n_threads = config.NUMBA_NUM_THREADS`
`462`		`- self._percentage = min(percentage, 1.0)`
`463`		`- self._percentage = max(percentage, 0.0)`
	`462`	`+ self._percentage = np.clip(percentage, 0.0, 1.0)`
`464`	`463`	`self._n_chunks = int(np.ceil(1.0 / percentage))`
`465`	`464`	`self._ndist_counts = core._count_diagonal_ndist(`
`466`	`465`	`self._diags, self._m, self._n_A, self._n_B`