From 3b25ed4c193ec73551cf88f0a2084cf80933e16f Mon Sep 17 00:00:00 2001 From: Chisholm6192 Date: Tue, 27 Feb 2024 18:51:07 -0500 Subject: [PATCH] CLN: Enforce deprecation of using alias for builtin/NumPy funcs (#57444) * CLN: Enforce deprecation of using alias for builtin/NumPy funcs * GH# and whatsnew * Fixup docs * More tests * Restore docstring * Test fixes * Test fixups * Test fixes * Test fixup * Test fixes - [ ] closes #xxxx (Replace xxxx with the GitHub issue number) - [ ] [Tests added and passed](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#writing-tests) if fixing a bug or adding a new feature - [ ] All [code checks passed](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#pre-commit). - [ ] Added [type annotations](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#type-hints) to new arguments/methods/functions. - [ ] Added an entry in the latest `doc/source/whatsnew/vX.X.X.rst` file if fixing a bug or adding a new feature. --- scripts/handle_outliers.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 scripts/handle_outliers.py diff --git a/scripts/handle_outliers.py b/scripts/handle_outliers.py new file mode 100644 index 0000000000000..7b945a6ddf327 --- /dev/null +++ b/scripts/handle_outliers.py @@ -0,0 +1,30 @@ +import pandas as pd +import numpy as np + + +""" +Detect and handle outliers in a DataFrame. + +Parameters: + data: DataFrame + method: str, default 'z-score'. The method used for outlier detection. Options: 'z-score' or 'IQR' (Interquartile Range). + threshold: float, default 3. The threshold for identifying outliers. Data points beyond this threshold are considered outliers. + +Returns: + DataFrame: DataFrame with outliers handled (replaced or removed). 
def handle_outliers(data, method='z-score', threshold=3, *, iqr_multiplier=1.5):
    """
    Remove the rows of a DataFrame that contain at least one outlier.

    Every column of ``data`` takes part in the computation, so ``data`` is
    expected to contain only numeric columns.

    Parameters:
        data: DataFrame to filter. It is not modified.
        method: str, default 'z-score'. The detection rule, either
            'z-score' or 'IQR' (interquartile range).
        threshold: float, default 3. Used by the 'z-score' method only: a
            value is an outlier when its absolute z-score, computed per
            column from that column's mean and standard deviation, is
            greater than or equal to ``threshold``.
        iqr_multiplier: float, default 1.5. Used by the 'IQR' method only:
            a value is an outlier when it lies below
            ``Q1 - iqr_multiplier * IQR`` or above
            ``Q3 + iqr_multiplier * IQR`` of its column.

    Returns:
        DataFrame: the rows of ``data`` in which no column holds an outlier.

    Raises:
        ValueError: if ``method`` is neither 'z-score' nor 'IQR'.
    """
    if method == 'z-score':
        z_scores = ((data - data.mean()) / data.std()).abs()
        # Flag only values whose z-score really reaches the threshold. A NaN
        # z-score (constant column, single-row frame, missing value) compares
        # False here, so it no longer causes the row to be discarded.
        is_outlier = z_scores >= threshold
    elif method == 'IQR':
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - iqr_multiplier * iqr
        upper_fence = q3 + iqr_multiplier * iqr
        is_outlier = (data < lower_fence) | (data > upper_fence)
    else:
        raise ValueError("Invalid method. Use z-score or IQR")

    # A row is dropped as soon as any one of its columns is flagged.
    return data[~is_outlier.any(axis=1)]