1
+ """
2
+ Locally weighted linear regression, also called local regression, is a type of
3
+ non-parametric linear regression that prioritizes data closest to a given
4
+ prediction point. The algorithm estimates the vector of model coefficients β
5
+ using weighted least squares regression:
6
+
7
+ β = (XᵀWX)⁻¹(XᵀWy),
8
+
9
+ where X is the design matrix, y is the response vector, and W is the diagonal
10
+ weight matrix.
11
+
12
+ This implementation calculates wᵢ, the weight of the ith training sample, using
13
+ the Gaussian weight:
14
+
15
+ wᵢ = exp(-‖xᵢ - x‖²/(2τ²)),
16
+
17
+ where xᵢ is the ith training sample, x is the prediction point, τ is the
18
+ "bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L²
19
+ norm). The bandwidth τ controls how quickly the weight of a training sample
20
+ decreases as its distance from the prediction point increases. One can think of
21
+ the Gaussian weight as a bell curve centered around the prediction point: a
22
+ training sample is weighted lower if it's farther from the center, and τ
23
+ controls the spread of the bell curve.
24
+
25
+ Other types of locally weighted regression such as locally estimated scatterplot
26
+ smoothing (LOESS) typically use different weight functions.
27
+
28
+ References:
29
+ - https://en.wikipedia.org/wiki/Local_regression
30
+ - https://en.wikipedia.org/wiki/Weighted_least_squares
31
+ - https://cs229.stanford.edu/notes2022fall/main_notes.pdf
32
+ """
33
+
1
34
import matplotlib .pyplot as plt
2
35
import numpy as np
3
36
4
37
5
38
def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray:
    """
    Calculate the weight of every point in the training data around a given
    prediction point.

    Each training sample receives the Gaussian weight
    exp(-‖xᵢ - point‖² / (2τ²)), which is placed on the diagonal of the
    returned matrix; every off-diagonal entry is zero.

    Args:
        point: x-value at which the prediction is being made
        x_train: ndarray of x-values for training, one sample per row
        tau: bandwidth value, controls how quickly the weight of training values
            decreases as the distance from the prediction point increases

    Returns:
        n x n weight matrix around the prediction point, where n is the size of
        the training set

    >>> weight_matrix(
    ...     np.array([1., 1.]),
    ...     np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
    ...     0.6
    ... )
    array([[1.43807972e-207, 0.00000000e+000, 0.00000000e+000],
           [0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
           [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
    """
    # Work on plain ndarrays so array-likes (lists, np.matrix) broadcast the
    # same way as the documented ndarray inputs
    diffs = np.asarray(x_train) - np.asarray(point)

    # Squared Euclidean distance of every sample from the prediction point,
    # computed for all rows at once rather than one Python iteration per row
    squared_distances = np.sum(diffs * diffs, axis=1)

    # Gaussian kernel on the diagonal, zeros elsewhere
    return np.diag(np.exp(squared_distances / (-2.0 * tau**2)))
26
68
27
69
28
70
def local_weight (
29
71
point : np .ndarray , x_train : np .ndarray , y_train : np .ndarray , tau : float
30
72
) -> np .ndarray :
31
73
"""
32
- Calculate the local weights using the weight_matrix function on training data.
33
- Return the weighted matrix.
74
+ Calculate the local weights at a given prediction point using the weight
75
+ matrix for that point.
76
+
77
+ Args:
78
+ point: x-value at which the prediction is being made
79
+ x_train: ndarray of x-values for training
80
+ y_train: ndarray of y-values for training
81
+ tau: bandwidth value, controls how quickly the weight of training values
82
+ decreases as the distance from the prediction point increases
83
+ Returns:
84
+ ndarray of local weights
34
85
>>> local_weight(
35
86
... np.array([1., 1.]),
36
87
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
@@ -52,17 +103,24 @@ def local_weight_regression(
52
103
x_train : np .ndarray , y_train : np .ndarray , tau : float
53
104
) -> np .ndarray :
54
105
"""
55
- Calculate predictions for each data point on axis
106
+ Calculate predictions for each point in the training data.
107
+
108
+ Args:
109
+ x_train: ndarray of x-values for training
110
+ y_train: ndarray of y-values for training
111
+ tau: bandwidth value, controls how quickly the weight of training values
112
+ decreases as the distance from the prediction point increases
113
+
114
+ Returns:
115
+ ndarray of predictions
56
116
>>> local_weight_regression(
57
117
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
58
118
... np.array([[1.01, 1.66, 3.5]]),
59
119
... 0.6
60
120
... )
61
121
array([1.07173261, 1.65970737, 3.50160179])
62
122
"""
63
- m , _ = np .shape (x_train )
64
- y_pred = np .zeros (m )
65
-
123
+ y_pred = np .zeros (len (x_train )) # Initialize array of predictions
66
124
for i , item in enumerate (x_train ):
67
125
y_pred [i ] = item @ local_weight (item , x_train , y_train , tau )
68
126
@@ -74,14 +132,15 @@ def load_data(
74
132
) -> tuple [np .ndarray , np .ndarray , np .ndarray ]:
75
133
"""
76
134
Load data from seaborn and split it into x and y points
135
+ >>> pass # No doctests, function is for demo purposes only
77
136
"""
78
137
import seaborn as sns
79
138
80
139
data = sns .load_dataset (dataset_name )
81
- x_data = np .array (data [x_name ]) # total_bill
82
- y_data = np .array (data [y_name ]) # tip
140
+ x_data = np .array (data [x_name ])
141
+ y_data = np .array (data [y_name ])
83
142
84
- one = np .ones (np . shape (y_data )[ 0 ], dtype = int )
143
+ one = np .ones (len (y_data ))
85
144
86
145
# pairing elements of one and x_data
87
146
x_train = np .column_stack ((one , x_data ))
@@ -99,6 +158,7 @@ def plot_preds(
99
158
) -> plt .plot :
100
159
"""
101
160
Plot predictions and display the graph
161
+ >>> pass # No doctests, function is for demo purposes only
102
162
"""
103
163
x_train_sorted = np .sort (x_train , axis = 0 )
104
164
plt .scatter (x_data , y_data , color = "blue" )
@@ -119,6 +179,7 @@ def plot_preds(
119
179
120
180
doctest .testmod ()
121
181
182
+ # Demo with a dataset from the seaborn module
122
183
training_data_x , total_bill , tip = load_data ("tips" , "total_bill" , "tip" )
123
184
predictions = local_weight_regression (training_data_x , tip , 5 )
124
185
plot_preds (training_data_x , predictions , total_bill , tip , "total_bill" , "tip" )
0 commit comments