|
6 | 6 | The Local Outlier Factor (LOF) algorithm is an unsupervised anomaly detection
|
7 | 7 | method which computes the local density deviation of a given data point with
|
8 | 8 | respect to its neighbors. It considers as outliers the samples that have a
|
9 |
| -substantially lower density than their neighbors. This example shows how to |
10 |
| -use LOF for outlier detection which is the default use case of this estimator |
11 |
| -in scikit-learn. Note that when LOF is used for outlier detection it has no |
12 |
| -predict, decision_function and score_samples methods. See |
13 |
| -:ref:`User Guide <outlier_detection>`: for details on the difference between |
14 |
| -outlier detection and novelty detection and how to use LOF for novelty |
15 |
| -detection. |
16 |
| -
|
17 |
| -The number of neighbors considered (parameter n_neighbors) is typically |
18 |
| -set 1) greater than the minimum number of samples a cluster has to contain, |
19 |
| -so that other samples can be local outliers relative to this cluster, and 2) |
20 |
| -smaller than the maximum number of close by samples that can potentially be |
21 |
| -local outliers. |
22 |
| -In practice, such information is generally not available, and taking |
23 |
| -n_neighbors=20 appears to work well in general. |
| 9 | +substantially lower density than their neighbors. This example shows how to use |
| 10 | +LOF for outlier detection which is the default use case of this estimator in |
| 11 | +scikit-learn. Note that when LOF is used for outlier detection it has no |
| 12 | +`predict`, `decision_function` and `score_samples` methods. See the :ref:`User |
| 13 | +Guide <outlier_detection>` for details on the difference between outlier |
| 14 | +detection and novelty detection and how to use LOF for novelty detection. |
| 15 | +
|
| 16 | +The number of neighbors considered (parameter `n_neighbors`) is typically set 1) |
| 17 | +greater than the minimum number of samples a cluster has to contain, so that |
| 18 | +other samples can be local outliers relative to this cluster, and 2) smaller |
| 19 | +than the maximum number of close by samples that can potentially be local |
| 20 | +outliers. In practice, such information is generally not available, and taking |
| 21 | +`n_neighbors=20` appears to work well in general. |
24 | 22 |
|
25 | 23 | """
|
26 | 24 |
|
# %%
# Generate data with outliers
# ---------------------------

# %%
import numpy as np

np.random.seed(42)

# Two dense clusters of inliers centred at (2, 2) and (-2, -2).
X_inliers = 0.3 * np.random.randn(100, 2)
X_inliers = np.concatenate([X_inliers + 2, X_inliers - 2])

# Points drawn uniformly over the plotting area act as outliers.
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.concatenate([X_inliers, X_outliers])

# Ground truth: +1 for inliers, -1 for the trailing outlier rows.
n_outliers = len(X_outliers)
ground_truth = np.ones(len(X), dtype=int)
ground_truth[-n_outliers:] = -1
44 | 42 |
|
45 |
| -# fit the model for outlier detection (default) |
# %%
# Fit the model for outlier detection (default)
# ---------------------------------------------
#
# Use `fit_predict` to compute the predicted labels of the training samples
# (when LOF is used for outlier detection, the estimator has no `predict`,
# `decision_function` and `score_samples` methods).

from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)

# fit_predict labels each training sample: +1 for inliers, -1 for outliers.
y_pred = lof.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
# The lower negative_outlier_factor_, the more abnormal the sample.
X_scores = lof.negative_outlier_factor_
|
53 | 57 |
|
54 |
| -plt.title("Local Outlier Factor (LOF)") |
| 58 | +# %% |
| 59 | +# Plot results |
| 60 | +# ------------ |
| 61 | + |
| 62 | +# %% |
| 63 | +import matplotlib.pyplot as plt |
| 64 | +from matplotlib.legend_handler import HandlerPathCollection |
| 65 | + |
| 66 | + |
def update_legend_marker_size(handle, orig, *, size=20):
    """Customize the size of a legend marker.

    Copies the visual properties of the plotted artist onto the legend
    handle, then overrides the marker area so legend entries stay a fixed,
    readable size regardless of the per-point sizes used in the plot.

    Parameters
    ----------
    handle : matplotlib artist
        The artist drawn inside the legend (supplied by the handler).
    orig : matplotlib artist
        The plotted artist the legend entry represents.
    size : int, default=20
        Marker area to use in the legend (keyword-only; the default keeps
        the original behavior for existing callers).
    """
    handle.update_from(orig)
    handle.set_sizes([size])
| 72 | + |
55 | 73 | plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
|
56 | 74 | # plot circles with radius proportional to the outlier scores
|
57 | 75 | radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
|
58 |
| -plt.scatter( |
| 76 | +scatter = plt.scatter( |
59 | 77 | X[:, 0],
|
60 | 78 | X[:, 1],
|
61 | 79 | s=1000 * radius,
|
|
# Fix the view to the data range and report the error count on the x axis.
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.xlabel(f"prediction errors: {n_errors}")
# Route the scatter artist through a custom handler so its legend marker
# is drawn at a fixed size instead of the score-proportional radii.
plt.legend(
    handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
)
plt.title("Local Outlier Factor (LOF)")
plt.show()
|
0 commit comments