Scikit-learn provides several outlier detection algorithms, including Robust Estimator of Covariance, One-Class SVM, Isolation Forest, and Local Outlier Factor. Here we demonstrate outlier detection using the Local Outlier Factor algorithm; users should consult the scikit-learn documentation for more information.
Here is the sample code:
- Code: Select all
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
# Initialize random number generator for reproducible results
rng = np.random.RandomState(42)
# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]
# Local Outlier Factor in outlier-detection mode (novelty=False, the
# default): fit_predict labels the training data itself.
clf = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
# Mesh grid on which the learned score is evaluated for plotting
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
# Ground-truth labels: +1 inlier, -1 outlier; the outliers are appended
# last to X below, so the last n_outliers entries are marked -1.
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = -1
np.random.seed(42)
# Data generation: two tight Gaussian clusters of inliers ...
X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - 2.0
X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + 2.0
X = np.r_[X1, X2]
# ... plus uniformly scattered outliers
X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
plt.figure(figsize=(9, 7))
# Fit the data and tag outliers (-1 = outlier, +1 = inlier)
y_pred = clf.fit_predict(X)
scores_pred = clf.negative_outlier_factor_
# Score at the contamination percentile: points scoring below this are
# the fraction flagged as outliers.
threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
n_errors = (y_pred != ground_truth).sum()
# Score the mesh grid.  The private clf._decision_function used in older
# scikit-learn releases no longer exists; in outlier-detection mode LOF
# cannot score new points at all.  The public API is score_samples on a
# model fitted with novelty=True, which returns the (negative) LOF on
# the same scale as negative_outlier_factor_, so `threshold` applies.
clf_novelty = LocalOutlierFactor(
    n_neighbors=35, contamination=outliers_fraction, novelty=True
).fit(X)
Z = clf_novelty.score_samples(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Plot the level line at the threshold and shade the inlier region
a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
b = plt.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white', s=20, edgecolor='k')
c = plt.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black', s=20, edgecolor='k')
plt.axis('tight')
plt.xlim([-7, 7])
plt.ylim([-7, 7])
# ContourSet.collections was removed in matplotlib 3.10; use
# legend_elements() to obtain a proxy artist for the contour line.
plt.legend([a.legend_elements()[0][0], b, c],
           ['learned decision function', 'inliers', 'outliers'],
           prop=matplotlib.font_manager.FontProperties(size=10),
           loc='lower right')
plt.show()
Figure 1 shows all the points, both inliers and outliers; the points inside the orange-filled area are reported as inliers by the algorithm.
Users can also download the Docker image for this recipe from Docker Hub and run the recipe using Jupyter Notebook:
https://hub.docker.com/r/podaacdatarecipes/outlier_test/