Source code for faultdiagnosistoolbox.testselection_rf

"""Test selection using random forest machine learning models."""

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from .diag_util import IsolabilityMatrix, DiagnosesAndConfusionMatrix



[docs]
def RandomForestTestSelection(data, n_estimators=100):
    """Select residuals for fault isolation using Random Forests.

    Implementation of the residual selection approach described in the
    paper

      Frisk, Erik, and Mattias Krysander. "Residual Selection for
      Consistency Based Diagnosis Using Machine Learning Models."
      IFAC Safeprocess, Warszaw, Poland (2018)
      (https://doi.org/10.1016/j.ifacol.2018.09.547)

    The method computes a sorted list of residuals, where the most important
    residuals are first in the list. The method also computes performance
    indices that can be used to determine how many in the sorted list of
    residuals that should be selected.

    Parameters
    ----------
    data : dict
        Dictionary with data and supporting information needed by the algorithm. The dictionary must have the following keys
        ``modes``: an array of size m with names of no-fault and fault modes where the no-fault mode is assumed the first element in the list.
        ``res``: an (N x n) numpy array with N samples of the n residuals.
        ``mode``: an (N x 1) numpy array with the fault mode at each sample where a value of 0 corresponds to fault free operation.
        ``fsm``: a fault signature matrix of size (n x m)
    n_estimators : int
        Number of decision trees in the ensemble model. Defaults to 100

    Returns
    -------
    result : dict
        A dictionary containing the resulting test selection.
        The dictionary contains the keys
        ``sortidx``: A sorted list of indices to residuals with the most important residuals first.
        ``pfa``  : Performance curve for the false alarm probability
        ``pmd``  : Performace curve for missed detection
        ``pfi``  : Performance curve for fault isolation
        ``pmfi`` : Maximal isolation performance per fault mode
        ``residualimportance`` : List of relative residual importance. The list is not sorted.
    C : arraylike
        A confusion matrix when computing consistency based diagnoses using all residuals.
    rf : RandomForestClassifier
        The random forest object
    Crf : arraylike
        The confusion matrix for the random forest classifier.

    Example
    -------
    Let data be thresholded residual data

    >>> ts, C, rf, Crf = RandomForestTestSelection(data, n_estimators=100)

    Then plot performance as a function of selected number of residuals using
    >>> plt.plot(range(1, n), ts['pfa'], 'r', label='False alarm probability')
    >>> plt.plot(range(1, n), ts['pmd'], 'b',
    >>>          label='Missed detection probability')
    >>> plt.plot(range(1, n), ts['pfi'], 'y',
    >>>          label='False isolation probability')

    based on the plots, determine ntest, the number of tests to select, and
    then plot the corresponding fault isolation performance

    >>> _, C_select = DiagnosesAndConfusionMatrix(data, residx=ts['sortidx'][0:ntests])
    >>> PlotConfusionMatrix(C_select)

    that could be compared to

    >>> PlotConfusionMatrix(C)

    """

    nf = len(data["modes"])
    nr = data["res"].shape[1]
    im = IsolabilityMatrix(data["fsm"])

    rf = RandomForestClassifier(n_estimators=n_estimators)
    rf.fit(data["res"], data["mode"])
    sortIdx = np.argsort(rf.feature_importances_)[::-1]
    residualImportance = rf.feature_importances_

    # Compute classifiers confusion matrix (needed?)
    s = np.diag([1 / sum(data["mode"] == mi) for mi in range(nf)])
    Crf = s @ confusion_matrix(data["mode"], rf.predict(data["res"]))

    pfa = np.zeros(nr - 1)
    pmd = np.zeros(nr - 1)
    pfi = np.zeros(nr - 1)
    pmfi = np.zeros((nr - 1, nf))

    # Make sure isolability matrix for NF corresponds to diagnosis
    # statement computed by DiagnosesAndConfusionMatrix
    imk = IsolabilityMatrix(data["fsm"])
    imk[0] = np.zeros(nf)
    imk[0, 0] = 1
    C = None
    for k in range(1, nr):
        dx, C = DiagnosesAndConfusionMatrix(data, residx=sortIdx[0:k])
        pfa[k - 1] = 1 - C[0, 0]
        pmd[k - 1] = np.mean(C[1:, 0])
        pfi[k - 1] = np.mean(np.diag(1 - C[1:, 0]) @ np.abs(C[1:, 1:] - im[1:, 1:]))

        for fi in range(nf):
            pmfi[k - 1, fi] = np.mean(np.all(dx[data["mode"] == fi] == imk[fi, :], axis=1))
    return (
        {
            "sortidx": sortIdx,
            "pfa": pfa,
            "pmd": pmd,
            "pfi": pfi,
            "pmfi": pmfi,
            "residualimportance": residualImportance,
        },
        C,
        rf,
        Crf,
    )