# Source code for glmdisc

# -*- coding: utf-8 -*-
"""This module is dedicated to preprocessing tasks for logistic regression and
post-learning graphical tools.

.. autosummary::
    :toctree:

    Glmdisc
    Glmdisc._check_is_fitted
    Glmdisc.best_formula
    Glmdisc.discrete_data
    Glmdisc.discretize
    Glmdisc.discretize_dummy
    Glmdisc.fit
    Glmdisc.plot
    Glmdisc.predict
    Glmdisc.generate_data
    NotFittedError
"""
import numpy as np
import sklearn as sk
from loguru import logger

# Version string of the glmdisc package.
__version__ = "0.1.2"


class NotFittedError(sk.exceptions.NotFittedError):
    """Exception raised when an estimator is used before it has been fitted.

    It subclasses scikit-learn's ``NotFittedError``, which itself derives
    from both ``ValueError`` and ``AttributeError``; handlers written for any
    of these exception types therefore keep working (easier exception
    handling and backward compatibility).
    """
def _vectorized_multinouilli(prob_matrix, items):
    """
    A vectorized version of multinouilli sampling.

    Each row of ``prob_matrix`` is turned into cumulative probabilities, one
    uniform number is drawn per row, and the selected level is the number of
    cumulative probabilities that are strictly below that draw.

    .. todo:: check that the number of columns of prob_matrix is the same
        as the number of elements in items

    :param prob_matrix: A probability matrix of size n (number of training
        examples) * m[j] (the factor levels to sample from).
    :type prob_matrix: numpy.array
    :param items: The factor levels to sample from. A plain list or any
        array-like is accepted; it is converted with :func:`numpy.asarray`.
    :returns: The drawn factor levels for each observation.
    :rtype: numpy.array
    """
    # A plain Python list cannot be indexed by an integer array, so make sure
    # we index a numpy array (a no-op when items already is one).
    levels = np.asarray(items)

    cumulative = prob_matrix.cumsum(axis=1)
    draws = np.random.rand(prob_matrix.shape[0]).reshape((-1, 1))
    chosen = (cumulative < draws).sum(axis=1)

    # When a row sums to slightly less than 1 (floating-point rounding), the
    # draw can exceed every cumulative value and the count then points one
    # past the last level; fall back to the last level instead of failing.
    chosen = np.minimum(chosen, levels.shape[0] - 1)
    return levels[chosen]
class Glmdisc:
    """
    This class implements a supervised multivariate discretization method,
    factor levels grouping and interaction discovery for logistic regression.

    .. attribute:: algorithm

        Algorithm to use: "SEM" or "NN".

        :type: str

    .. attribute:: test

        Boolean (T/F) specifying if a test set is required.
        If True, the provided data is split to provide 20%
        of observations in a test set and the reported
        performance is the Gini index on test set.

        :type: bool

    .. attribute:: validation

        Boolean (T/F) specifying if a validation set is
        required. If True, the provided data is split to
        provide 20% of observations in a validation set
        and the reported performance is the Gini index on
        the validation set (when ``test=False``). The quality
        of the discretization at each step is evaluated
        using the Gini index on the validation set, so
        criterion must be set to "gini".

        :type: bool

    .. attribute:: criterion

        The criterion to be used to assess the goodness-of-fit
        of the discretization: "bic" or "aic" if no validation
        set, else "gini".

        :type: str

    .. attribute:: iter

        Number of MCMC steps to perform. The more the better,
        but it may be more intelligent to use several MCMCs.
        Computation time can increase dramatically.
        Not set by ``__init__`` (presumably set when fitting).

        :type: int

    .. attribute:: m_start

        Number of initial discretization intervals for all
        variables. If :code:`m_start` is bigger than the number of
        factor levels for a given variable in predictors_qual,
        m_start is set (for this variable only) to this
        variable's number of factor levels.

        :type: int

    .. attribute:: criterion_iter

        The value of the criterion wished to be optimized
        over the iterations.

        :type: list

    .. attribute:: best_link

        The best link function between the original
        features and their quantized counterparts that
        allows to quantize the data after learning.

        :type: list

    .. attribute:: best_reglog

        The best logistic regression on quantized data
        found with best_link. ``None`` until one is found.

        :type: sklearn.linear_model.LogisticRegression

    .. attribute:: affectations

        The label encoder of each original feature.

        :type: list

    .. attribute:: best_encoder_emap

        The label encoder of each of the best_link.
        ``None`` after ``__init__``.

        :type: list

    .. attribute:: performance

        The best 'criterion' obtained. Initialised to ``-numpy.inf``.

    .. attribute:: train_rows, validation_rows, test_rows

        The line rows corresponding to the splits. Empty arrays after
        ``__init__``.

        :type: numpy.array
    """

    def __init__(self, algorithm="SEM", test=True, validation=True, criterion="bic", m_start=20):
        """
        Initializes self by checking if its arguments are appropriately
        specified.

        :param str algorithm: Algorithm to use (SEM or NN).
        :param bool test: Boolean specifying if a test set is required.
            If True, the provided data is split to provide 20%
            of observations in a test set and the reported
            performance is the Gini index on test set.
        :param bool validation: Boolean (T/F) specifying if a validation
            set is required. If True, the provided data is split to
            provide 20% of observations in a validation set
            and the reported performance is the Gini index on
            the validation set (when ``test=False``). The quality
            of the discretization at each step is evaluated
            using the Gini index on the validation set, so
            criterion must be set to "gini".
        :param str criterion: The criterion to be used to assess the
            goodness-of-fit of the discretization: "bic" or
            "aic" if no validation set, else "gini".
        :param int m_start: Number of initial discretization intervals for
            all variables. If :code:`m_start` is bigger than the number of
            factor levels for a given variable in
            :code:`predictors_qual`, :code:`m_start` is set (for this
            variable only) to this variable's number of factor levels.
            Defaults to 20.
        :raises ValueError: if ``algorithm`` or ``criterion`` is not one of
            the accepted values, if ``test`` or ``validation`` is not a
            ``bool``, or if ``m_start`` is outside ``[2, 50]``.

        .. todo:: Handle a try/except for warm start?
        """
        # --- Validate input arguments ---
        # The algorithm must be SEM or NN
        if algorithm not in ['SEM', 'NN']:
            msg = 'Algorithm must be one of SEM, NN'
            logger.error(msg)
            raise ValueError(msg)

        # The criterion must be one of the three supported values
        if criterion not in ['gini', 'aic', 'bic']:
            msg = 'Criterion must be one of Gini, Aic, Bic'
            logger.error(msg)
            raise ValueError(msg)

        # test must be a genuine bool (bool cannot be subclassed, so
        # isinstance is equivalent to an exact type check here)
        if not isinstance(test, bool):
            msg = 'test must be boolean'
            logger.error(msg)
            raise ValueError(msg)

        # validation must be a genuine bool
        if not isinstance(validation, bool):
            msg = 'validation must be boolean'
            logger.error(msg)
            raise ValueError(msg)

        # m_start must stay within a sensible range
        if not 2 <= m_start <= 50:
            msg = 'Please set 2 <= m_start <= 50'
            logger.error(msg)
            raise ValueError(msg)

        # Questionable (but accepted) combinations only trigger a warning
        if not validation and criterion == 'gini':
            logger.warning('Using Gini index on training set might yield an overfitted model')

        if validation and criterion in ['aic', 'bic']:
            logger.warning('No need to penalize the log-likelihood when a validation set is used. '
                           'Using log-likelihood instead.')

        # Attributes from parameters from __init__
        self.algorithm = algorithm
        self.test = test
        self.validation = validation
        self.criterion = criterion
        self.m_start = m_start

        # Attributes from fit (placeholders until then)
        self.n = 0
        self.d_cont = 0
        self.d_qual = 0
        self.predictors_cont = None
        self.predictors_qual = None
        self.labels = None
        self.plot_fit = False
        self.criterion_iter = []
        self.best_link = []
        self.best_reglog = None
        self.model_nn = {}
        self.affectations = []
        self.best_encoder_emap = None
        self.performance = -np.inf
        self.train_rows = np.array([])
        self.validation_rows = np.array([])
        self.test_rows = np.array([])

    def _check_is_fitted(self):
        """Perform is_fitted validation for estimator.

        For the "SEM" algorithm, delegates to scikit-learn's
        ``check_is_fitted`` on :attr:`best_reglog` and on every
        ``LogisticRegression`` found in :attr:`best_link`. For any other
        algorithm, checks that ``model_nn["callbacks"][1].best_weights`` is
        set.

        This utility is meant to be used internally by estimators themselves,
        typically in their own predict / transform methods.

        :raises NotFittedError: if the estimator has not been fitted, or if
            fitting did not produce a usable model.
        """
        hint = (" If you did call fit, try increasing iter: "
                "it means it did not find a better solution than "
                "the random initialization.")

        if self.algorithm == "SEM":
            # scikit-learn's check_is_fitted raises TypeError (not
            # NotFittedError) on a non-estimator such as None, so handle the
            # "no model at all" case explicitly.
            if self.best_reglog is None:
                raise NotFittedError("This Glmdisc instance is not fitted yet." + hint)
            try:
                sk.utils.validation.check_is_fitted(self.best_reglog)
                for link in self.best_link:
                    if isinstance(link, sk.linear_model.LogisticRegression):
                        sk.utils.validation.check_is_fitted(link)
            except sk.exceptions.NotFittedError as e:
                raise NotFittedError(str(e) + hint) from e
        else:
            # model_nn is an empty dict after __init__, so a missing key or
            # index means nothing has been stored in it yet.
            try:
                best_weights = self.model_nn["callbacks"][1].best_weights
            except (KeyError, IndexError) as e:
                raise NotFittedError("This Glmdisc instance is not fitted yet." + hint) from e
            if best_weights is None:
                raise NotFittedError(hint)

    # Imported methods: implemented in sibling modules and bound here as
    # class attributes.
    from ._bestFormula import best_formula
    from ._discreteData import discrete_data
    from ._discretize import discretize
    from ._discretizeDummy import discretize_dummy
    from ._fit import fit, _calculate_shape, _init_disc, _split
    from ._plot import plot
    from ._predict import predict
    from ._generateData import generate_data