# Source code for galeritas.plot_calibration_and_distribution

from sklearn.calibration import calibration_curve
from matplotlib import pyplot as plt
import warnings
import numpy as np

__all__ = ["plot_calibration_and_distribution"]


def plot_calibration_and_distribution(
        df,
        target,
        predictions,
        n_bins=20,
        strategy='quantile',
        x_lim=None,
        y_lim=None,
        show_distribution=True,
        color="#3377bb",
        return_fig=False,
        ax=None):

    """
    Plots a calibration curve for predicted values. If wanted, it also plots the distribution of the predictions
    in a second subplot.

    :param df: a pd.Dataframe that contains target and prediction data
    :type df: pd.Dataframe

    :param target: name of target column
    :type target: string

    :param predictions: name of prediction column
    :type predictions: string

    :param n_bins: number of bins used to group the predictions in the calibration curve. |default| :code:`20`
    :type n_bins: int, optional

    :param strategy: strategy used in calibration curve: |default| :code:`quantile`
        uniform: the bins have identical widths.
        quantile: The bins have the same number of samples and depend on y_prob.
    :type strategy: string, optional

    :param x_lim: upper limit of the x axis in calibration and distribution curve. When None, 1.1 times the
        largest per-bin mean prediction is used. |default| :code:`None`
    :type x_lim: float, optional

    :param y_lim: upper limit of the y axis in calibration curve. When None, 1.1 times the largest per-bin mean
        target is used. |default| :code:`None`
    :type y_lim: float, optional

    :param show_distribution: if distribution graph is wanted. Ignored (with a warning) when `ax` is given.
        |default| :code:`True`
    :type show_distribution: boolean, optional

    :param color: personalized color |default| :code:`#3377bb`
    :type color: str, optional

    :param return_fig: If True, shows and closes the plot and returns the figure object. |default| :code:`False`
    :type return_fig: bool, optional

    :param ax: personalized axes in which the calibration curve is drawn; no new figure is created when it is
        given. |default| :code:`None`
    :type ax: matplotlib Axes, optional

    :raises ValueError: if `strategy` is neither 'uniform' nor 'quantile'

    :return: Returns the figure object with the plot (*return_fig parameter needs to be set); when `ax` is given
        this is the figure that `ax` belongs to. Returns None when return_fig is False.
    :rtype: Figure

    """

    # Reject unknown strategies up front, before any figure is created; otherwise an
    # unknown value would silently take the quantile branch below.
    if strategy not in ('uniform', 'quantile'):
        raise ValueError(
            "`strategy` must be 'uniform' or 'quantile', got {!r}.".format(strategy))

    if show_distribution and ax is not None:
        warnings.warn(
            "`ax` is not None and `show_distribution` is True. "
            "Ignoring distribution for plotting in personalized axes. "
            "To see distribution don't use `ax` parameter.")
        show_distribution = False

    if ax is not None:
        # used personalized axes (ax); keep a handle on its figure so that
        # `return_fig=True` has something to return
        ax1 = ax
        fig = ax.figure

    elif show_distribution:
        # create subplots for calibration curve and distribution
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 10))
    else:
        # create only one plot with calibration curve
        fig, ax1 = plt.subplots(figsize=(20, 10))

    # Bin edges: reused for the default axis limits and for the histogram.
    if strategy == 'uniform':
        bins = np.linspace(0.0, 1.0 + 1e-8, n_bins + 1)

    else:
        quantiles = np.linspace(0, 1, n_bins + 1)
        bins = np.percentile(df[predictions], quantiles * 100)
        # nudge the last edge up so the largest prediction falls inside the last bin
        bins[-1] = bins[-1] + 1e-8

    binids = np.digitize(df[predictions], bins) - 1

    # per-bin sum of predictions, sum of targets and sample count
    bin_sums = np.bincount(binids, weights=df[predictions], minlength=len(bins))
    bin_true = np.bincount(binids, weights=df[target], minlength=len(bins))
    bin_total = np.bincount(binids, minlength=len(bins))

    # per-bin means, skipping empty bins to avoid dividing by zero
    nonzero = bin_total != 0
    prob_true = bin_true[nonzero] / bin_total[nonzero]
    prob_pred = bin_sums[nonzero] / bin_total[nonzero]

    # default limits leave 10% headroom above the largest per-bin mean
    if x_lim is None:
        x_lim = prob_pred.max() * 1.1

    if y_lim is None:
        y_lim = prob_true.max() * 1.1

    # plot perfectly calibrated
    ax1.plot([0, y_lim], [0, y_lim], label='Perfectly calibrated', linestyle='--', color='black')

    fop_calibrated, mpv_calibrated = calibration_curve(df[target], df[predictions], n_bins=n_bins,
                                                       strategy=strategy)
    ax1.plot(mpv_calibrated, fop_calibrated, marker='.', label=predictions, color=color)

    ax1.set_xlim([0, x_lim])
    ax1.set_ylim([0, y_lim])
    ax1.legend(loc="upper left")
    ax1.grid(True)
    ax1.set_xlabel('Mean prediction value')
    ax1.set_ylabel('Mean target value')

    if show_distribution:
        # histogram of the predictions using the same bin edges as above
        ax2.hist(df[predictions], histtype="bar", bins=bins, label=predictions, color=color)

        ax2.set_xlim([0, x_lim])
        ax2.legend(loc="upper left")
        ax2.set_xlabel('Mean prediction value')
        ax2.set_ylabel('Quantity')

    if return_fig:
        plt.show()
        plt.close()

        return fig