Source code for galeritas.plot_calibration_and_distribution

from sklearn.calibration import calibration_curve
from matplotlib import pyplot as plt
import warnings
import numpy as np

# Public API: the only name exported by a wildcard import of this module.
__all__ = ["plot_calibration_and_distribution"]


def plot_calibration_and_distribution(
        df,
        target,
        predictions,
        n_bins=20,
        strategy='quantile',
        x_lim=None,
        y_lim=None,
        show_distribution=True,
        color="#3377bb",
        return_fig=False,
        ax=None):
    """
    Plots a calibration curve for predicted values. If wanted, it also plots the distribution
    (histogram) of the predictions underneath the curve.

    :param df: a pd.Dataframe that contains target and prediction data
    :type df: pd.Dataframe

    :param target: name of target column
    :type target: string

    :param predictions: name of prediction column
    :type predictions: string

    :param n_bins: number of bins used to group the predictions in the calibration curve
        (and in the distribution histogram). |default| :code:`20`
    :type n_bins: int, optional

    :param strategy: strategy used in calibration curve: |default| :code:`quantile`
        uniform: the bins have identical widths.
        quantile: the bins have the same number of samples and depend on the predictions.
    :type strategy: string, optional

    :param x_lim: upper limit of the x axes in calibration and distribution curve. When None,
        1.1 times the largest per-bin mean prediction is used. |default| :code:`None`
    :type x_lim: float, optional

    :param y_lim: upper limit of the y axis in calibration curve. When None, 1.1 times the
        largest per-bin mean target is used. |default| :code:`None`
    :type y_lim: float, optional

    :param show_distribution: if distribution graph is wanted. Ignored (with a warning) when
        `ax` is given. |default| :code:`True`
    :type show_distribution: boolean, optional

    :param color: personalized color |default| :code:`#3377bb`
    :type color: str, optional

    :param return_fig: If True return figure object. |default| :code:`False`
    :type return_fig: bool, optional

    :param ax: personalized axes in which the calibration curve is drawn. When given, no new
        figure is created and the figure is neither shown nor closed by this function.
        |default| :code:`None`
    :type ax: matplotlib Axes, optional

    :raises ValueError: if `strategy` is neither :code:`uniform` nor :code:`quantile`

    :return: Returns the figure object with the plot (*return_fig parameter needs to be set)
    :rtype: Figure
    """
    # Fail fast: otherwise an unknown strategy would silently be binned as
    # 'quantile' below and only be rejected later by `calibration_curve`,
    # after a figure has already been created and left open.
    if strategy not in ('uniform', 'quantile'):
        raise ValueError(
            "Invalid strategy %r. Strategy must be either 'quantile' or 'uniform'." % (strategy,)
        )

    if show_distribution and ax is not None:
        warnings.warn("`ax` is not None and `show_distribution` is True. "
                      "Ignoring distribution for plotting in personalized axes. \n"
                      "To see distribution don't use `ax` parameter.")
        show_distribution = False

    created_figure = ax is None
    if not created_figure:
        # used personalized axes (ax); keep a handle on its figure so that
        # `return_fig=True` has something to return
        ax1 = ax
        fig = ax1.figure
    elif show_distribution:
        # create subplots for calibration curve and distribution
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 10))
    else:
        # create only one plot with calibration curve
        fig, ax1 = plt.subplots(figsize=(20, 10))

    scores = df[predictions]

    # Bin edges; the last edge is nudged up by 1e-8 so that the maximum
    # prediction falls inside the last bin instead of an extra one.
    if strategy == 'uniform':
        bins = np.linspace(0.0, 1.0 + 1e-8, n_bins + 1)
    else:
        quantiles = np.linspace(0, 1, n_bins + 1)
        bins = np.percentile(scores, quantiles * 100)
        bins[-1] = bins[-1] + 1e-8

    # Per-bin mean prediction / mean target, used only to pick default axis limits.
    binids = np.digitize(scores, bins) - 1

    bin_sums = np.bincount(binids, weights=scores, minlength=len(bins))
    bin_true = np.bincount(binids, weights=df[target], minlength=len(bins))
    bin_total = np.bincount(binids, minlength=len(bins))

    nonzero = bin_total != 0  # skip empty bins to avoid dividing by zero
    prob_true = bin_true[nonzero] / bin_total[nonzero]
    prob_pred = bin_sums[nonzero] / bin_total[nonzero]

    if x_lim is None:
        x_lim = prob_pred.max() * 1.1

    if y_lim is None:
        y_lim = prob_true.max() * 1.1

    # plot perfectly calibrated
    ax1.plot([0, y_lim], [0, y_lim], label='Perfectly calibrated', linestyle='--', color='black')

    fop_calibrated, mpv_calibrated = calibration_curve(
        df[target], scores, n_bins=n_bins, strategy=strategy
    )
    ax1.plot(mpv_calibrated, fop_calibrated, marker='.', label=predictions, color=color)

    ax1.set_xlim([0, x_lim])
    ax1.set_ylim([0, y_lim])
    ax1.legend(loc="upper left")
    ax1.grid(True)
    ax1.set_xlabel('Mean prediction value')
    ax1.set_ylabel('Mean target value')

    if show_distribution:
        ax2.hist(scores, histtype="bar", bins=bins, label=predictions, color=color)
        ax2.set_xlim([0, x_lim])
        ax2.legend(loc="upper left")
        ax2.set_xlabel('Mean prediction value')
        ax2.set_ylabel('Quantity')

    if return_fig:
        if created_figure:
            # Only show/close figures created here; a figure that belongs to
            # caller-supplied axes is left for the caller to manage.
            plt.show()
            plt.close(fig)

        return fig