# Source code for galeritas.plot_ecdf_curve

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from galeritas.utils.creditas_palette import get_palette
import seaborn as sns
import warnings

# Names exported by `from <this module> import *`: only the plotting entry point.
__all__ = ["plot_ecdf_curve"]


def plot_ecdf_curve(
        df,
        column_to_plot,
        drop_na=True,
        hue=None,
        hue_labels=None,
        colors=None,
        color_palette=None,
        plot_title=None,
        percentiles=(25, 50, 75),
        percentiles_title='Percentiles',
        mark_percentiles=True,
        show_percentile_table=False,
        figsize=(16, 7),
        ax=None,
        return_fig=False,
        **legend_kwargs):
    """
    Generates an empirical cumulative distribution function.
    Theoretical reference can be found `here <https://en.wikipedia.org/wiki/Empirical_distribution_function>`__.

    :param df: A dataframe containing the dataset. It is copied, so the caller's dataframe is never modified.
    :type df: DataFrame

    :param column_to_plot: Column name of the observed data.
    :type column_to_plot: str

    :param drop_na: If True, removes the rows where the column to be plotted is missing. Otherwise the missing values are kept and passed as they are to the ECDF and percentile calculations. |default| :code:`True`
    :type drop_na: bool, optional

    :param hue: Name of the dataframe's column containing the categories. When given, one ECDF curve is drawn per category (the categories are converted to strings and sorted). |default| :code:`None`
    :type hue: str, optional

    :param hue_labels: Dictionary mapping the values found in the hue column to the labels that should be shown instead (e.g. {1:'True', 0: 'False'}). Values missing from the dictionary are kept unchanged. Ignored when hue is not given. |default| :code:`None`
    :type hue_labels: Dict, optional

    :param colors: A list containing the hexadecimal colors of each hue. It must have at least as many elements as there are hue groups. |default| :code:`None`
    :type colors: list of str, optional

    :param color_palette: Name of a seaborn palette used to pick one color per hue value. When given, it takes precedence over the colors parameter. If both colors and color_palette are None, the default palette of the library is used. |default| :code:`None`
    :type color_palette: str, optional

    :param plot_title: Text to describe the plot's title. |default| :code:`None`
    :type plot_title: str, optional

    :param percentiles: A tuple that indicates the percentiles of the distributions. |default| :code:`(25, 50, 75)`
    :type percentiles: tuple, optional

    :param percentiles_title: A string to be used to indicate the percentiles. |default| :code:`Percentiles`
    :type percentiles_title: str, optional

    :param mark_percentiles: If True, shows the percentiles defined in parameter percentiles. |default| :code:`True`
    :type mark_percentiles: bool, optional

    :param show_percentile_table: If True, shows a table with the values for each percentile and category. The table is available regardless of mark_percentiles. |default| :code:`False`
    :type show_percentile_table: bool, optional

    :param figsize: A tuple that indicates the figure size (respectively, width and height in inches). Only used when ax is not given. |default| :code:`(16, 7)`
    :type figsize: tuple, optional

    :param ax: Custom figure axes to plot. |default| :code:`None`
    :type ax: matplotlib.axes, optional

    :param return_fig: If True, returns the figure object. A figure created by this function is shown and closed before being returned; a figure owned by the caller (ax given) is returned untouched. |default| :code:`False`
    :type return_fig: bool, optional

    :param legend_kwargs: Matplotlib.pyplot's legend arguments such as *bbox_to_anchor* and *ncol*. Further information `here <http://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.legend>`__.
    :type legend_kwargs: key, value mappings

    :raises KeyError: If colors has fewer elements than there are hue categories.

    :return: The figure object with the plot when return_fig is True, otherwise None.
    :rtype: Figure
    """
    data = df.copy()

    if data[column_to_plot].isnull().values.any():
        warnings.warn(f'Column "{column_to_plot}" has missing values! If the parameter drop_na is True (which is the '
                      f'default value), the missing values will be removed.')

    if drop_na:
        data = data.dropna(subset=[column_to_plot])

    if ax is not None:
        axes = ax
        # The caller owns this figure; we still need a handle on it for return_fig.
        fig = axes.get_figure()
    else:
        fig, axes = plt.subplots(1, 1, figsize=figsize)
        fig.subplots_adjust(hspace=0.5)

    if hue:
        if hue_labels is not None:
            data[hue] = data[hue].apply(
                lambda category: hue_labels[category] if category in hue_labels else category
            )

        data[hue] = data[hue].astype('str')
        data = data.sort_values(by=hue)
        hue_categories_labels = data[hue].unique()
        # One sub-dataframe per category, in the same order as the labels.
        data_list = [data.loc[data[hue] == category] for category in hue_categories_labels]
    else:
        hue_categories_labels = [column_to_plot]
        data_list = [data]

    if colors is not None and len(hue_categories_labels) > len(colors):
        raise KeyError(f'The number of colors passed by colors parameter is smaller than the number of categories in "{hue}" column! Expected {len(hue_categories_labels)} colors but only {len(colors)} was/were passed.')

    if color_palette:
        colors = sns.color_palette(color_palette, len(hue_categories_labels))
    elif colors is None:
        colors = get_palette()

    colormap = dict(zip(hue_categories_labels, colors))

    for category, subset in zip(hue_categories_labels, data_list):
        x_values, y_values = calculate_ecdf_plot_axis_values(subset[column_to_plot])
        axes.plot(x_values, y_values, marker='.', markersize=4.5, alpha=0.7, linestyle='none',
                  label=category, color=colormap[category])

    axes.set_title(plot_title)
    axes.set_ylabel("ECDF")
    axes.set_xlabel(column_to_plot)

    # The percentile values feed both the markers and the table, so they are
    # computed whenever either output is requested.
    percentiles_values = []
    if mark_percentiles or show_percentile_table:
        percentiles_values = [
            np.percentile(subset[column_to_plot], percentiles) for subset in data_list
        ]

    if mark_percentiles:
        percentile_fractions = np.divide(percentiles, 100)
        for category, percentiles_calc in zip(hue_categories_labels, percentiles_values):
            axes.plot(
                percentiles_calc,
                percentile_fractions,
                marker='D',
                markersize=8,
                color=colormap[category],
                linestyle='none',
                label=f'{percentiles_title} - {category}'
            )

        # NOTE(review): no transform is passed, so (0.05, -0.15) is in data
        # coordinates; axes-relative placement (transform=axes.transAxes) may
        # have been the intent -- confirm before changing.
        axes.text(
            0.05,
            -0.15,
            f"{percentiles_title} {percentiles}",
            horizontalalignment='center',
            verticalalignment='center',
            bbox=dict(boxstyle='round', alpha=0.25, facecolor='gray')
        )

    # Draw the grid on the axes being plotted, which is not necessarily
    # pyplot's "current" axes when the caller supplies ``ax``.
    axes.grid(True, alpha=0.6, linestyle='--')

    # Custom legend arguments replace the default legend entirely.
    if legend_kwargs:
        axes.legend(**legend_kwargs)
    else:
        axes.legend(loc="lower right")

    if show_percentile_table:
        columns = pd.MultiIndex.from_tuples(
            [(percentiles_title, percentile) for percentile in percentiles]
        )
        percentile_table = pd.DataFrame(percentiles_values, index=hue_categories_labels, columns=columns)

        try:
            # ``display`` is injected into builtins by IPython/Jupyter only.
            show_table = display  # noqa: F821
        except NameError:
            show_table = print
        show_table(percentile_table)

    if return_fig:
        if ax is None:
            # Only a figure created here is shown and released from pyplot.
            plt.show()
            plt.close(fig)

        return fig

    return None


def calculate_ecdf_plot_axis_values(data):
    """
    Computes the coordinates of an empirical cumulative distribution curve.

    :param data: One-dimensional collection of observations.
    :type data: array-like

    :return: A tuple with the observations sorted in ascending order (x axis) and, for each position ``i`` (1-based), the fraction ``i / len(data)`` (y axis).
    :rtype: tuple of numpy.ndarray
    """
    sorted_observations = np.sort(data)
    n_observations = len(data)
    # Rank of each sorted observation divided by the sample size.
    cumulative_fraction = np.arange(1, n_observations + 1) / n_observations

    return sorted_observations, cumulative_fraction