Source code for b2plot.analysis

# -*- coding: utf-8 -*-
""" Analysis tools

Author:
    - Simon Wehle (swehle@desy.de)
    - Henrikas Svidras (henrikas.svidras@desy.de)

"""
import b2plot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from .histogram import _hist_init, to_stack, hist, get_xaxis
from .functions import xlim


[docs]def optimal_bin_size(n):
    """ Is this empricially the best?
    """
    return int(2*n**(1/3))


[docs]def plot_flatness(sig, tag, bins=None, ax=None, xrange=None, percent_step=5):
    """ Plotting differences of sig distribution in percentiles of tag distribution

    Args:
        sig:
        tag:
        bins:
        ax:
        xrange:
        percent_step:

    """

    if ax is None:
        fix, ax = plt.subplots()

    xaxis = _hist_init(sig, bins=bins, xrange=xrange)

    colormap = plt.get_cmap('magma')
    orig, x = np.histogram(sig, bins=xaxis, range=xrange, normed=True, )
    bin_center = ((x + np.roll(x, 1)) / 2)[1:]
    tmp = orig/orig
    ax.plot(bin_center, tmp, color='black', lw=1)
    for quantil in np.arange(5, 100, percent_step):
        cut = np.percentile(tag, quantil)
        sel = tag >= cut
        y, x = np.histogram(sig[sel], bins=x, range=xrange, normed=True, )
        y /= orig
        ax.fill_between(bin_center, tmp, y, color=colormap(quantil/100.0))
        tmp = y


[docs]def ratio(y1, y2, y1_err=None, y2_err= None):
    """ calculate the ratio between two histograms y1/y2

    Args:
        y1: y values of first histogram
        y2: y values of second histogram
        y1_err: (optional) error of first
        y2_err: (optional) error of second

    Returns:
        ratio, ratio_error

    """
    assert len(y1) == len(y2), "y1 and y2 length does not match"
    y1e = np.sqrt(y1) if y1_err is None else y1_err
    y2e = np.sqrt(y2) if y2_err is None else y2_err
    r = y1/y2
    re = np.sqrt((y1/(1.0*y2*y2))*(y1/(1.0*y2*y2))*y2e*y2e+(1/(1.0*y2))*(1/(1.0*y2))*y1e*y1e)
    return r, re


[docs]def divideEfficiency(n_nom, n_denom, confidence=0.683):
    """ divides two histograms for an efficiency calculation

    Args:
        n_nom: y values of nominator histogram (1d or 2d)
        n_denom: y values of denominator histogram  (1d or 2d)
        confidence: (optional) confidence level, by default 0.683

    Returns:
        ratio, [lower ratio error, upper ratio error]
    """
    shape = np.shape(n_nom)

    flattened_n_nom = n_nom.flatten()
    flattened_n_denom = n_denom.flatten()

    rat = []
    err_down = []
    err_up = []

    for passes, counts in zip(flattened_n_nom, flattened_n_denom):
        bin_ratio = exact_CI(passes, counts, conf=confidence)
        rat.append(bin_ratio[0])
        err_down.append(bin_ratio[1])
        err_up.append(bin_ratio[2])

    err_down = np.reshape(err_down, shape)
    err_up = np.reshape(err_up, shape)
    rat = np.reshape(rat, shape)

    return rat, (err_down, err_up)


[docs]def exact_CI(k, n, conf=0.683):
    """ calculated clopper pearson confidence intervals

    Args:
        k: number of passes ("numerator of proportion")
        n: total sample size ("denominator of proportion")
        conf: (optional) confidence level, by default 0.683

    Returns:
        ratio, [upper ratio error, lower ratio error]
    """

    from scipy.stats import beta
    k = float(k)
    n = float(n)
    p = (k/n) if n > 0 else 0

    alpha = (1 - conf)
    up = 1 if k == n else 1 - beta.ppf(alpha/2, n-k, k+1)
    down = 0 if k == 0 else 1 - beta.ppf(1-alpha/2, n-k+1, k)

    result = (p, p-down, up-p)
    return result


[docs]def divhist(h1, h2):
    assert np.mean(h1[1]-h2[1]) < 0.1
    r,re = ratio(h1[0],h2[0], np.mean(h1[2], axis=0),np.mean(h2[2], axis=0) )
    return r, h1[1], [re,re]


[docs]def subhist(h1, h2, rethist=False):
    assert np.mean(h1[1]-h2[1]) < 0.1
    r = h1[0] - h2[0]
    re = np.sqrt(np.mean(h1[2], axis=0)**2 + np.mean(h2[2], axis=0)**2)
    if rethist:
        return r,h1[1],  [re,re]
    return h1[1],r, [re,re]


[docs]def data_mc_ratio(data, mc, label_data='Data', label_mc="MC",
                  y_label=None, figsize=None, ratio_range=(0, 2),
                  *args, **kwarg):
    """ Perform data mc distributions
    
    returns:
        axes
    """
    f, axes = plt.subplots(2, 1, gridspec_kw={"height_ratios": [3, 1]}, sharex=True, figsize=figsize)
    ax0 = axes[0]

    hm = b2plot.hist(mc, lw=2, ax=ax0, label=label_mc, *args, **kwarg)
    hd = b2plot.errorhist(data, ax=ax0, label=label_data, color='black')
    ax0.legend()

    ax1 = axes[1]
    ry, rye = ratio(hd[0], hm[0])
    b2plot.errorbar(hd[1], ry, rye, ax=ax1, color='grey')
    ax1.axhline(1, color='grey', lw=0.5, ls='--')
    f.subplots_adjust(hspace=0.1)

    ax1.set_ylim(*ratio_range)
    b2plot.xlim()
    if y_label is not None:
        ax0.set_ylabel(y_label)
        ax1.set_ylabel("Ratio")
        ax1.yaxis.set_label_coords(-0.08, 0.5)
        ax0.yaxis.set_label_coords(-0.08, 0.5)
    return axes


[docs]def mask_append(xs,xb):
    """ Merge xs and xb into one vector and return it with a boolean mask for each category
    """
    return np.append(xs,xb), np.append(np.ones(len(xs)), np.zeros(len(xb)))==1


[docs]def pur_eff_cont(x, mask):
    """ Continuus evaluation of the purity vs efficiency 

    Returns:
        efficiency, purity : arrays of len(x)
    """
    if len(pd.unique(mask)) > 2:
        # if signal and background distribution are given as x and mask
        x, mask = mask_append(x, mask)

    ag = np.argsort(np.array(x))[::-1]
    e = mask[ag].cumsum() 
    eff = e/e[-1]
    pur = e/np.arange(1,len(mask)+1)
    return eff, pur 


[docs]def pur_eff(x, mask, nbins=None, reverse_too=False):
    """ Plots the distribution x in an equal frequency binning with the purity regarding mask

    Args:
        x: Distribution or signal distribution
        mask: Boolean mask or background distribution
        nbins:
        do_plot:
        figsize:
        xticks_fontsize:

    Returns:

    """
    if len(pd.unique(mask)) > 2:
        # if signal and background distribution are given as x and mask
        x, mask = mask_append(x, mask)

    nbins = optimal_bin_size(len(x)) if nbins is None else nbins

    x = x[np.isfinite(x)]
    bins = np.percentile(x, np.linspace(0, 100, nbins))

    y_, _ = np.histogram(x, bins)
    y_1, _ = np.histogram(x[mask], bins)
    y_0, _ = np.histogram(x[~mask], bins)

    pur = y_1 / (y_1 + y_0)
    ps = np.argsort(pur)
    # pur_err = (pur * (1 - pur)) / (y_)  

    # Sort by purity
    y1 = y_1[ps]
    y0 = y_0[ps]
    y = y_[ps]

    eff = sum(y1)-y1.cumsum()
    eff_err = np.sqrt(eff)/eff[0] #faster than np.sum(y_1)
    eff = eff/eff[0]

    pur= (y1[::-1].cumsum()/(y1[::-1].cumsum()+y0[::-1].cumsum()))[::-1]
    pur_err = (pur * (1 - pur)) / (y)  

    if reverse_too:
        eff_r = sum(y1)-y1[::-1].cumsum()
        eff_r_err = np.sqrt(eff_r)/eff_r[0]
        eff_r = eff_r/eff_r[0]    

        pur_r= (y1.cumsum()/(y1.cumsum()+y0.cumsum()))[::-1]
        pur_r_err = (pur_r * (1 - pur_r)) / (y)  

        eff = np.append(eff, eff_r[::-1])
        pur = np.append(pur, pur_r[::-1])
        eff_err = np.append(eff_err, eff_r_err[::-1])
        pur_err = np.append(pur_err, pur_r_err[::-1])
    return eff, pur,pur_err, eff_err


[docs]def purity_hist(x, mask, nbins=10, do_plot=True, figsize=None, xticks_fontsize=None, ax=None):
    """ Plots the distribution x in an equal frequency binning with the purity regarding mask

    Args:
        x: Distribution or signal distribution
        mask: Boolean mask or background distribution
        nbins:
        do_plot:
        figsize:
        xticks_fontsize:
        ax:

    Returns:

    """
    if len(pd.unique(mask)) > 2:
        # if signal and background distribution are given as x and mask
        x, mask = mask_append(x, mask)

    x = x[np.isfinite(x)]
    bins = np.percentile(x, np.linspace(0, 100, nbins))

    y_, _ = np.histogram(x, bins)
    y_1, _ = np.histogram(x[mask], bins)
    y_0, _ = np.histogram(x[~mask], bins)

    pur = y_1 / (y_1 + y_0)
    pur_err = (pur * (1 - pur)) / (y_)    

    if do_plot:
        x_ = np.arange(len(y_) + 1)
        x_centers = (x_[1:] + x_[:-1]) / 2.

        if ax is None:
            _, ax = plt.subplots(figsize=figsize)
        ax.hist(x_[:-1], x_, weights=y_1, alpha=0.8, label='Signal')
        ax.hist(x_[:-1], x_, weights=y_0, histtype='step', lw=2, label='Background')
        ax.hist(x_[:-1], x_, weights=y_, histtype='step', color='grey', label='Total')

        ax2 = ax.twinx()
        ax2.errorbar(x_centers, pur, np.sqrt(pur_err), color='black', fmt="o--")
        ax2.set_xticklabels([], [])
        ax2.set_xticks([], [])

        ax2.set_ylabel("Purity")
        ax2.set_ylim(0)

        np.append(bins, x.max())
        _ = ax.set_xticks(x_)
        _ = ax.set_xticklabels(['%1.2e' % f for f in bins], rotation=-90, fontfamily='monospace',
                               fontsize=xticks_fontsize)
        plt.xlim(np.min(x_), np.max(x_))
        plt.sca(ax)
    return pur, np.sqrt(pur_err), bins


[docs]def flat_bins(x, set=False, nbins=None,  fontsize=None, rotation=-90):
    if nbins is None:
        nbins = len(get_xaxis())-1
    bins = np.percentile(x, np.linspace(0, 100, nbins+1))
    np.append(bins, x.max())
    if set:
        ax = plt.gca()
        x_ = np.linspace(0, 110, len(bins)+1)
        _ = ax.set_xticks(x_)
        _ = ax.set_xticklabels(['%1.2e' % f for f in bins], rotation=rotation, fontfamily='monospace',
                    fontsize=fontsize)
    else:
        return bins, np.linspace(0, 100, len(bins)+1)


[docs]def purity_flatness_proba(x, mask, nbins=10, do_plot=False):
    """ Returns the probability that the purity of x[mask] and x[~mask] is flat.

    This can be used as a measure of the information content in this observable regarding the mask.

    Args:
        x: Observable or signal distribution
        mask: Boolean mask, like 'is signal', 'is background' or background distribution
        nbins: Number of bins to calculate the purity
        do_plot (bool): plot the purity distribution

    Returns:
        chi2: probability of CDF(flat) against CDF(purity), where a value of 1 corresponds to the purity being compatible
        with a flat distribution.

    """

    if len(pd.unique(mask)) > 2:
        # if signal and background distribution are given as x and mask
        x, mask = mask_append(x, mask)
    
    if np.std(x)==0:
        return 1

    pur, pure, b = purity_hist(x, mask, nbins=nbins, do_plot=do_plot)

    mask = (np.isfinite(pur)) & (pur != 0)
    p = pur[mask]
    pe = pure[mask] * 100

    cs = np.nancumsum(p) * 100
    cl = np.cumsum(np.full(len(cs), np.mean(p), )) * 100
    chi2 = ((cl - cs) ** 2) / pe ** 2
    try:
        import scipy
        return scipy.stats.distributions.chi2.sf(np.sqrt(np.mean(chi2)), len(chi2))
    except ImportError:
        return chi2



[docs]def df_var_by2xy():
    pass


[docs]def sig_bkg_plot(df, col, by=None, ax=None, bins=None, range=None, labels=None, normed=False):
    """

    Args:
        df:
        col:
        by:
        ax:
        bins:
        range:
        labels:
        normed:

    Returns:

    """

    # foreseen usage
    if isinstance(df, pd.DataFrame):
        # by is not a boolean index
        if isinstance(by, str):
            x, cats = to_stack(df, col, by, get_cats=True)
            if len(x) > 2 :
                print("Waring, more than two categories in %s!" % by)
                assert len(x) > 1, "Did not found any categories in %s!" % by

            x_sig = x[0]
            x_bkg = x[1]

            if len(cats) == 2:
                if labels is None:
                    labels = [by+f' == {cats[1]}', by+f' == {cats[0]}']

        # by is a boolean index
        else:
           x_sig = df[col][by].values
           x_bkg = df[col][~by].values
    # Alternative usage, passing two arrays
    else:
        if len(pd.unique(col)) == 2:
            # if signal and background distribution are given as x and mask
            x_sig = df[col]
            x_bkg = df[~col]
        else:
            x_sig = df
            x_bkg = col

    xaxis = _hist_init(np.append(x_sig, x_bkg), bins, xrange=range)

    if labels is None:
        labels = ["Background", "Signal"]

    hist(x_bkg, xaxis, style=0, label=labels[0], ax=ax, density=normed)
    hist(x_sig, xaxis, lw=2, color=0, label=labels[1], ax=ax, density=normed)

    plt.legend()
    # xlim()


[docs]def get_upper_lim(x, perc=0.1, width=1, maxtries=100):
    xmax = np.max(x)
    xstd = width*np.std(x)
    if xstd == 0:
        return xmax
    lx = len(x)
    cut = xmax
    i = 0
    while i<maxtries:
        cut -= xstd
        px = 100*len(x[x>cut])/lx
        if px>=perc:
            cut+=xstd
            break
        i+=1
    return cut


[docs]def get_lower_lim(x, perc=0.1, width=1, maxtries=100):
    xmin = np.min(x)
    xstd = width*np.std(x)
    if xstd == 0:
        return xmin
    lx = len(x)
    cut = xmin
    i = 0
    while i<maxtries:
        cut += xstd
        px = 100*len(x[x<cut])/lx
        if px>=perc:
            cut-=xstd
            break
        
    return cut


[docs]def minmax(x,perc=0.1, width=1, maxtries=100):
    return (get_lower_lim(x, perc, width, maxtries), get_upper_lim(x, perc, width, maxtries))


[docs]def plot_feature_importance(fi, cols, figsize=None, palette='Blues_d',ax=None, *args, **kwargs):
    if ax is None:
        plt.figure(figsize=figsize)
        ax = plt.gca()
    import seaborn as sns
    imp = np.argsort(fi)[::-1]
    dfplot= pd.DataFrame({'col':np.array(cols)[imp], 'imp':fi[imp]})
    sns.barplot('imp', 'col', data=dfplot, hue_order=fi[imp][::-1], palette=palette, ax=ax, *args, **kwargs )
    ax.set_xlabel("Importance",)
    ax.set_ylabel("Feature",)
    return np.array(cols)[imp]