Source code for b2plot.analysis

# -*- coding: utf-8 -*-
""" Analysis tools

Author:
    - Simon Wehle (swehle@desy.de)
    - Henrikas Svidras (henrikas.svidras@desy.de)

"""
import b2plot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from .histogram import _hist_init, to_stack, hist, get_xaxis
from .functions import xlim


[docs]def optimal_bin_size(n): """ Is this empricially the best? """ return int(2*n**(1/3))
[docs]def plot_flatness(sig, tag, bins=None, ax=None, xrange=None, percent_step=5): """ Plotting differences of sig distribution in percentiles of tag distribution Args: sig: tag: bins: ax: xrange: percent_step: """ if ax is None: fix, ax = plt.subplots() xaxis = _hist_init(sig, bins=bins, xrange=xrange) colormap = plt.get_cmap('magma') orig, x = np.histogram(sig, bins=xaxis, range=xrange, normed=True, ) bin_center = ((x + np.roll(x, 1)) / 2)[1:] tmp = orig/orig ax.plot(bin_center, tmp, color='black', lw=1) for quantil in np.arange(5, 100, percent_step): cut = np.percentile(tag, quantil) sel = tag >= cut y, x = np.histogram(sig[sel], bins=x, range=xrange, normed=True, ) y /= orig ax.fill_between(bin_center, tmp, y, color=colormap(quantil/100.0)) tmp = y
[docs]def ratio(y1, y2, y1_err=None, y2_err= None): """ calculate the ratio between two histograms y1/y2 Args: y1: y values of first histogram y2: y values of second histogram y1_err: (optional) error of first y2_err: (optional) error of second Returns: ratio, ratio_error """ assert len(y1) == len(y2), "y1 and y2 length does not match" y1e = np.sqrt(y1) if y1_err is None else y1_err y2e = np.sqrt(y2) if y2_err is None else y2_err r = y1/y2 re = np.sqrt((y1/(1.0*y2*y2))*(y1/(1.0*y2*y2))*y2e*y2e+(1/(1.0*y2))*(1/(1.0*y2))*y1e*y1e) return r, re
[docs]def divideEfficiency(n_nom, n_denom, confidence=0.683): """ divides two histograms for an efficiency calculation Args: n_nom: y values of nominator histogram (1d or 2d) n_denom: y values of denominator histogram (1d or 2d) confidence: (optional) confidence level, by default 0.683 Returns: ratio, [lower ratio error, upper ratio error] """ shape = np.shape(n_nom) flattened_n_nom = n_nom.flatten() flattened_n_denom = n_denom.flatten() rat = [] err_down = [] err_up = [] for passes, counts in zip(flattened_n_nom, flattened_n_denom): bin_ratio = exact_CI(passes, counts, conf=confidence) rat.append(bin_ratio[0]) err_down.append(bin_ratio[1]) err_up.append(bin_ratio[2]) err_down = np.reshape(err_down, shape) err_up = np.reshape(err_up, shape) rat = np.reshape(rat, shape) return rat, (err_down, err_up)
[docs]def exact_CI(k, n, conf=0.683): """ calculated clopper pearson confidence intervals Args: k: number of passes ("numerator of proportion") n: total sample size ("denominator of proportion") conf: (optional) confidence level, by default 0.683 Returns: ratio, [upper ratio error, lower ratio error] """ from scipy.stats import beta k = float(k) n = float(n) p = (k/n) if n > 0 else 0 alpha = (1 - conf) up = 1 if k == n else 1 - beta.ppf(alpha/2, n-k, k+1) down = 0 if k == 0 else 1 - beta.ppf(1-alpha/2, n-k+1, k) result = (p, p-down, up-p) return result
[docs]def divhist(h1, h2): assert np.mean(h1[1]-h2[1]) < 0.1 r,re = ratio(h1[0],h2[0], np.mean(h1[2], axis=0),np.mean(h2[2], axis=0) ) return r, h1[1], [re,re]
[docs]def subhist(h1, h2, rethist=False): assert np.mean(h1[1]-h2[1]) < 0.1 r = h1[0] - h2[0] re = np.sqrt(np.mean(h1[2], axis=0)**2 + np.mean(h2[2], axis=0)**2) if rethist: return r,h1[1], [re,re] return h1[1],r, [re,re]
[docs]def data_mc_ratio(data, mc, label_data='Data', label_mc="MC", y_label=None, figsize=None, ratio_range=(0, 2), *args, **kwarg): """ Perform data mc distributions returns: axes """ f, axes = plt.subplots(2, 1, gridspec_kw={"height_ratios": [3, 1]}, sharex=True, figsize=figsize) ax0 = axes[0] hm = b2plot.hist(mc, lw=2, ax=ax0, label=label_mc, *args, **kwarg) hd = b2plot.errorhist(data, ax=ax0, label=label_data, color='black') ax0.legend() ax1 = axes[1] ry, rye = ratio(hd[0], hm[0]) b2plot.errorbar(hd[1], ry, rye, ax=ax1, color='grey') ax1.axhline(1, color='grey', lw=0.5, ls='--') f.subplots_adjust(hspace=0.1) ax1.set_ylim(*ratio_range) b2plot.xlim() if y_label is not None: ax0.set_ylabel(y_label) ax1.set_ylabel("Ratio") ax1.yaxis.set_label_coords(-0.08, 0.5) ax0.yaxis.set_label_coords(-0.08, 0.5) return axes
[docs]def mask_append(xs,xb): """ Merge xs and xb into one vector and return it with a boolean mask for each category """ return np.append(xs,xb), np.append(np.ones(len(xs)), np.zeros(len(xb)))==1
[docs]def pur_eff_cont(x, mask): """ Continuus evaluation of the purity vs efficiency Returns: efficiency, purity : arrays of len(x) """ if len(pd.unique(mask)) > 2: # if signal and background distribution are given as x and mask x, mask = mask_append(x, mask) ag = np.argsort(np.array(x))[::-1] e = mask[ag].cumsum() eff = e/e[-1] pur = e/np.arange(1,len(mask)+1) return eff, pur
[docs]def pur_eff(x, mask, nbins=None, reverse_too=False): """ Plots the distribution x in an equal frequency binning with the purity regarding mask Args: x: Distribution or signal distribution mask: Boolean mask or background distribution nbins: do_plot: figsize: xticks_fontsize: Returns: """ if len(pd.unique(mask)) > 2: # if signal and background distribution are given as x and mask x, mask = mask_append(x, mask) nbins = optimal_bin_size(len(x)) if nbins is None else nbins x = x[np.isfinite(x)] bins = np.percentile(x, np.linspace(0, 100, nbins)) y_, _ = np.histogram(x, bins) y_1, _ = np.histogram(x[mask], bins) y_0, _ = np.histogram(x[~mask], bins) pur = y_1 / (y_1 + y_0) ps = np.argsort(pur) # pur_err = (pur * (1 - pur)) / (y_) # Sort by purity y1 = y_1[ps] y0 = y_0[ps] y = y_[ps] eff = sum(y1)-y1.cumsum() eff_err = np.sqrt(eff)/eff[0] #faster than np.sum(y_1) eff = eff/eff[0] pur= (y1[::-1].cumsum()/(y1[::-1].cumsum()+y0[::-1].cumsum()))[::-1] pur_err = (pur * (1 - pur)) / (y) if reverse_too: eff_r = sum(y1)-y1[::-1].cumsum() eff_r_err = np.sqrt(eff_r)/eff_r[0] eff_r = eff_r/eff_r[0] pur_r= (y1.cumsum()/(y1.cumsum()+y0.cumsum()))[::-1] pur_r_err = (pur_r * (1 - pur_r)) / (y) eff = np.append(eff, eff_r[::-1]) pur = np.append(pur, pur_r[::-1]) eff_err = np.append(eff_err, eff_r_err[::-1]) pur_err = np.append(pur_err, pur_r_err[::-1]) return eff, pur,pur_err, eff_err
[docs]def purity_hist(x, mask, nbins=10, do_plot=True, figsize=None, xticks_fontsize=None, ax=None): """ Plots the distribution x in an equal frequency binning with the purity regarding mask Args: x: Distribution or signal distribution mask: Boolean mask or background distribution nbins: do_plot: figsize: xticks_fontsize: ax: Returns: """ if len(pd.unique(mask)) > 2: # if signal and background distribution are given as x and mask x, mask = mask_append(x, mask) x = x[np.isfinite(x)] bins = np.percentile(x, np.linspace(0, 100, nbins)) y_, _ = np.histogram(x, bins) y_1, _ = np.histogram(x[mask], bins) y_0, _ = np.histogram(x[~mask], bins) pur = y_1 / (y_1 + y_0) pur_err = (pur * (1 - pur)) / (y_) if do_plot: x_ = np.arange(len(y_) + 1) x_centers = (x_[1:] + x_[:-1]) / 2. if ax is None: _, ax = plt.subplots(figsize=figsize) ax.hist(x_[:-1], x_, weights=y_1, alpha=0.8, label='Signal') ax.hist(x_[:-1], x_, weights=y_0, histtype='step', lw=2, label='Background') ax.hist(x_[:-1], x_, weights=y_, histtype='step', color='grey', label='Total') ax2 = ax.twinx() ax2.errorbar(x_centers, pur, np.sqrt(pur_err), color='black', fmt="o--") ax2.set_xticklabels([], []) ax2.set_xticks([], []) ax2.set_ylabel("Purity") ax2.set_ylim(0) np.append(bins, x.max()) _ = ax.set_xticks(x_) _ = ax.set_xticklabels(['%1.2e' % f for f in bins], rotation=-90, fontfamily='monospace', fontsize=xticks_fontsize) plt.xlim(np.min(x_), np.max(x_)) plt.sca(ax) return pur, np.sqrt(pur_err), bins
[docs]def flat_bins(x, set=False, nbins=None, fontsize=None, rotation=-90): if nbins is None: nbins = len(get_xaxis())-1 bins = np.percentile(x, np.linspace(0, 100, nbins+1)) np.append(bins, x.max()) if set: ax = plt.gca() x_ = np.linspace(0, 110, len(bins)+1) _ = ax.set_xticks(x_) _ = ax.set_xticklabels(['%1.2e' % f for f in bins], rotation=rotation, fontfamily='monospace', fontsize=fontsize) else: return bins, np.linspace(0, 100, len(bins)+1)
[docs]def purity_flatness_proba(x, mask, nbins=10, do_plot=False): """ Returns the probability that the purity of x[mask] and x[~mask] is flat. This can be used as a measure of the information content in this observable regarding the mask. Args: x: Observable or signal distribution mask: Boolean mask, like 'is signal', 'is background' or background distribution nbins: Number of bins to calculate the purity do_plot (bool): plot the purity distribution Returns: chi2: probability of CDF(flat) against CDF(purity), where a value of 1 corresponds to the purity being compatible with a flat distribution. """ if len(pd.unique(mask)) > 2: # if signal and background distribution are given as x and mask x, mask = mask_append(x, mask) if np.std(x)==0: return 1 pur, pure, b = purity_hist(x, mask, nbins=nbins, do_plot=do_plot) mask = (np.isfinite(pur)) & (pur != 0) p = pur[mask] pe = pure[mask] * 100 cs = np.nancumsum(p) * 100 cl = np.cumsum(np.full(len(cs), np.mean(p), )) * 100 chi2 = ((cl - cs) ** 2) / pe ** 2 try: import scipy return scipy.stats.distributions.chi2.sf(np.sqrt(np.mean(chi2)), len(chi2)) except ImportError: return chi2
[docs]def df_var_by2xy(): pass
[docs]def sig_bkg_plot(df, col, by=None, ax=None, bins=None, range=None, labels=None, normed=False): """ Args: df: col: by: ax: bins: range: labels: normed: Returns: """ # foreseen usage if isinstance(df, pd.DataFrame): # by is not a boolean index if isinstance(by, str): x, cats = to_stack(df, col, by, get_cats=True) if len(x) > 2 : print("Waring, more than two categories in %s!" % by) assert len(x) > 1, "Did not found any categories in %s!" % by x_sig = x[0] x_bkg = x[1] if len(cats) == 2: if labels is None: labels = [by+f' == {cats[1]}', by+f' == {cats[0]}'] # by is a boolean index else: x_sig = df[col][by].values x_bkg = df[col][~by].values # Alternative usage, passing two arrays else: if len(pd.unique(col)) == 2: # if signal and background distribution are given as x and mask x_sig = df[col] x_bkg = df[~col] else: x_sig = df x_bkg = col xaxis = _hist_init(np.append(x_sig, x_bkg), bins, xrange=range) if labels is None: labels = ["Background", "Signal"] hist(x_bkg, xaxis, style=0, label=labels[0], ax=ax, density=normed) hist(x_sig, xaxis, lw=2, color=0, label=labels[1], ax=ax, density=normed) plt.legend()
# xlim()
[docs]def get_upper_lim(x, perc=0.1, width=1, maxtries=100): xmax = np.max(x) xstd = width*np.std(x) if xstd == 0: return xmax lx = len(x) cut = xmax i = 0 while i<maxtries: cut -= xstd px = 100*len(x[x>cut])/lx if px>=perc: cut+=xstd break i+=1 return cut
[docs]def get_lower_lim(x, perc=0.1, width=1, maxtries=100): xmin = np.min(x) xstd = width*np.std(x) if xstd == 0: return xmin lx = len(x) cut = xmin i = 0 while i<maxtries: cut += xstd px = 100*len(x[x<cut])/lx if px>=perc: cut-=xstd break return cut
[docs]def minmax(x,perc=0.1, width=1, maxtries=100): return (get_lower_lim(x, perc, width, maxtries), get_upper_lim(x, perc, width, maxtries))
[docs]def plot_feature_importance(fi, cols, figsize=None, palette='Blues_d',ax=None, *args, **kwargs): if ax is None: plt.figure(figsize=figsize) ax = plt.gca() import seaborn as sns imp = np.argsort(fi)[::-1] dfplot= pd.DataFrame({'col':np.array(cols)[imp], 'imp':fi[imp]}) sns.barplot('imp', 'col', data=dfplot, hue_order=fi[imp][::-1], palette=palette, ax=ax, *args, **kwargs ) ax.set_xlabel("Importance",) ax.set_ylabel("Feature",) return np.array(cols)[imp]