Source code for b2plot.histogram

 # -*- coding: utf-8 -*-
"""
This file contains all the histogram-related functions.

"""


from .helpers import get_optimal_bin_size, TheManager
from .colors import b2cm
import pandas as pd
import numpy as np
from matplotlib.colors import hex2color

import matplotlib.pyplot as plt


def _hist_init(data, bins=None, xrange=None):
    """Return the bin edges to use for ``data``.

    The edges currently held by ``TheManager.Instance()`` are returned
    untouched when they exist and neither ``bins`` nor ``xrange`` is given.
    Otherwise new edges are computed with ``np.histogram``.

    Args:
        data: Values to bin; only used when new edges are computed.
        bins: Forwarded to ``np.histogram``. When ``None`` it is taken from
            ``get_optimal_bin_size(len(data))``.
        xrange: Forwarded to ``np.histogram``. The string ``'auto'`` is
            replaced by ``minmax(data)`` from the ``analysis`` module.

    Returns:
        The bin edges (second element of the ``np.histogram`` result) or the
        previously stored x axis.
    """
    stored_axis = TheManager.Instance().get_x_axis()

    # Nothing overrides the stored binning: reuse it as is.
    if stored_axis is not None and bins is None and xrange is None:
        return stored_axis

    if bins is None:
        bins = get_optimal_bin_size(len(data))
    if xrange == 'auto':
        from .analysis import minmax
        xrange = minmax(data)

    # Only the edges are of interest, the counts are discarded.
    edges = np.histogram(data, bins, xrange)[1]
    return edges


def set_xaxis(bins, flat=False):
    """Store ``bins`` as the x axis via ``TheManager.Instance()``.

    Args:
        bins: Binning handed unchanged to ``set_x_axis`` of the manager.
        flat: Accepted but not used by this function.
    """
    manager = TheManager.Instance()
    manager.set_x_axis(bins)


def get_xaxis():
    """Return the x axis currently held by ``TheManager.Instance()``."""
    manager = TheManager.Instance()
    return manager.get_x_axis()


def flat_x(x, nbins=25):
    """Set the x axis to percentile-spaced edges of ``x``.

    Args:
        x: Sample whose percentiles define the edges.
        nbins: Number of evenly spaced percentile levels between 0 and 100
            (both included) that are evaluated, i.e. the number of edges
            handed to ``set_xaxis``.
    """
    percent_levels = np.linspace(0, 100, nbins)
    edges = np.percentile(x, percent_levels)
    set_xaxis(edges)


# This needs to be changed
# Lookup tables indexed by a style number.
# `hist` reads STYLES_hatches[style] to pick the hatch pattern; index 0 is None (no hatch).
# STYLES_facecolor is not referenced anywhere else in the visible part of this module.
STYLES_facecolor = [None, 'none', 'none', 'none', 'none', 'none']
STYLES_hatches = [None, '///', r"\\\ ",  'xxx', '--', '++', 'o', ".+", 'xx', '//', '*',  'O', '.']


def hist(data, bins=None, fill=False, range=None, lw=1., ax=None, style=None, color=None, scale=None, weights=None,
         label=None, edgecolor=None, fillalpha=0.5, *args, **kwargs):
    """Draw a step or filled histogram of ``data`` with ``ax.hist``.

    The bin edges come from ``_hist_init`` and the edges returned by
    ``ax.hist`` are stored afterwards through
    ``TheManager.Instance().set_x_axis``.

    Args:
        data: Values to histogram. A ``pandas.Series`` is reduced to its
            ``.values``.
        bins: Forwarded to ``_hist_init``.
        fill: Draw a ``'stepfilled'`` histogram instead of a ``'step'`` one.
        range: Forwarded to ``_hist_init`` (as ``xrange``) and to ``ax.hist``.
        lw: Line width.
        ax: Axes to draw on; ``plt.gca()`` when ``None``.
        style: Index into ``STYLES_hatches``. Any value other than ``None``
            switches ``fill`` on; style ``0`` gives a semi-transparent solid
            face with a black edge unless ``edgecolor`` is given, other
            styles give an unfilled, hatched face.
        color: Colour of the histogram. An ``int`` selects
            ``b2cm[color % len(b2cm)]``; ``None`` takes the next colour of
            the axes' colour cycle. Anything that is not a ``list`` is
            converted with ``hex2color``.
        scale: ``int`` or ``float`` factor multiplied onto the weights.
            A ``bool`` is ignored; any other type only prints a notice.
        weights: Per-entry weights; all ones when ``None``. The array passed
            in is never modified.
        label: Legend label.
        edgecolor: Edge colour for filled histograms; defaults to ``color``.
        fillalpha: Alpha of the face colour for style ``0``.
        *args: Forwarded to ``ax.hist``.
        **kwargs: Forwarded to ``ax.hist``.

    Returns:
        The ``(y, xaxis, patches)`` tuple returned by ``ax.hist``.
    """

    if ax is None:
        ax = plt.gca()

    xaxis = _hist_init(data, bins, xrange=range)

    if isinstance(data, pd.Series):
        data = data.values

    if isinstance(color, int):
        color = b2cm[color % len(b2cm)]

    if color is None:
        # `prop_cycler` is gone from recent Matplotlib releases;
        # `get_next_color` advances the same colour cycle.
        color = ax._get_lines.get_next_color()

    # Convert everything except lists to an RGB tuple. Tuples were always
    # converted as well, because a tuple is never a list.
    if not isinstance(color, list):
        color = hex2color(color)

    if style is not None:
        fill = True
        if style == 0 and edgecolor is None:
            edgecolor = 'black'
    else:
        style = 0

    if weights is None:
        weights = np.ones(len(data))

    if scale is not None:
        if isinstance(scale, (int, float)):
            if not isinstance(scale, bool):
                # Build a new array: an in-place `*=` would silently modify
                # the caller's weights, fail for integer weights with a float
                # scale and repeat a plain list.
                weights = np.asarray(weights) * scale
        else:
            print("Please provide int or float with scale")

    edgecolor = color if edgecolor is None else edgecolor

    if fill:
        # Solid, semi-transparent face for style 0; hatch-only otherwise.
        fc = (*color, fillalpha) if style == 0 else 'none'
        # `lw` is the alias of `linewidth`, so it is passed only once.
        y, xaxis, patches = ax.hist(data, xaxis, range=range, lw=lw, histtype='stepfilled', hatch=STYLES_hatches[style],
                                    edgecolor=edgecolor, facecolor=fc, weights=weights, label=label,
                                    color=color, *args, **kwargs)
    else:
        y, xaxis, patches = ax.hist(data, xaxis, range=range, histtype='step', lw=lw, color=color, weights=weights,
                                    label=label, *args, **kwargs)

    TheManager.Instance().set_x_axis(xaxis)
    return y, xaxis, patches


def _notransform(x):
    return x


def to_stack(df, col, by, transform=None, get_cats=False):
    """Split column ``col`` of ``df`` into one array per group of ``by``.

    The per-group arrays are ordered by ascending length.

    Args:
        df: DataFrame to split.
        col: Name of the column whose values are collected.
        by: Grouping key handed to ``df.groupby``.
        transform: Optional callable applied to each group's values;
            the values are used as they are when ``None``.
        get_cats: Also return the group keys, in the same order as the data.

    Returns:
        A list of per-group values, or ``(list, keys)`` when ``get_cats``
        is true, where ``keys`` is a numpy array of the group keys.
    """
    grouped = df.groupby(by)
    keys = list(grouped.groups)

    if transform is None:
        per_group = [grouped.get_group(key)[col].values for key in keys]
    else:
        per_group = [transform(grouped.get_group(key)[col].values) for key in keys]

    categories = np.array(keys)
    # Shortest group first.
    order = np.array([len(values) for values in per_group]).argsort()
    ordered = [per_group[idx] for idx in order]

    if not get_cats:
        return ordered
    return ordered, categories[order]


def stacked(df, col=None, by=None, bins=None, color=None, range=None, lw=.5, ax=None, edgecolor='black', weights=None,
            scale=None, label=None, transform=None, *args, **kwargs):
    """Create a stacked, filled histogram.

    Args:
        df (DataFrame or list of arrays): Either a DataFrame that is split
            with ``to_stack(df, col, by, transform, get_cats=True)``, or a
            list holding one array per stack.
        col: Column to histogram (required for DataFrame input).
        by: Column to group by (required for DataFrame input).
        bins: Forwarded to ``_hist_init``.
        color: Colours of the stacks. When ``None`` and there are fewer than
            20 stacks, ``b2helix(n_stacks)`` is used.
        range: Forwarded to ``_hist_init`` as ``xrange``.
        lw: Line width.
        ax: Axes to draw on; ``plt.gca()`` when ``None``.
        edgecolor: Edge colour of the stacks.
        weights: List with one weight array per stack. When given it is used
            as is and ``scale`` is not applied.
        scale: Only used when ``weights`` is ``None``. An ``int``/``float``
            scales every stack; a ``dict`` maps each group key to its factor
            (DataFrame input only). A ``bool`` is ignored; any other type
            only prints a notice.
        label: Legend labels; defaults to the group keys for DataFrame input.
        transform: Forwarded to ``to_stack`` for DataFrame input.
        *args: Forwarded to ``ax.hist``.
        **kwargs: Forwarded to ``ax.hist``.

    Returns:
        ``(y, xaxis, patches)`` where ``y`` holds the bin contents of the
        topmost (cumulative) stack.

    Raises:
        ValueError: If ``scale`` is a dict but ``df`` is a list, so there are
            no group keys to look the factors up with.
    """

    if isinstance(df, pd.DataFrame):
        assert col is not None, "Please provide column"
        assert by is not None, "Please provide by"

        data, cats = to_stack(df, col, by, transform, get_cats=True)
        if label is None:
            label = cats

    else:
        assert isinstance(df, list), "Please provide DataFrame or List"
        data = df
        # List input carries no group keys.
        cats = None

    if ax is None:
        ax = plt.gca()

    if color is None:
        # Relative import, consistent with the module-level imports.
        from .colors import b2helix
        n_stacks = len(data)
        if n_stacks < 20:
            color = b2helix(n_stacks)

    if weights is None:
        weights = []
        for i, d in enumerate(data):
            wei = np.ones(len(d))
            if scale is None or isinstance(scale, bool):
                pass
            elif isinstance(scale, (int, float)):
                wei *= scale
            elif isinstance(scale, dict):
                if cats is None:
                    raise ValueError("A dict scale requires DataFrame input with group keys")
                assert cats[i] in scale, "Scale dict must contain an entry for every category"
                wei *= scale[cats[i]]
            else:
                print("Please provide int or float with scale")
            weights.append(wei)

    # NOTE(review): the binning is derived from the first stack only; values
    # of the other stacks outside its range may not be covered - confirm
    # that this is intended.
    xaxis = _hist_init(data[0], bins, xrange=range)

    y, xaxis, stuff = ax.hist(data, xaxis, histtype='stepfilled',
                              lw=lw, color=color, edgecolor=edgecolor, stacked=True, weights=weights, label=label,
                              *args, **kwargs)

    TheManager.Instance().set_x_axis(xaxis)

    # With a single dataset `ax.hist` returns a 1-D array, so `y[-1]` would be
    # just the last bin; only index when there is one row per stack.
    y = np.asarray(y)
    top = y[-1] if y.ndim > 1 else y
    return top, xaxis, stuff


def errorhist(data, bins=None, color=None, normed=False, density=False, fmt='.', range=None, scale=None,
              x_err=False, box=False, ax=None, weights=None, plot_zero=True, label=None, *args, **kwargs):
    """Histogram ``data`` and draw the bin contents as points with error bars.

    The bin edges come from ``_hist_init``, the contents from
    ``np.histogram`` and the drawing is delegated to ``errorbar``. The edges
    are stored afterwards through ``TheManager.Instance().set_x_axis``.

    Args:
        data: Values to histogram. A ``pandas.Series`` is reduced to its
            ``.values``.
        bins: Forwarded to ``_hist_init``.
        color: Marker colour. An ``int`` selects ``b2cm[color % len(b2cm)]``;
            ``None`` takes the next colour of the axes' colour cycle.
        normed: Deprecated spelling of ``density``; a true value prints a
            notice and switches ``density`` on.
        density: Forwarded to ``np.histogram``.
        fmt: Marker format forwarded to ``errorbar``.
        range: Forwarded to ``_hist_init`` as ``xrange``.
        scale: ``int`` or ``float`` factor multiplied onto the weights.
            A ``bool`` is ignored; any other type only prints a notice.
        x_err: Anything other than ``False`` draws horizontal error bars of
            half a bin width.
        box: Forwarded to ``errorbar``; also enables the horizontal errors.
        ax: Axes to draw on; ``plt.gca()`` when ``None``.
        weights: Per-entry weights; all ones when ``None``. The array passed
            in is never modified.
        plot_zero: Forwarded to ``errorbar``.
        label: Legend label.
        *args: Forwarded to ``errorbar``.
        **kwargs: Forwarded to ``errorbar``.

    Returns:
        ``(y, bin_centers, err)`` with ``err`` a ``(lower, upper)`` tuple.
    """

    xaxis = _hist_init(data, bins, xrange=range)

    if ax is None:
        ax = plt.gca()

    if isinstance(data, pd.Series):
        data = data.values

    if weights is None:
        weights = np.ones(len(data))

    if scale is not None:
        if isinstance(scale, (int, float)):
            if not isinstance(scale, bool):
                # Build a new array instead of `*=` so that the caller's
                # weights are left untouched.
                weights = np.asarray(weights) * scale
        else:
            print("Please provide int or float with scale")
    else:
        scale = 1

    if normed:
        print('normed is deprecated and changed by density. Your call has been changed to density=True automatically.')
        density = True

    y, x = np.histogram(data, xaxis, density=density, weights=weights)

    # https://www-cdf.fnal.gov/physics/statistics
    # NOTE(review): `y` already contains `scale` through the weights and is
    # multiplied by `scale` once more here - confirm that this is intended.
    root = np.sqrt(np.array(y * scale + 0.25))
    err = (-0.5 + root, +0.5 + root)
    bin_centers = (x[1:] + x[:-1]) / 2.0

    if isinstance(color, int):
        color = b2cm[color % len(b2cm)]

    if color is None:
        # `prop_cycler` is gone from recent Matplotlib releases;
        # `get_next_color` advances the same colour cycle.
        color = ax._get_lines.get_next_color()

    if density:
        # Scale the sqrt(N) error of the raw contents by the same factor that
        # turns the raw contents into the density.
        yom, x = np.histogram(data, xaxis, weights=weights)
        sym = np.sqrt(np.array(yom)) * (y / yom)
        err = (sym, sym)

    if x_err is not False or box:
        # Half-width of every bin, so that non-uniform binnings are drawn
        # correctly and the array can be masked together with the points.
        x_err = np.diff(x) / 2.0
    else:
        x_err = None

    errorbar(bin_centers, y, err, x_err, box, plot_zero, fmt, color, ax, label=label, *args, **kwargs)

    TheManager.Instance().set_x_axis(xaxis)

    return y, bin_centers, err


def errorbar(bin_centers, y, y_err, x_err=None, box=False, plot_zero=True, fmt='.',
             color=None, ax=None, label=None, alpha=0.4, hatch=None, *args, **kwargs):
    """Plot x-y points with error bars, or with error boxes.

    Args:
        bin_centers: x positions of the points.
        y: y values of the points.
        y_err: Either one sequence of symmetric errors or a pair
            ``(lower, upper)``. Anything whose ``len`` is not 2 is treated
            as symmetric.
        x_err: Horizontal error, scalar or one value per point. Required
            when ``box`` is true.
        box: Draw the vertical errors as bars (``ax.bar``) spanning
            ``y - lower`` to ``y + upper`` instead of error-bar lines.
        plot_zero: When ``False``, points with ``y == 0`` are dropped.
        fmt: Marker format for the non-box drawing.
        color: Colour; the next colour of the axes' cycle when ``None``.
        ax: Axes to draw on; ``plt.gca()`` when ``None``.
        label: Legend label.
        alpha: Alpha of the boxes.
        hatch: Hatch pattern of the boxes.
        *args: Forwarded to ``ax.bar`` (box) or ``ax.errorbar``.
        **kwargs: Forwarded to ``ax.bar`` (box) or ``ax.errorbar``.
    """

    if ax is None:
        ax = plt.gca()

    # Arrays are needed for the comparison with zero and the masking below.
    bin_centers = np.asarray(bin_centers)
    y = np.asarray(y)

    if len(y_err) != 2:
        y_err = y_err, y_err

    if color is None:
        # `prop_cycler` is gone from recent Matplotlib releases;
        # `get_next_color` advances the same colour cycle.
        color = ax._get_lines.get_next_color()

    if plot_zero is False:
        keep = y != 0

        def _masked(values):
            # Scalars apply to every point and are passed through unchanged.
            arr = np.asarray(values)
            return arr[keep] if arr.ndim > 0 else values

        # Index with the mask itself; wrapping it in a list is read as a
        # 2-D index by NumPy and raises an IndexError.
        y_err = (_masked(y_err[0]), _masked(y_err[1]))
        if x_err is not None:
            x_err = _masked(x_err)
        bin_centers = bin_centers[keep]
        y = y[keep]

    if box:
        assert x_err is not None, "Please provide x-err"
        hi = y_err[0] + y_err[1]
        lo = y - y_err[0]
        ax.errorbar(bin_centers, y, color=color, xerr=x_err, fmt=' ')
        # `bin_centers` is already filtered above, it must not be masked again.
        ax.bar(bin_centers, hi, bottom=lo, align='center', color=color, alpha=alpha,
               width=2 * x_err, label=label,
               edgecolor=color, hatch=hatch, *args, **kwargs)
    else:
        ax.errorbar(bin_centers, y, yerr=y_err, xerr=x_err, fmt=fmt, color=color, label=label, *args, **kwargs)


def bar(y, binedges, ax=None, *args, **kwargs):
    """Draw already-binned contents ``y`` as a histogram.

    One entry is placed at the centre of every bin and weighted with the
    matching value of ``y``, then drawn with ``ax.hist``.

    Args:
        y: Bin contents, one value per bin.
        binedges: Bin edges (one more than ``y``); any sequence accepted by
            ``np.asarray``.
        ax: Axes to draw on; ``plt.gca()`` when ``None``.
        *args: Forwarded to ``ax.hist``.
        **kwargs: Forwarded to ``ax.hist``.

    Returns:
        Whatever ``ax.hist`` returns.
    """

    if ax is None:
        ax = plt.gca()

    # Plain lists would be concatenated by `+` instead of added element-wise.
    edges = np.asarray(binedges)
    centers = (edges[1:] + edges[:-1]) / 2.0

    return ax.hist(centers, bins=edges, weights=y, *args, **kwargs)


def profile(x, y, bins=None, range=None, fmt='.', *args, **kwargs):
    """Profile plot of ``y`` versus ``x``.

    For every bin in ``x`` the mean of ``y`` is drawn as a point, with the
    standard error of the mean (``scipy.stats.sem``) as its error bar.

    Args:
        x: Values that are binned.
        y: Values that are averaged within each bin of ``x``.
        bins: Forwarded to ``_hist_init``.
        range: Forwarded to ``_hist_init`` as ``xrange``.
        fmt: Marker format forwarded to ``plt.errorbar``.
        *args: Forwarded to ``plt.errorbar``.
        **kwargs: Forwarded to ``plt.errorbar``.

    Returns:
        Whatever ``plt.errorbar`` returns.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` is loaded.
    from scipy import stats

    xaxis = _hist_init(x, bins, xrange=range)

    means = stats.binned_statistic(x, y, bins=xaxis, statistic='mean').statistic
    sem = stats.binned_statistic(x, y, bins=xaxis, statistic=stats.sem).statistic

    bin_centers = (xaxis[:-1] + xaxis[1:]) / 2.
    return plt.errorbar(x=bin_centers, y=means, yerr=sem, linestyle='none', fmt=fmt, *args, **kwargs)