""" The module contains functions for calculating the
Kullback-Leibler divergence.
"""
import numpy
from scipy import stats
def check_hist_params(samples, hist_min, hist_max, hist_bins):
""" Checks that the bound values given for the histogram are consistent,
returning the range if they are or raising an error if they are not.
Also checks that if hist_bins is a str, it corresponds to a method
available in numpy.histogram
Parameters
----------
samples : numpy.array
Set of samples to get the min/max if only one of the bounds is given.
hist_min : numpy.float64
Minimum value for the histogram.
hist_max : numpy.float64
Maximum value for the histogram.
hist_bins : int or str
If int, number of equal-width bins to use in numpy.histogram. If str,
it should be one of the methods to calculate the optimal bin width
available in numpy.histogram: ['auto', 'fd', 'doane', 'scott', 'stone',
'rice', 'sturges', 'sqrt']. Default is 'fd' (Freedman Diaconis
Estimator).
Returns
-------
hist_range : tuple or None
The bounds (hist_min, hist_max), or None if neither bound was given.
hist_bins : int or str
Number of bins or method for optimal width bin calculation.
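Examples
--------
A minimal usage sketch; the sample array and the lower bound below are
illustrative only:

>>> import numpy
>>> samples = numpy.linspace(0., 10., num=100)
>>> hist_range, hist_bins = check_hist_params(samples, 1.0, None, None)
>>> hist_bins
'fd'
>>> hist_range[0]
1.0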
"""
hist_methods = ['auto', 'fd', 'doane', 'scott', 'stone', 'rice',
'sturges', 'sqrt']
if not hist_bins:
hist_bins = 'fd'
elif isinstance(hist_bins, str) and hist_bins not in hist_methods:
raise ValueError('Method for calculating bin width must be one of'
' {}'.format(hist_methods))
# No bounds given, return None
if hist_min is None and hist_max is None:
return None, hist_bins
# One of the bounds is missing
if hist_min is not None and hist_max is None:
hist_max = samples.max()
elif hist_max is not None and hist_min is None:
hist_min = samples.min()
# Both bounds given
elif hist_min >= hist_max:
raise ValueError('hist_min must be lower than hist_max.')
hist_range = (hist_min, hist_max)
return hist_range, hist_bins
def compute_pdf(samples, method, bins, hist_min, hist_max):
""" Computes the probability density function for a set of samples.
Parameters
----------
samples : numpy.array
Set of samples to calculate the pdf.
method : str
Method to calculate the pdf. Options are 'kde' for the Kernel Density
Estimator, and 'hist' to use numpy.histogram.
bins : str or int, optional
This option will be ignored if method is `kde`.
If int, number of equal-width bins to use when calculating probability
density function from a set of samples of the distribution. If str, it
should be one of the methods to calculate the optimal bin width
available in numpy.histogram: ['auto', 'fd', 'doane', 'scott', 'stone',
'rice', 'sturges', 'sqrt']. Default is 'fd' (Freedman Diaconis
Estimator).
hist_min : numpy.float64, optional
Minimum of the distributions' values to use. This will be ignored if
method is 'kde'.
hist_max : numpy.float64, optional
Maximum of the distributions' values to use. This will be ignored if
method is 'kde'.
Returns
-------
pdf : numpy.array
Discrete probability distribution calculated from samples.
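Examples
--------
A minimal usage sketch; the sample array, bin count, and bounds below
are illustrative only:

>>> import numpy
>>> rng = numpy.random.default_rng(42)
>>> samples = rng.normal(loc=0., scale=1., size=5000)
>>> pdf = compute_pdf(samples, 'hist', 50, -5., 5.)
>>> pdf.shape
(50,)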
"""
if method == 'kde':
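# Build a Gaussian KDE from the samples and evaluate the density at
# points resampled from the KDE itself (at least 10000 points)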
samples_kde = stats.gaussian_kde(samples)
npts = 10000 if len(samples) <= 10000 else len(samples)
draw = samples_kde.resample(npts)
pdf = samples_kde.evaluate(draw)
elif method == 'hist':
hist_range, hist_bins = check_hist_params(samples, hist_min,
hist_max, bins)
pdf, _ = numpy.histogram(samples, bins=hist_bins,
range=hist_range, density=True)
else:
raise ValueError("Method for calculating the pdf must be 'kde' or 'hist'.")
return pdf
def entropy(pdf1, base=numpy.e):
""" Computes the information entropy for a single parameter
from one probability density function.
Parameters
----------
pdf1 : numpy.array
Probability density function.
base : numpy.float64, optional
The logarithmic base to use (choose base 2 for information measured
in bits, default is nats).
Returns
-------
numpy.float64
The information entropy value.
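Examples
--------
A minimal usage sketch; the pdf below is illustrative only.
scipy.stats.entropy normalizes its input, so a uniform pdf over four
bins has an entropy of two bits:

>>> import numpy
>>> pdf = numpy.array([0.25, 0.25, 0.25, 0.25])
>>> round(float(entropy(pdf, base=2)), 6)
2.0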
"""
return stats.entropy(pdf1, base=base)
def kl(samples1, samples2, pdf1=False, pdf2=False, kde=False,
bins=None, hist_min=None, hist_max=None, base=numpy.e):
""" Computes the Kullback-Leibler divergence for a single parameter
from two distributions.
Parameters
----------
samples1 : numpy.array
Samples or probability density function (for the latter must also set
`pdf1=True`).
samples2 : numpy.array
Samples or probability density function (for the latter must also set
`pdf2=True`).
pdf1 : bool
Set to `True` if `samples1` is already a probability density function.
pdf2 : bool
Set to `True` if `samples2` is already a probability density function.
kde : bool
Set to `True` to estimate the probability density function using
kernel density estimation (KDE). This can only be used if at least one
of `pdf1` or `pdf2` is `False`.
bins : int or str, optional
If int, number of equal-width bins to use when calculating probability
density function from a set of samples of the distribution. If str, it
should be one of the methods to calculate the optimal bin width
available in numpy.histogram: ['auto', 'fd', 'doane', 'scott', 'stone',
'rice', 'sturges', 'sqrt']. Default is 'fd' (Freedman Diaconis
Estimator). This option will be ignored if `kde=True`.
hist_min : numpy.float64, optional
Minimum of the distributions' values to use. This will be ignored if
`kde=True`.
hist_max : numpy.float64, optional
Maximum of the distributions' values to use. This will be ignored if
`kde=True`.
base : numpy.float64, optional
The logarithmic base to use (choose base 2 for information measured
in bits, default is nats).
Returns
-------
numpy.float64
The Kullback-Leibler divergence value.
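Examples
--------
A minimal usage sketch; the sample arrays, bin count, and bounds below
are illustrative only. Explicit `bins`, `hist_min`, and `hist_max`
values are passed so that both histograms share the same bin edges:

>>> import numpy
>>> rng = numpy.random.default_rng(0)
>>> samples1 = rng.normal(loc=0., scale=1., size=10000)
>>> samples2 = rng.normal(loc=0.5, scale=1., size=10000)
>>> div = kl(samples1, samples2, bins=40, hist_min=-5., hist_max=5.)
>>> bool(div >= 0.)
True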
"""
if pdf1 and pdf2 and kde:
raise ValueError('KDE can only be used when at least one of pdf1 or '
'pdf2 is False.')
sample_groups = {'P': (samples1, pdf1), 'Q': (samples2, pdf2)}
pdfs = {}
for n in sample_groups:
samples, pdf = sample_groups[n]
if pdf:
pdfs[n] = samples
else:
method = 'kde' if kde else 'hist'
pdfs[n] = compute_pdf(samples, method, bins, hist_min, hist_max)
return stats.entropy(pdfs['P'], qk=pdfs['Q'], base=base)
def js(samples1, samples2, kde=False, bins=None, hist_min=None, hist_max=None,
base=numpy.e):
""" Computes the Jensen-Shannon divergence for a single parameter
from two distributions.
Parameters
----------
samples1 : numpy.array
Samples.
samples2 : numpy.array
Samples.
kde : bool
Set to `True` to estimate the probability density function using
kernel density estimation (KDE).
bins : int or str, optional
If int, number of equal-width bins to use when calculating probability
density function from a set of samples of the distribution. If str, it
should be one of the methods to calculate the optimal bin width
available in numpy.histogram: ['auto', 'fd', 'doane', 'scott', 'stone',
'rice', 'sturges', 'sqrt']. Default is 'fd' (Freedman Diaconis
Estimator). This option will be ignored if `kde=True`.
hist_min : numpy.float64, optional
Minimum of the distributions' values to use. This will be ignored if
`kde=True`.
hist_max : numpy.float64, optional
Maximum of the distributions' values to use. This will be ignored if
`kde=True`.
base : numpy.float64, optional
The logarithmic base to use (choose base 2 for information measured
in bits, default is nats).
Returns
-------
numpy.float64
The Jensen-Shannon divergence value.
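Examples
--------
A minimal usage sketch; the sample arrays, bin count, and bounds below
are illustrative only. With `base=2` the Jensen-Shannon divergence is
bounded between 0 and 1:

>>> import numpy
>>> rng = numpy.random.default_rng(1)
>>> samples1 = rng.normal(loc=0., scale=1., size=10000)
>>> samples2 = rng.normal(loc=1., scale=1., size=10000)
>>> div = js(samples1, samples2, bins=40, hist_min=-6., hist_max=6., base=2)
>>> bool(0. <= div <= 1.)
True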
"""
sample_groups = {'P': samples1, 'Q': samples2}
pdfs = {}
for n in sample_groups:
samples = sample_groups[n]
method = 'kde' if kde else 'hist'
pdfs[n] = compute_pdf(samples, method, bins, hist_min, hist_max)
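# The mixture distribution M is the average of the two estimated pdfs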
pdfs['M'] = (1./2) * (pdfs['P'] + pdfs['Q'])
js_div = 0
for pdf in (pdfs['P'], pdfs['Q']):
js_div += (1./2) * kl(pdf, pdfs['M'], pdf1=True, pdf2=True, base=base)
return js_div