Source code for threeML.utils.statistics.stats_tools

from math import sqrt

import numpy as np
import scipy.interpolate
import scipy.stats
from scipy.special import erfinv

from threeML.io.logging import setup_logger

# Provides some universal statistical utilities and stats comparison tools


log = setup_logger(__name__)



[docs]
def aic(log_like, n_parameters, n_data_points):
    """The Aikake information criterion.

    A model comparison tool based of infomormation theory. It assumes
    that N is large i.e., that the model is approaching the CLT.
    """
    try:
        val = -2.0 * log_like + 2 * n_parameters
        val += (
            2
            * n_parameters
            * (n_parameters + 1)
            / float(n_data_points - n_parameters - 1)
        )

    except Exception:
        val = 0

    if not np.isfinite(val):
        val = 0

        log.warning("AIC was NAN. Recording zero, but you should examine your fit.")

    return val




[docs]
def bic(log_like, n_parameters, n_data_points):
    """The Bayesian information criterion."""
    val = -2.0 * log_like + n_parameters * np.log(n_data_points)

    if not np.isfinite(val):
        val = 0

        log.warning("BIC was NAN. Recording zero, but you should examine your fit.")

    return val




[docs]
def waic(bayesian_trace):
    raise NotImplementedError("Coming soon to a theater near you.")




[docs]
def dic(bayes_analysis):
    """
    elpd_DIC = log p(y|mean(parameters)) - p_DIC

    the first term is the deviance at the mean of the posterior
    and p_DIC is the effective number of free parameters:

    p_DIC = 2(log p(y|mean(parameters)) - 1/N sum(log p(y|parameters_s), 1,N) )

    DIC = -2*elpd_DIC

    the effective number of free parameters can be negative if the mean is
    the mean is far from the mode

    :param bayes_analysis: a bayesian analysis object
    :return dic, effective number of free parameters:

    """

    mean_of_free_parameters = np.mean(bayes_analysis.raw_samples, axis=0)

    deviance_at_mean = bayes_analysis.get_posterior(mean_of_free_parameters)

    mean_deviance = np.mean(bayes_analysis.log_probability_values)

    pdic = 2 * (deviance_at_mean - mean_deviance)

    elpd_dic = deviance_at_mean - pdic

    if not np.isfinite(pdic) or not np.isfinite(elpd_dic):
        elpd_dic = 0
        pdic = 0

        log.warning("DIC was NAN. Recording zero, but you should examine your fit.")

    return -2 * elpd_dic, pdic




[docs]
def sqrt_sum_of_squares(arg):
    """
    :param arg: and array of number to be squared and summed
    :return: the sqrt of the sum of the squares
    """

    return np.sqrt(np.square(arg).sum())




[docs]
class PoissonResiduals:
    """This class implements a way to compute residuals for a Poisson
    distribution mapping them to residuals of a standard normal distribution.
    The probability of obtaining the observed counts given the expected one is
    computed, and then transformed "in unit of sigma", i.e., the sigma value
    corresponding to that probability is computed.

    The algorithm implemented here uses different branches so that it is fairly accurate
    between -36 and +36 sigma.

    NOTE: if the expected number of counts is not very high, then the Poisson
    distribution is skewed and so the probability of obtaining a downward fluctuation at
    a given sigma level is not the same as obtaining the same fluctuation in the upward
    direction. Therefore, the distribution of residuals is *not* expected to be
    symmetric in that case. The sigma level at which this effect is visible depends
    strongly on the expected number of counts. Under normal circumstances residuals are
    expected to be a few sigma at most, in which case the effect becomes important for
    expected number of counts <~ 15-20.
    """

    # Putting these here make them part of the *class*, not the instance, i.e., they are
    # created only once when the module is imported, and then are referred to by any
    # instance of the class

    # These are lookup tables for the significance from a Poisson distribution when the
    # probability is very low so that the normal computation is not possible due to
    # the finite numerical precision of the computer

    _x = np.logspace(np.log10(5), np.log10(36), 1000)
    _logy = np.log10(scipy.stats.norm.sf(_x))

    # Make the interpolator here so we do it only once. Also use ext=3 so that the
    # interpolation will return the maximum value instead of extrapolating

    _interpolator = scipy.interpolate.InterpolatedUnivariateSpline(
        _logy[::-1], _x[::-1], k=1, ext=3
    )

    def __init__(self, Non, Noff, alpha=1.0):
        assert alpha > 0 and alpha <= 1, "alpha was %f" % alpha

        self.Non = np.array(Non, dtype=float, ndmin=1)

        self.Noff = np.array(Noff, dtype=float, ndmin=1)

        self.alpha = float(alpha)

        self.expected = self.alpha * self.Noff

        self.net = self.Non - self.expected

        # This is the minimum difference between 1 and the next representable floating
        # point number
        self._epsilon = np.finfo(float).eps


[docs]
    def significance_one_side(self):
        # For the points where Non > expected, we need to use the survival function
        # sf(x) = 1 - cdf, which can go do very low numbers
        # Instead, for points where Non < expected, we need to use the cdf which allows
        # to go to very low numbers in that directions

        idx = self.Non >= self.expected

        out = np.zeros_like(self.Non)

        if np.sum(idx) > 0:
            out[idx] = self._using_sf(self.Non[idx], self.expected[idx])

        if np.sum(~idx) > 0:
            out[~idx] = self._using_cdf(self.Non[~idx], self.expected[~idx])

        return out


    def _using_sf(self, x, exp):
        sf = scipy.stats.poisson.sf(x, exp)

        # print(sf)

        # return erfinv(2 * sf) * sqrt(2)

        return scipy.stats.norm.isf(sf)

    def _using_cdf(self, x, exp):
        # Get the value of the cumulative probability function, instead of the survival
        # function (1 - cdf), because for extreme values sf(x) = 1 - cdf(x) = 1 due to
        # numerical precision problems

        cdf = scipy.stats.poisson.cdf(x, exp)

        # print(cdf)

        out = np.zeros_like(x)

        idx = cdf >= 2 * self._epsilon

        # We can do a direct computation, because the numerical precision is sufficient
        # for this computation, as -sf = cdf - 1 is a representable number

        out[idx] = erfinv(2 * cdf[idx] - 1) * sqrt(2)

        # We use a lookup table with interpolation because the numerical precision
        # would not be sufficient to make the computation

        out[~idx] = -1 * self._interpolator(np.log10(cdf[~idx]))

        return out




[docs]
class Significance:
    """Implements equations in Li&Ma 1983."""

    def __init__(self, Non, Noff, alpha=1):
        # assert alpha > 0 and alpha <= 1, "alpha was %f" % alpha

        self._Non = np.array(Non, dtype=float, ndmin=1)

        self._Noff = np.array(Noff, dtype=float, ndmin=1)

        self._alpha = float(alpha)

        self._expected = self._alpha * self._Noff

        self._net = self._Non - self._expected

    @property
    def Non(self) -> int:
        return self._Non

    @property
    def Noff(self) -> int:
        return self._Noff

    @property
    def alpha(self) -> float:
        return self._alpha

    @property
    def expected(self) -> int:
        return self._expected

    @property
    def net(self) -> int:
        return self._net


[docs]
    def known_background(self):
        """Compute the significance under the hypothesis that there is no
        uncertainty in the background. In other words, compute the probability
        of obtaining the observed counts given the expected counts from the
        background, then transform it in sigma.

        NOTE: this is reliable for expected counts >~10-15 if the significance is not
        very high. The higher the expected counts, the more reliable the significance
        estimation. As rule of thumb, you need at least 25 counts to have reliable
        estimates up to 5 sigma.

        NOTE 2: if you use to compute residuals in units of sigma, you should not
        expected them to be symmetrically distributed around 0 unless the expected
        number of counts is high enough for all bins (>~15). This is due to the fact
        that the Poisson distribution is very skewed at low counts.

        :return: significance vector
        """

        # Poisson probability of obtaining Non given Noff * alpha, in sigma units

        poisson_probability = PoissonResiduals(
            self._Non, self._Noff, self._alpha
        ).significance_one_side()

        return poisson_probability



[docs]
    def li_and_ma(self, assign_sign=True):
        """Compute the significance using the formula from Li & Ma 1983, which
        is appropriate when both background and observed signal are counts
        coming from a Poisson distribution.

        :param assign_sign: whether to assign a sign to the significance, according to
        the sign of the net counts Non - alpha * Noff, so that excesses will have
        positive significances and defects negative significances
        :return:
        """

        one = np.zeros_like(self._Non, dtype=float)

        idx = self._Non > 0

        one[idx] = self._Non[idx] * np.log(
            ((1 + self._alpha) / self._alpha)
            * (self._Non[idx] / (self._Non[idx] + self._Noff[idx]))
        )

        two = np.zeros_like(self._Noff, dtype=float)

        two[idx] = self._Noff[idx] * np.log(
            (1 + self._alpha) * (self._Noff[idx] / (self._Non[idx] + self._Noff[idx]))
        )

        if assign_sign:
            sign = np.where(self._net > 0, 1, -1)

        else:
            sign = 1

        return sign * np.sqrt(2 * (one + two))



[docs]
    def li_and_ma_equivalent_for_gaussian_background(self, sigma_b):
        """
        Compute the significance using the formula from Vianello 2018
        (https://iopscience.iop.org/article/10.3847/1538-4365/aab780/meta),
        which is appropriate when the observation is Poisson distributed but
        the background has been modeled and thus has Gaussian distributed errors.

        :param sigma_b: The gaussian 1 sigma errors on the background
        :return:

        """

        # This is a computation I need to publish (G. Vianello)

        # Actually, you did (and beat J. Michael!) For details on this computation

        b = self._expected
        o = self._Non

        b0 = 0.5 * (
            np.sqrt(b**2 - 2 * sigma_b**2 * (b - 2 * o) + sigma_b**4) + b - sigma_b**2
        )

        S = sqrt(2) * np.sqrt(
            o * np.log(o / b0) + (((b0 - b) ** 2) / (2 * sigma_b**2)) + b0 - o
        )

        sign = np.where(o > b, 1, -1)

        return sign * S



[docs]
    def gaussian_background(self, sigma_c, sigma_b):
        """
        :param sigma_b: The gaussian 1 sigma errors on the background
        :return:

        """
        return self.net / np.sqrt(sigma_c**2 + sigma_b**2)