# Source code for threeML.utils.statistics.stats_tools

```from __future__ import division

# Provides some universal statistical utilities and stats comparison tools

from past.utils import old_div
from builtins import object
from math import sqrt

import numpy as np
import pandas as pd
import scipy.interpolate
import scipy.stats
import warnings
from scipy.special import erfinv

from threeML.io.rich_display import display

[docs]def aic(log_like, n_parameters, n_data_points):
"""
The Aikake information criterion.
A model comparison tool based of infomormation theory. It assumes that N is large i.e.,
that the model is approaching the CLT.

"""
try:
val = -2.0 * log_like + 2 * n_parameters
val += (
2 * n_parameters * (n_parameters + 1) / float(n_data_points - n_parameters - 1)
)

except:

val = 0

if not np.isfinite(val):
val = 0

warnings.warn("AIC was NAN. Recording zero, but you should examine your fit.")

return val

[docs]def bic(log_like, n_parameters, n_data_points):
"""
The Bayesian information criterion.
"""
val = -2.0 * log_like + n_parameters * np.log(n_data_points)

if not np.isfinite(val):
val = 0

warnings.warn("BIC was NAN. Recording zero, but you should examine your fit.")

return val

[docs]def waic(bayesian_trace):
raise NotImplementedError("Coming soon to a theater near you.")

[docs]def dic(bayes_analysis):
"""
elpd_DIC = log p(y|mean(parameters)) - p_DIC

the first term is the deviance at the mean of the posterior
and p_DIC is the effective number of free parameters:

p_DIC = 2(log p(y|mean(parameters)) - 1/N sum(log p(y|parameters_s), 1,N) )

DIC = -2*elpd_DIC

the effective number of free parameters can be negative if the mean is
the mean is far from the mode

:param bayes_analysis: a bayesian analysis object
:return dic, effective number of free parameters:

"""

mean_of_free_parameters = np.mean(bayes_analysis.raw_samples, axis=0)

deviance_at_mean = bayes_analysis.get_posterior(mean_of_free_parameters)

mean_deviance = np.mean(bayes_analysis.log_probability_values)

pdic = 2 * (deviance_at_mean - mean_deviance)

elpd_dic = deviance_at_mean - pdic

if not np.isfinite(pdic) or not np.isfinite(elpd_dic):

elpd_dic = 0
pdic = 0

warnings.warn("DIC was NAN. Recording zero, but you should examine your fit.")

return -2 * elpd_dic, pdic

[docs]def sqrt_sum_of_squares(arg):
"""
:param arg: and array of number to be squared and summed
:return: the sqrt of the sum of the squares
"""

return np.sqrt(np.square(arg).sum())

[docs]class PoissonResiduals(object):
"""
This class implements a way to compute residuals for a Poisson distribution mapping them to residuals of a standard
normal distribution. The probability of obtaining the observed counts given the expected one is computed, and then
transformed "in unit of sigma", i.e., the sigma value corresponding to that probability is computed.

The algorithm implemented here uses different branches so that it is fairly accurate between -36 and +36 sigma.

NOTE: if the expected number of counts is not very high, then the Poisson distribution is skewed and so the
probability of obtaining a downward fluctuation at a given sigma level is not the same as obtaining the same
fluctuation in the upward direction. Therefore, the distribution of residuals is *not* expected to be symmetric
in that case. The sigma level at which this effect is visible depends strongly on the expected number of counts.
Under normal circumstances residuals are expected to be a few sigma at most, in which case the effect becomes
important for expected number of counts <~ 15-20.

"""

# Putting these here make them part of the *class*, not the instance, i.e., they are created
# only once when the module is imported, and then are referred to by any instance of the class

# These are lookup tables for the significance from a Poisson distribution when the
# probability is very low so that the normal computation is not possible due to
# the finite numerical precision of the computer

_x = np.logspace(np.log10(5), np.log10(36), 1000)
_logy = np.log10(scipy.stats.norm.sf(_x))

# Make the interpolator here so we do it only once. Also use ext=3 so that the interpolation
# will return the maximum value instead of extrapolating

_interpolator = scipy.interpolate.InterpolatedUnivariateSpline(
_logy[::-1], _x[::-1], k=1, ext=3
)

def __init__(self, Non, Noff, alpha=1.0):

assert alpha > 0 and alpha <= 1, "alpha was %f" % alpha

self.Non = np.array(Non, dtype=float, ndmin=1)

self.Noff = np.array(Noff, dtype=float, ndmin=1)

self.alpha = float(alpha)

self.expected = self.alpha * self.Noff

self.net = self.Non - self.expected

# This is the minimum difference between 1 and the next representable floating point number
self._epsilon = np.finfo(float).eps

[docs]    def significance_one_side(self):

# For the points where Non > expected, we need to use the survival function
# sf(x) = 1 - cdf, which can go do very low numbers
# Instead, for points where Non < expected, we need to use the cdf which allows
# to go to very low numbers in that directions

idx = self.Non >= self.expected

out = np.zeros_like(self.Non)

if np.sum(idx) > 0:
out[idx] = self._using_sf(self.Non[idx], self.expected[idx])

if np.sum(~idx) > 0:
out[~idx] = self._using_cdf(self.Non[~idx], self.expected[~idx])

return out

def _using_sf(self, x, exp):

sf = scipy.stats.poisson.sf(x, exp)

# print(sf)

# return erfinv(2 * sf) * sqrt(2)

return scipy.stats.norm.isf(sf)

def _using_cdf(self, x, exp):

# Get the value of the cumulative probability function, instead of the survival function (1 - cdf),
# because for extreme values sf(x) = 1 - cdf(x) = 1 due to numerical precision problems

cdf = scipy.stats.poisson.cdf(x, exp)

# print(cdf)

out = np.zeros_like(x)

idx = cdf >= 2 * self._epsilon

# We can do a direct computation, because the numerical precision is sufficient
# for this computation, as -sf = cdf - 1 is a representable number

out[idx] = erfinv(2 * cdf[idx] - 1) * sqrt(2)

# We use a lookup table with interpolation because the numerical precision would not
# be sufficient to make the computation

out[~idx] = -1 * self._interpolator(np.log10(cdf[~idx]))

return out

[docs]class Significance(object):
"""
Implements equations in Li&Ma 1983

"""

def __init__(self, Non, Noff, alpha=1):

assert alpha > 0 and alpha <= 1, "alpha was %f" % alpha

self.Non = np.array(Non, dtype=float, ndmin=1)

self.Noff = np.array(Noff, dtype=float, ndmin=1)

self.alpha = float(alpha)

self.expected = self.alpha * self.Noff

self.net = self.Non - self.expected

[docs]    def known_background(self):
"""
Compute the significance under the hypothesis that there is no uncertainty in the background. In other words,
compute the probability of obtaining the observed counts given the expected counts from the background, then
transform it in sigma.

NOTE: this is reliable for expected counts >~10-15 if the significance is not very high. The higher the
expected counts, the more reliable the significance estimation. As rule of thumb, you need at least 25 counts
to have reliable estimates up to 5 sigma.

NOTE 2: if you use to compute residuals in units of sigma, you should not expected them to be symmetrically
distributed around 0 unless the expected number of counts is high enough for all bins (>~15). This is due to
the fact that the Poisson distribution is very skewed at low counts.

:return: significance vector
"""

# Poisson probability of obtaining Non given Noff * alpha, in sigma units

poisson_probability = PoissonResiduals(
self.Non, self.Noff, self.alpha
).significance_one_side()

return poisson_probability

[docs]    def li_and_ma(self, assign_sign=True):
"""
Compute the significance using the formula from Li & Ma 1983, which is appropriate when both background and
observed signal are counts coming from a Poisson distribution.

:param assign_sign: whether to assign a sign to the significance, according to the sign of the net counts
Non - alpha * Noff, so that excesses will have positive significances and defects negative significances
:return:
"""

one = np.zeros_like(self.Non, dtype=float)

idx = self.Non > 0

one[idx] = self.Non[idx] * np.log(
old_div((1 + self.alpha), self.alpha)
* (old_div(self.Non[idx], (self.Non[idx] + self.Noff[idx])))
)

two = np.zeros_like(self.Noff, dtype=float)

two[idx] = self.Noff[idx] * np.log(
(1 + self.alpha)
* (old_div(self.Noff[idx], (self.Non[idx] + self.Noff[idx])))
)

if assign_sign:

sign = np.where(self.net > 0, 1, -1)

else:

sign = 1

return sign * np.sqrt(2 * (one + two))

[docs]    def li_and_ma_equivalent_for_gaussian_background(self, sigma_b):
"""
Compute the significance using the formula from Vianello 2018
(https://iopscience.iop.org/article/10.3847/1538-4365/aab780/meta),
which is appropriate when the observation is Poisson distributed but
the background has been modeled and thus has Gaussian distributed errors.

:param sigma_b: The gaussian 1 sigma errors on the background
:return:

"""

# This is a computation I need to publish (G. Vianello)

# Actually, you did (and beat J. Michael!) For details on this computation

b = self.expected
o = self.Non

b0 = 0.5 * (
np.sqrt(b ** 2 - 2 * sigma_b ** 2 * (b - 2 * o) + sigma_b ** 4)
+ b
- sigma_b ** 2
)

S = sqrt(2) * np.sqrt(
o * np.log(old_div(o, b0))
+ old_div((b0 - b) ** 2, (2 * sigma_b ** 2))
+ b0
- o
)

sign = np.where(o > b, 1, -1)

return sign * S
```