Source code for threeML.utils.statistics.stats_tools

from math import sqrt

import numpy as np
import scipy.interpolate
import scipy.stats
from scipy.special import erfinv
from threeML.io.logging import setup_logger

# Provides some universal statistical utilities and stats comparison tools


log = setup_logger(__name__)


def aic(log_like, n_parameters, n_data_points):
    """
    The Akaike information criterion.
    A model comparison tool based on information theory. It assumes that N is large,
    i.e., that the asymptotic (CLT) regime applies.
    """

    try:

        val = -2.0 * log_like + 2 * n_parameters
        val += (
            2 * n_parameters * (n_parameters + 1)
            / float(n_data_points - n_parameters - 1)
        )

    except ZeroDivisionError:

        val = 0

    if not np.isfinite(val):

        val = 0

        log.warning("AIC was NaN. Recording zero, but you should examine your fit.")

    return val
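# Example (illustrative sketch, not part of the module): comparing two fits with AIC.
# The log-likelihood values below are made up for illustration; the model with the
# smaller AIC is preferred.
#
#   >>> aic(log_like=-520.3, n_parameters=3, n_data_points=1000)
#   >>> aic(log_like=-518.9, n_parameters=5, n_data_points=1000)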
def bic(log_like, n_parameters, n_data_points):
    """
    The Bayesian information criterion.
    """

    val = -2.0 * log_like + n_parameters * np.log(n_data_points)

    if not np.isfinite(val):

        val = 0

        log.warning("BIC was NaN. Recording zero, but you should examine your fit.")

    return val
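# Example (illustrative sketch): BIC for the same made-up fit as above. For large
# n_data_points, BIC penalizes extra parameters more strongly than AIC.
#
#   >>> bic(log_like=-520.3, n_parameters=3, n_data_points=1000)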
def waic(bayesian_trace):

    raise NotImplementedError("Coming soon to a theater near you.")
def dic(bayes_analysis):
    """
    The deviance information criterion.

    elpd_DIC = log p(y|mean(parameters)) - p_DIC

    The first term is the deviance at the mean of the posterior, and p_DIC is the
    effective number of free parameters:

    p_DIC = 2 * (log p(y|mean(parameters)) - 1/N * sum(log p(y|parameters_s), s=1..N))

    DIC = -2 * elpd_DIC

    Note that the effective number of free parameters can be negative if the mean of
    the posterior is far from the mode.

    :param bayes_analysis: a bayesian analysis object
    :return dic, effective number of free parameters:
    """

    mean_of_free_parameters = np.mean(bayes_analysis.raw_samples, axis=0)

    deviance_at_mean = bayes_analysis.get_posterior(mean_of_free_parameters)

    mean_deviance = np.mean(bayes_analysis.log_probability_values)

    pdic = 2 * (deviance_at_mean - mean_deviance)

    elpd_dic = deviance_at_mean - pdic

    if not np.isfinite(pdic) or not np.isfinite(elpd_dic):

        elpd_dic = 0
        pdic = 0

        log.warning("DIC was NaN. Recording zero, but you should examine your fit.")

    return -2 * elpd_dic, pdic
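# Example (illustrative sketch): `bayes` below stands for an already-sampled Bayesian
# analysis object (a hypothetical variable, not defined in this module).
#
#   >>> dic_value, p_dic = dic(bayes)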
def sqrt_sum_of_squares(arg):
    """
    :param arg: an array of numbers to be squared and summed
    :return: the sqrt of the sum of the squares
    """

    return np.sqrt(np.square(arg).sum())
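# Example (illustrative sketch): combining independent 1-sigma errors in quadrature.
#
#   >>> sqrt_sum_of_squares([3.0, 4.0])   # -> 5.0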
class PoissonResiduals:
    """
    This class implements a way to compute residuals for a Poisson distribution,
    mapping them to residuals of a standard normal distribution. The probability of
    obtaining the observed counts given the expected one is computed, and then
    transformed "in units of sigma", i.e., the sigma value corresponding to that
    probability is computed.

    The algorithm implemented here uses different branches so that it is fairly
    accurate between -36 and +36 sigma.

    NOTE: if the expected number of counts is not very high, then the Poisson
    distribution is skewed and so the probability of obtaining a downward fluctuation
    at a given sigma level is not the same as obtaining the same fluctuation in the
    upward direction. Therefore, the distribution of residuals is *not* expected to be
    symmetric in that case. The sigma level at which this effect is visible depends
    strongly on the expected number of counts. Under normal circumstances residuals
    are expected to be a few sigma at most, in which case the effect becomes important
    for an expected number of counts <~ 15-20.
    """

    # Putting these here makes them part of the *class*, not the instance, i.e., they
    # are created only once when the module is imported, and then are referred to by
    # any instance of the class

    # These are lookup tables for the significance from a Poisson distribution when
    # the probability is very low, so that the normal computation is not possible due
    # to the finite numerical precision of the computer

    _x = np.logspace(np.log10(5), np.log10(36), 1000)

    _logy = np.log10(scipy.stats.norm.sf(_x))

    # Make the interpolator here so we do it only once. Also use ext=3 so that the
    # interpolation will return the maximum value instead of extrapolating

    _interpolator = scipy.interpolate.InterpolatedUnivariateSpline(
        _logy[::-1], _x[::-1], k=1, ext=3
    )

    def __init__(self, Non, Noff, alpha=1.0):

        assert 0 < alpha <= 1, "alpha was %f" % alpha

        self.Non = np.array(Non, dtype=float, ndmin=1)

        self.Noff = np.array(Noff, dtype=float, ndmin=1)

        self.alpha = float(alpha)

        self.expected = self.alpha * self.Noff

        self.net = self.Non - self.expected

        # This is the minimum difference between 1 and the next representable
        # floating point number
        self._epsilon = np.finfo(float).eps
    def significance_one_side(self):

        # For the points where Non >= expected, we need to use the survival function
        # sf(x) = 1 - cdf, which can go to very low numbers.
        # Instead, for points where Non < expected, we need to use the cdf, which
        # allows to go to very low numbers in that direction

        idx = self.Non >= self.expected

        out = np.zeros_like(self.Non)

        if np.sum(idx) > 0:

            out[idx] = self._using_sf(self.Non[idx], self.expected[idx])

        if np.sum(~idx) > 0:

            out[~idx] = self._using_cdf(self.Non[~idx], self.expected[~idx])

        return out
    def _using_sf(self, x, exp):

        sf = scipy.stats.poisson.sf(x, exp)

        # Convert the survival-function probability to a sigma value
        return scipy.stats.norm.isf(sf)

    def _using_cdf(self, x, exp):

        # Get the value of the cumulative probability function, instead of the
        # survival function (1 - cdf), because for extreme values
        # sf(x) = 1 - cdf(x) = 1 due to numerical precision problems

        cdf = scipy.stats.poisson.cdf(x, exp)

        out = np.zeros_like(x)

        idx = cdf >= 2 * self._epsilon

        # We can do a direct computation, because the numerical precision is
        # sufficient for this computation, as -sf = cdf - 1 is a representable number

        out[idx] = erfinv(2 * cdf[idx] - 1) * sqrt(2)

        # We use a lookup table with interpolation because the numerical precision
        # would not be sufficient to make the computation

        out[~idx] = -1 * self._interpolator(np.log10(cdf[~idx]))

        return out
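# Example (illustrative sketch): residuals in sigma units for observed counts given a
# background estimate scaled by alpha. The counts are made up for illustration.
#
#   >>> pr = PoissonResiduals(Non=[120, 80], Noff=[1000, 1000], alpha=0.1)
#   >>> pr.significance_one_side()   # positive for excesses, negative for deficits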
class Significance:
    """
    Implements equations in Li & Ma (1983)
    """

    def __init__(self, Non, Noff, alpha=1):

        # assert alpha > 0 and alpha <= 1, "alpha was %f" % alpha

        self._Non = np.array(Non, dtype=float, ndmin=1)

        self._Noff = np.array(Noff, dtype=float, ndmin=1)

        self._alpha = float(alpha)

        self._expected = self._alpha * self._Noff

        self._net = self._Non - self._expected

    @property
    def Non(self) -> np.ndarray:
        return self._Non

    @property
    def Noff(self) -> np.ndarray:
        return self._Noff

    @property
    def alpha(self) -> float:
        return self._alpha

    @property
    def expected(self) -> np.ndarray:
        return self._expected

    @property
    def net(self) -> np.ndarray:
        return self._net
    def known_background(self):
        """
        Compute the significance under the hypothesis that there is no uncertainty in
        the background. In other words, compute the probability of obtaining the
        observed counts given the expected counts from the background, then transform
        it in sigma.

        NOTE: this is reliable for expected counts >~ 10-15 if the significance is not
        very high. The higher the expected counts, the more reliable the significance
        estimation. As a rule of thumb, you need at least 25 counts to have reliable
        estimates up to 5 sigma.

        NOTE 2: if you use this to compute residuals in units of sigma, you should not
        expect them to be symmetrically distributed around 0 unless the expected
        number of counts is high enough for all bins (>~ 15). This is due to the fact
        that the Poisson distribution is very skewed at low counts.

        :return: significance vector
        """

        # Poisson probability of obtaining Non given Noff * alpha, in sigma units

        poisson_probability = PoissonResiduals(
            self._Non, self._Noff, self._alpha
        ).significance_one_side()

        return poisson_probability
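    # Example (illustrative sketch): per-bin residuals in sigma units under a fixed
    # background (made-up counts).
    #
    #   >>> Significance(Non=[12, 130], Noff=[100, 1000], alpha=0.1).known_background()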
    def li_and_ma(self, assign_sign=True):
        """
        Compute the significance using the formula from Li & Ma (1983), which is
        appropriate when both the background and the observed signal are counts
        coming from a Poisson distribution.

        :param assign_sign: whether to assign a sign to the significance, according to
                            the sign of the net counts Non - alpha * Noff, so that
                            excesses will have positive significances and deficits
                            negative significances
        :return:
        """

        one = np.zeros_like(self._Non, dtype=float)

        idx = self._Non > 0

        one[idx] = self._Non[idx] * np.log(
            ((1 + self._alpha) / self._alpha)
            * (self._Non[idx] / (self._Non[idx] + self._Noff[idx]))
        )

        two = np.zeros_like(self._Noff, dtype=float)

        two[idx] = self._Noff[idx] * np.log(
            (1 + self._alpha)
            * (self._Noff[idx] / (self._Non[idx] + self._Noff[idx]))
        )

        if assign_sign:

            sign = np.where(self._net > 0, 1, -1)

        else:

            sign = 1

        return sign * np.sqrt(2 * (one + two))
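    # Example (illustrative sketch): Li & Ma significance for an on/off observation.
    # The counts are made up for illustration.
    #
    #   >>> s = Significance(Non=[130], Noff=[1000], alpha=0.1)
    #   >>> s.li_and_ma()   # a ~2.7 sigma excess for these made-up numbers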
    def li_and_ma_equivalent_for_gaussian_background(self, sigma_b):
        """
        Compute the significance using the formula from Vianello (2018)
        (https://iopscience.iop.org/article/10.3847/1538-4365/aab780/meta), which is
        appropriate when the observation is Poisson distributed but the background
        has been modeled and thus has Gaussian distributed errors.

        :param sigma_b: The Gaussian 1-sigma errors on the background
        :return:
        """

        # This is a computation I need to publish (G. Vianello)
        # Actually, you did (and beat J. Michael!) For details on this computation

        b = self._expected
        o = self._Non

        # b0 is the maximum likelihood estimate of the background, profiling out
        # the Gaussian background uncertainty (see the paper for the derivation)
        b0 = 0.5 * (
            np.sqrt(b**2 - 2 * sigma_b**2 * (b - 2 * o) + sigma_b**4)
            + b
            - sigma_b**2
        )

        S = sqrt(2) * np.sqrt(
            o * np.log(o / b0) + (((b0 - b) ** 2) / (2 * sigma_b**2)) + b0 - o
        )

        sign = np.where(o > b, 1, -1)

        return sign * S
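    # Example (illustrative sketch): same on/off setup as above, but with a modeled
    # background carrying a Gaussian 1-sigma error of 5 counts (a made-up number).
    #
    #   >>> s = Significance(Non=[130], Noff=[1000], alpha=0.1)
    #   >>> s.li_and_ma_equivalent_for_gaussian_background(sigma_b=np.array([5.0]))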
    def gaussian_background(self, sigma_c, sigma_b):
        """
        Compute the significance when both the counts and the background have
        Gaussian distributed errors, by simple error propagation.

        :param sigma_c: The Gaussian 1-sigma errors on the counts
        :param sigma_b: The Gaussian 1-sigma errors on the background
        :return:
        """

        return self.net / np.sqrt(sigma_c**2 + sigma_b**2)
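# Example (illustrative sketch): the three estimators applied to the same made-up
# observation; they should agree within their respective regimes of validity.
#
#   >>> s = Significance(Non=[130], Noff=[1000], alpha=0.1)
#   >>> s.known_background()
#   >>> s.li_and_ma()
#   >>> s.gaussian_background(sigma_c=11.4, sigma_b=10.0)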