Source code for ssd.distributions

# -*- coding: utf-8 -*-
"""
Probability Density Functions

A collection of probability density functions
"""
from typing import Callable, Iterable, Optional, Tuple, Union

import numpy as np
from scipy.interpolate import splev, splrep
from scipy.stats import gaussian_kde

from .base import BaseDistribution


[docs]class HistogramDistribution(BaseDistribution): """Use a histogram to estimate a probability density function.""" def __init__(self, bins: Union[int, Iterable[float]] = 100): """ Parameters ---------- bins : Union[int, Iterable[float]], optional The number of bins to use in the histogram (default is 100). If an iterable is provided, it will be used as the bin edges. """ super().__init__() # Set the number of bins self.bins = bins
[docs] def fit(self, data: Iterable[float]) -> 'HistogramDistribution': """ Parameters ---------- data : array_like The data to fit Returns ------- HistogramDistribution The fitted histogram distribution """ # Fit the histogram distribution self._hist, self._bins = np.histogram(data, bins=self.bins, density=True) # Set the fitted status self.fitted = True return self
def __call__(self, x: float) -> float: """ Parameters ---------- x : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point Raises ------ RuntimeError If the histogram distribution has not been fitted """ if not self.fitted: raise RuntimeError( 'The histogram distribution must be fitted before it can be evaluated. Call self.fit(data) to fit the histogram distribution.' ) if (x < self._bins[0]) or (x > self._bins[-2]): return 0 # Find the bin that contains the point idx = np.digitize(x, self._bins) return self._hist[idx]
[docs] def integrate(self, a: float = -np.inf, b: float = np.inf, moment: float = 1) -> Tuple[float, float]: """ Integrate the probability density function from a to b. Parameters ---------- a : float, optional The lower bound of the integration (default is -inf) b : float, optional The upper bound of the integration (default is inf) moment : float, optional The moment to compute (default is 0) Returns ------- Tuple[float, float] The value of the integral and the error """ if not self.fitted: raise RuntimeError( 'The histogram distribution must be fitted before it can be integrated. Call self.fit(data) to fit the histogram distribution.' ) # Find the bins that contain the integration bounds idx_a = np.digitize(a, self._bins) idx_b = np.digitize(b, self._bins) # Get the bin_width bin_width = np.diff(self._bins)[0] # Compute the integral integral = np.sum(self._hist[idx_a:idx_b]**moment * bin_width) # Compute the error error = np.sqrt(np.sum(self._hist[idx_a:idx_b]**moment * bin_width)**2) return integral, error
[docs] def grad(self, x: float) -> float: """ Compute the derivative of the probability density function at a given point. Parameters ---------- x : float The point at which to compute the derivative Returns ------- float The value of the derivative at the given point """ if not self.fitted: raise RuntimeError( 'The histogram distribution must be fitted before it can be differentiated. Call self.fit(data) to fit the histogram distribution.' ) if (x < self._bins[0]) or (x >= self._bins[-1]): return 0 # Find the bin that contains the point idx = np.digitize(x, self._bins) - 1 # Get the bin_width bin_width = np.diff(self._bins)[0] # Compute the derivative def yy(idx: int) -> float: if idx > len(self._hist) - 1: return 0 else: return self._hist[idx] dy = (yy(idx + 1) - yy(idx - 1)) / (2*bin_width) return dy
[docs]class InterpolateDistribution(BaseDistribution): """Use a histogram distribution with interpolated values to estimate a probability density function.""" def __init__(self, bins: Union[int, Iterable[float]] = 100): """ Parameters ---------- bins : Union[int, Iterabl[float]], optional The number of bins to use in the histogram (default is 100). If an iterable is provided, it will be used as the bin edges. """ super().__init__() # Set the number of bins self.bins = bins
[docs] def fit(self, data: Iterable[float], n: int = 1, s: float = 0.01, force_origin: bool = False, epsilon: float = 1.e-9) -> 'InterpolateDistribution': """ Parameters ---------- data : array_like The data to fit n : int, optional The degree of the interpolating polynomial (default is 1) s : float, optional The smoothing factor (default is 0.01) force_origin : bool, optional Force the interpolating polynomial to pass through the origin (default is False) epsilon : float, optional Small number to avoid division by zero (default is 1.e-9) Returns ------- HistogramDistribution The fitted histogram distribution """ self.epsilon = epsilon self.force_origin = force_origin # Fit the histogram distribution self._hist, self._bins = np.histogram(data, bins=self.bins, density=True) # Interpolate dx = np.diff(self._bins) X = self._bins[:-1] + dx/2 Y = self._hist if self.force_origin: X = np.insert(X, 0, 0) Y = np.insert(Y, 0, 0) self._spl = splrep(X, Y, k=n, s=s, per=False) # Set the fitted status self.fitted = True return self
def __call__(self, x: float) -> float: """ Use the values of nearby bins to interpolate the probability density function at a given point. Parameters ---------- x : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point Raises ------ RuntimeError If the histogram distribution has not been fitted """ if not self.fitted: raise RuntimeError( 'The histogram distribution must be fitted before it can be evaluated. Call self.fit(data) to fit the histogram distribution.' ) if (x < self._bins[0]) or (x >= self._bins[-1]): return 0 if self.force_origin: return float(x * splev(x, self._spl) / (x + self.epsilon)) else: return float(splev(x, self._spl))
[docs]class EmpiricalDistribution(BaseDistribution): """Estimate an empirical distribution using a Gaussian KDE.""" def __init__(self, bw_method: Optional[Union[str, float, Callable]] = None, weights: Optional[Iterable[float]] = None): """ Parameters ---------- bw_method : str, float, or callable, optional The method used to calculate the estimator bandwidth. This can be 'scott', 'silverman', a scalar constant or a callable. If a scalar, this will be used directly as `kde.factor`. If a callable, it should take a `gaussian_kde` instance as only parameter and return a scalar. If None (default), 'scott' is used. See ``scipy.stats.gaussian_kde`` for more details. weights : array_like, optional The weights for each value in the empirical distribution. This must be the same shape as the input data. See ``scipy.stats.gaussian_kde`` for more details. """ super().__init__() # Set the bandwidth method self.bw_method = bw_method # Set the weights self.weights = weights # Set the fitted status self.fitted = False
[docs] def fit(self, data: Iterable[float]) -> 'EmpiricalDistribution': """ Parameters ---------- data : array_like The data to fit Returns ------- EmpiricalDistribution The fitted empirical distribution """ # Fit the empirical distribution self._kernel = gaussian_kde(data, bw_method=self.bw_method, weights=self.weights) # Set the fitted status self.fitted = True return self
def __call__(self, x: float) -> float: """ Parameters ---------- x : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point Raises ------ RuntimeError If the empirical distribution has not been fitted """ if not self.fitted: raise RuntimeError( 'The empirical distribution must be fitted before it can be evaluated. Call self.fit(data) to fit the empirical distribution.' ) return self._kernel(x) @property def max(self): return self._kernel.dataset.max() @property def min(self): return self._kernel.dataset.min()
[docs]class WignerSemicircle(BaseDistribution): """A Wigner semicircle probability density function.""" def __init__(self, sigma: float = 1.0): """ The Wigner semicircle (WS) distribution is a probability density function that describes the asymptotic distribution of the eigenvalues of a large symmetric random matrix with Gaussian entries. Given a matrix :math:`X` of size :math:`n \\times n`, where :math:`n \\to \\infty`, the WS distribution is defined as: .. math:: \\mu(x) = \\frac{1}{2 \\pi \\sigma^2} \\sqrt{4 \\sigma^2 - x^2} \\mathbb{1}_{[-2 \\sigma, 2 \\sigma]}(x) where :math:`\\mathbb{1}_{[-2 \\sigma, 2 \\sigma]}(x)` is the indicator function of the interval :math:`[-2 \\sigma, 2 \\sigma]`. Parameters ---------- sigma : float, optional The standard deviation of the Gaussian distribution (default is 1.0) """ super().__init__() # Set the standard deviation self.sigma = sigma def __call__(self, x: float) -> float: """ Evaluate the probability density function at a given point. Parameters ---------- x : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point """ # Check the input if x <= -2.0 * self.sigma or x >= 2.0 * self.sigma: return 0.0 # Evaluate the probability density function func = np.sqrt(4.0 * self.sigma**2 - x**2) return func / (2.0 * np.pi * self.sigma**2) # normalization @property def max(self) -> float: return 2 * self.sigma @property def min(self) -> float: return -2 * self.sigma
[docs]class InverseWignerSemicircle(WignerSemicircle): """An inverse Wigner semicircle probability density function.""" def __call__(self, q: float) -> float: """ Evaluate the probability density function at a given point. Parameters ---------- q : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point """ return super().__call__(1.0 / q) / q**2 @property def max(self) -> float: return np.inf @property def min(self) -> float: return -np.inf
[docs]class MarchenkoPastur(BaseDistribution): """A Marchenko-Pastur probability density function.""" def __init__(self, L: float, sigma: float = 1.0): """ The Marchenko-Pastur (MP) distribution is a probability density function that describes the asymptotic distribution of the singular values of a large rectangular random matrix with Gaussian entries (i.e. the eigenvalues of the associated covariance matrix). Given a matrix :math:`X` of size :math:`n \\times p`, where :math:`n \\to \\infty` and :math:`p \\to \\infty`, such that :math:`\\lambda = \\frac{p}{n} \\in (0, 1)`, the MP distribution is defined as: .. math:: \\mu(x) = \\frac{1}{2 \\pi \\sigma^2} \\frac{\\sqrt{(\\lambda_+ - x)(x - \\lambda_-)}}{\\lambda x} \\mathbb{1}_{[x_-, x_+]}(x) where .. math:: \\lambda_+ = \\sigma^2 (1 + \\sqrt{\\lambda})^2 and .. math:: \\lambda_- = \\sigma^2 (1 - \\sqrt{\\lambda})^2 are the upper and lower bounds of the function, respectively, and :math:`\\mathbb{1}_{[x_-, x_+]}(x)` is the indicator function of the interval :math:`[x_-, x_+]`. Parameters ---------- L : float The ratio :math:`\\lambda = \\frac{p}{n}`, where :math:`p` is the number of variables and :math:`n` is the number of observations of the random matrix. sigma : float, optional The standard deviation of the random matrix. The default is 1.0. """ super().__init__() # Set the ratio self.L = L # Set the standard deviation self.sigma = sigma # Define the boundaries of the function self.lp = self.sigma**2 * (1 + np.sqrt(self.L))**2 self.lm = self.sigma**2 * (1 - np.sqrt(self.L))**2 def __call__(self, x: float) -> float: """ Evaluate the probability density function at a given point. Parameters ---------- x : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point """ if x <= self.lm or x >= self.lp: return 0 numer = np.sqrt((x - self.lm) * (self.lp - x)) denom = self.L * x return numer / denom / (2 * np.pi * self.sigma**2) # normalization @property def max(self) -> float: return self.lp # maximum value of the function @property def min(self) -> float: return self.lm # minimum value of the function
[docs]class TranslatedMarchenkoPastur(MarchenkoPastur): """A Marchenko-Pastur probability density function with lowest eigenvalue at zero.""" def __call__(self, x: float) -> float: """ Evaluate the probability density function at a given point. Parameters ---------- x : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point """ return super().__call__(x + self.l0) @property def max(self) -> float: return self.lp - self.lm # maximum value of the function @property def min(self) -> float: return 0.0 # minimum value of the function @property def l0(self) -> float: return self.lm # lowest eigenvalue
[docs]class InverseMarchenkoPastur(MarchenkoPastur): """An inverse Marchenko-Pastur probability density function.""" def __call__(self, q: float) -> float: """ Evaluate the probability density function at a given point. Parameters ---------- q : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point """ if q <= self.min or q >= self.max: return 0.0 self.lm = 0.0 return super().__call__(1.0 / q) / q**2 @property def max(self) -> float: return np.inf # maximum value of the function @property def min(self) -> float: return 1 / self.lp # minimum value of the function
[docs]class TranslatedInverseMarchenkoPastur(InverseMarchenkoPastur): """An inverse Marchenko-Pastur probability density function with lowest eigenvalue at zero.""" def __call__(self, q: float) -> float: """ Evaluate the probability density function at a given point. Parameters ---------- q : float The point at which to evaluate the probability density function Returns ------- float The value of the probability density function at the given point """ return super().__call__(q + self.m2) @property def max(self) -> float: return np.inf # maximum value of the function @property def min(self) -> float: return 0.0 # minimum value of the function @property def m2(self) -> float: return 1 / self.lp # inverse of the largest eigenvalue