# Source code for diff_binom_confint._diff_binom_confint

"""
"""

import warnings
from typing import Union

import numpy as np
from deprecate_kwargs import deprecate_kwargs
from deprecated import deprecated
from scipy.optimize import brentq
from scipy.stats import norm

from ._confint import _SIDE_NAME_MAP, ConfidenceInterval, ConfidenceIntervalSides
from ._utils import accelerator, add_docstring, remove_parameters_returns_from_docstring

__all__ = [
    "compute_difference_confidence_interval",
    "list_difference_confidence_interval_methods",
]


# aliases of scipy statistical functions, using R-style names
# (the R implementation in DescTools is referenced below)
qnorm = norm.ppf  # quantile (inverse CDF) of the standard normal distribution
pnorm = norm.cdf  # CDF of the standard normal distribution
uniroot = brentq  # one-dimensional root finding (Brent's method)


@deprecate_kwargs([["method", "confint_type"]])
def compute_difference_confidence_interval(
    n_positive: int,
    n_total: int,
    ref_positive: int,
    ref_total: int,
    conf_level: float = 0.95,
    confint_type: str = "wilson",
    clip: bool = True,
    sides: Union[str, int] = "two-sided",
) -> ConfidenceInterval:
    """Compute the confidence interval of the difference between two binomial proportions.

    Parameters
    ----------
    n_positive : int
        Number of positive samples.
    n_total : int
        Total number of samples.
    ref_positive : int
        Number of positive samples of the reference.
    ref_total : int
        Total number of samples of the reference.
    conf_level : float, default 0.95
        Confidence level, should be inside the interval ``(0, 1)``.
    confint_type : str, default "wilson"
        Type (computation method) of the confidence interval.
    clip : bool, default True
        Whether to clip the confidence interval to the interval ``(-1, 1)``.
    sides : str or int, default "two-sided"
        the sides of the confidence interval, should be one of

        - "two-sided" (aliases "2-sided", "two_sided", "2_sided", "2-sides", "two_sides", "two-sides", "2_sides", "ts", "t", "two", "2", 2),
        - "left-sided" (aliases "left_sided", "left", "ls", "l"),
        - "right-sided" (aliases "right_sided", "right", "rs", "r"),

        case insensitive.

    Returns
    -------
    confint : ConfidenceInterval
        The confidence interval.

    """
    # ---- validate arguments (fail fast, one check at a time) ----
    if confint_type not in _supported_types:
        raise ValueError(f"`method` should be one of `{_supported_types}`, " f"but got `{confint_type}`")
    if conf_level <= 0 or conf_level >= 1:
        raise ValueError(f"`conf_level` should be inside the interval (0, 1), but got `{conf_level}`")
    count_pairs = (("n", n_positive, n_total), ("ref", ref_positive, ref_total))
    for name, pos, tot in count_pairs:
        if pos > tot:
            raise ValueError(
                f"`{name}_positive` should be less than or equal to `{name}_total`, "
                f"but got `{name}_positive={pos}` and `{name}_total={tot}`"
            )
    for name, pos, tot in count_pairs:
        if pos < 0:
            raise ValueError(f"`{name}_positive` should be non-negative, but got `{name}_positive={pos}`")
        if tot <= 0:
            raise ValueError(f"`{name}_total` should be positive, but got `{name}_total={tot}`")
    # normalize the many accepted spellings of `sides` to the canonical name
    normalized_sides = str(sides).lower()
    if normalized_sides not in _SIDE_NAME_MAP:
        raise ValueError(f"`sides` should be one of `{list(_SIDE_NAME_MAP)}`, but got `{normalized_sides}`")

    confint = _compute_difference_confidence_interval(
        n_positive, n_total, ref_positive, ref_total, conf_level, confint_type, _SIDE_NAME_MAP[normalized_sides]
    )
    if clip:
        # a difference of proportions always lies in [-1, 1]
        confint.lower_bound = max(confint.lower_bound, -1)
        confint.upper_bound = min(confint.upper_bound, 1)
    # a one-sided interval is unbounded on the unconstrained side
    if confint.sides == ConfidenceIntervalSides.LeftSided.value:
        confint.upper_bound = 1
    elif confint.sides == ConfidenceIntervalSides.RightSided.value:
        confint.lower_bound = -1
    return confint
@add_docstring(
    """
    NOTE
    ----
    the lower bound and upper bound are not adjusted w.r.t. `sides`.

    """,
    "append",
)
@add_docstring(remove_parameters_returns_from_docstring(compute_difference_confidence_interval.__doc__, parameters="clip"))
def _compute_difference_confidence_interval(
    n_positive: int,
    n_total: int,
    ref_positive: int,
    ref_total: int,
    conf_level: float = 0.95,
    confint_type: str = "wilson",
    sides: Union[str, int] = "two-sided",
) -> ConfidenceInterval:
    # NOTE: the docstring is attached by the `add_docstring` decorators above
    # (the public function's doc, minus the `clip` parameter).
    # Degenerate inputs (e.g. 0 or n positives) can feed invalid values to
    # `np.sqrt`; silence the resulting RuntimeWarnings.
    warnings.simplefilter(action="ignore", category=RuntimeWarning)
    if sides != "two-sided":
        # one-sided interval: the whole residual mass lies on a single tail
        z = qnorm(conf_level)
        _conf_level = 2 * conf_level - 1
    else:
        z = qnorm((1 + conf_level) / 2)
        _conf_level = conf_level
    n_negative = n_total - n_positive
    ref_negative = ref_total - ref_positive
    ratio = n_positive / n_total  # observed proportion
    neg_ratio = 1 - ratio
    ref_ratio = ref_positive / ref_total  # reference proportion
    ref_neg_ratio = 1 - ref_ratio
    delta_ratio = ratio - ref_ratio  # point estimate of the difference
    if confint_type.lower() in ["wilson", "newcombe", "score"]:
        # Newcombe's method: combine the two single-proportion Wilson intervals
        item1 = z * np.sqrt(4 * n_total * ratio * neg_ratio + z**2)
        lower1 = (2 * n_total * ratio + z**2 - item1) / 2 / (n_total + z**2)
        upper1 = (2 * n_total * ratio + z**2 + item1) / 2 / (n_total + z**2)
        item2 = z * np.sqrt(4 * ref_total * ref_ratio * ref_neg_ratio + z**2)
        lower2 = (2 * ref_total * ref_ratio + z**2 - item2) / 2 / (ref_total + z**2)
        upper2 = (2 * ref_total * ref_ratio + z**2 + item2) / 2 / (ref_total + z**2)
        return ConfidenceInterval(
            delta_ratio - np.sqrt((ratio - lower1) ** 2 + (upper2 - ref_ratio) ** 2),
            delta_ratio + np.sqrt((ref_ratio - lower2) ** 2 + (upper1 - ratio) ** 2),
            delta_ratio,
            conf_level,
            confint_type.lower(),
            str(sides),
        )
    elif confint_type.lower() in ["wilson-cc", "newcombe-cc", "score-cc"]:
        # continuity-corrected Newcombe method, cf.
        # https://corplingstats.wordpress.com/2019/04/27/correcting-for-continuity/
        # equation (6) and (6')
        e = 2 * n_total * ratio + z**2
        f = z**2 - 1 / n_total + 4 * n_total * ratio * neg_ratio
        g = 4 * ratio - 2
        h = 2 * (n_total + z**2)
        # lower1 = (e - (z * np.sqrt(f + g) + 1)) / h
        # upper1 = (e + (z * np.sqrt(f - g) + 1)) / h
        # should always be clipped ?
        lower1 = (e - (z * np.sqrt(f + g) + 1)) / h if n_positive != 0 else 0
        upper1 = (e + (z * np.sqrt(f - g) + 1)) / h if n_negative != 0 else 1
        e = 2 * ref_total * ref_ratio + z**2
        f = z**2 - 1 / ref_total + 4 * ref_total * ref_ratio * ref_neg_ratio
        g = 4 * ref_ratio - 2
        h = 2 * (ref_total + z**2)
        # lower2 = (e - (z * np.sqrt(f + g) + 1)) / h
        # upper2 = (e + (z * np.sqrt(f - g) + 1)) / h
        # should always be clipped ?
        lower2 = (e - (z * np.sqrt(f + g) + 1)) / h if ref_positive != 0 else 0
        upper2 = (e + (z * np.sqrt(f - g) + 1)) / h if ref_negative != 0 else 1
        return ConfidenceInterval(
            delta_ratio - np.sqrt((ratio - lower1) ** 2 + (upper2 - ref_ratio) ** 2),
            delta_ratio + np.sqrt((ref_ratio - lower2) ** 2 + (upper1 - ratio) ** 2),
            delta_ratio,
            conf_level,
            confint_type.lower(),
            str(sides),
        )
    elif confint_type.lower() in ["wald", "wald-cc"]:
        # simple normal approximation, optionally continuity-corrected
        item = z * np.sqrt(ratio * neg_ratio / n_total + ref_ratio * ref_neg_ratio / ref_total)
        if confint_type.lower() == "wald-cc":
            return ConfidenceInterval(
                delta_ratio - item - 0.5 / n_total - 0.5 / ref_total,
                delta_ratio + item + 0.5 / n_total + 0.5 / ref_total,
                delta_ratio,
                conf_level,
                confint_type.lower(),
                str(sides),
            )
        return ConfidenceInterval(
            delta_ratio - item,
            delta_ratio + item,
            delta_ratio,
            conf_level,
            confint_type.lower(),
            str(sides),
        )
    elif confint_type.lower() in ["haldane", "jeffreys-perks"]:
        # the two methods differ only in the choice of `psi`
        v = 0.25 / n_total - 0.25 / ref_total
        u = v + 0.5 / ref_total
        if confint_type.lower() == "haldane":
            psi = 0.5 * (ratio + ref_ratio)
        else:  # "jeffreys-perks"
            psi = 0.5 * ((n_positive + 0.5) / (n_total + 1) + (ref_positive + 0.5) / (ref_total + 1))
        w = (
            np.sqrt(
                u * (4 * psi * (1 - psi) - delta_ratio**2)
                + 2 * v * (1 - 2 * psi) * delta_ratio
                + 4 * ((z * u) ** 2) * psi * (1 - psi)
                + (z * v * (1 - 2 * psi)) ** 2
            )
            * z
            / (1 + u * z**2)
        )
        theta_star = (delta_ratio + v * (1 - 2 * psi) * z**2) / (1 + u * z**2)
        return ConfidenceInterval(
            theta_star - w,
            theta_star + w,
            delta_ratio,
            conf_level,
            confint_type.lower(),
            str(sides),
        )
    elif confint_type.lower() in ["mee", "miettinen-nurminen"]:
        # score-based intervals; `lamb` is the Miettinen-Nurminen variance inflation
        if confint_type.lower() == "mee":
            lamb = 1
        else:  # "miettinen-nurminen"
            lamb = (n_total + ref_total) / (n_total + ref_total - 1)
        if n_positive == ref_positive == 0:
            # R implementation from https://github.com/AndriSignorell/DescTools/blob/master/R/StatsAndCIs.r
            # `uniroot` unstable for some cases (e.g. 10/10 vs 0/20)
            tol = 1e-6
            lower = uniroot(
                lambda j: _mee_mn_score_func(j, ratio, ref_ratio, n_total, ref_total, lamb) - (1 - _conf_level),
                -1 + tol,
                delta_ratio - tol,
                full_output=False,
            )
            upper = uniroot(
                lambda j: _mee_mn_score_func(j, ratio, ref_ratio, n_total, ref_total, lamb) - (1 - _conf_level),
                delta_ratio + tol,
                1 - tol,
                full_output=False,
            )
        else:
            # failed in the case of n_positive == ref_positive == 0
            itv = _mee_mn_lower_upper_bounds(ratio, ref_ratio, n_total, ref_total, lamb, z)
            lower, upper = np.min(itv), np.max(itv)
        return ConfidenceInterval(lower, upper, delta_ratio, conf_level, confint_type.lower(), str(sides))
    elif confint_type.lower() == "true-profile":
        # profile-likelihood interval via grid scan
        itv = _true_profile_lower_upper_bounds(n_positive, n_total, ref_positive, ref_total, z)
        return ConfidenceInterval(
            np.min(itv),
            np.max(itv),
            delta_ratio,
            conf_level,
            confint_type.lower(),
            str(sides),
        )
    elif confint_type.lower() == "hauck-anderson":
        item = 1 / 2 / min(n_total, ref_total) + z * np.sqrt(
            ratio * neg_ratio / (n_total - 1) + ref_ratio * ref_neg_ratio / (ref_total - 1)
        )
        return ConfidenceInterval(
            delta_ratio - item,
            delta_ratio + item,
            delta_ratio,
            conf_level,
            confint_type.lower(),
            str(sides),
        )
    elif confint_type.lower() in ["agresti-caffo", "carlin-louis"]:
        # pseudo-count adjusted proportions (add 1 success and 1 failure per group)
        ratio_1 = (n_positive + 1) / (n_total + 2)
        ratio_2 = (ref_positive + 1) / (ref_total + 2)
        denom_add = {
            "agresti-caffo": 2,
            "carlin-louis": 3,
        }
        item = z * np.sqrt(
            (ratio_1 * (1 - ratio_1) / (n_total + denom_add[confint_type.lower()]))
            + (ratio_2 * (1 - ratio_2) / (ref_total + denom_add[confint_type.lower()]))
        )
        return ConfidenceInterval(
            ratio_1 - ratio_2 - item,
            ratio_1 - ratio_2 + item,
            delta_ratio,
            conf_level,
            confint_type.lower(),
            str(sides),
        )
    elif confint_type.lower() in ["brown-li", "brown-li-jeffrey"]:
        # Jeffreys-prior adjusted proportions (add half a success and half a failure)
        ratio_1 = (n_positive + 0.5) / (n_total + 1)
        ratio_2 = (ref_positive + 0.5) / (ref_total + 1)
        item = z * np.sqrt(ratio_1 * (1 - ratio_1) / n_total + ratio_2 * (1 - ratio_2) / ref_total)
        return ConfidenceInterval(
            ratio_1 - ratio_2 - item,
            ratio_1 - ratio_2 + item,
            delta_ratio,
            conf_level,
            confint_type.lower(),
            str(sides),
        )
    elif confint_type.lower() == "miettinen-nurminen-brown-li":
        # weighted combination (2/3 MN + 1/3 Brown-Li) of two other methods
        weight = 2 / 3
        lower_mn, upper_mn = _compute_difference_confidence_interval(
            n_positive,
            n_total,
            ref_positive,
            ref_total,
            conf_level,
            "miettinen-nurminen",
            sides,
        ).astuple()
        lower_bl, upper_bl = _compute_difference_confidence_interval(
            n_positive, n_total, ref_positive, ref_total, conf_level, "brown-li", sides
        ).astuple()
        lower = weight * lower_mn + (1 - weight) * lower_bl
        upper = weight * upper_mn + (1 - weight) * upper_bl
        return ConfidenceInterval(lower, upper, delta_ratio, conf_level, confint_type.lower(), str(sides))
    elif confint_type.lower() == "exact":
        raise NotImplementedError(f"`method` `{confint_type}` is not implemented yet")
    elif confint_type.lower() == "mid-p":
        raise NotImplementedError(f"`method` `{confint_type}` is not implemented yet")
    elif confint_type.lower() == "santner-snell":
        raise NotImplementedError(f"`method` `{confint_type}` is not implemented yet")
    elif confint_type.lower() == "chan-zhang":
        raise NotImplementedError(f"`method` `{confint_type}` is not implemented yet")
    elif confint_type.lower() == "agresti-min":
        raise NotImplementedError(f"`method` `{confint_type}` is not implemented yet")
    elif confint_type.lower() == "wang":
        raise NotImplementedError(f"`method` `{confint_type}` is not implemented yet")
    elif confint_type.lower() == "pradhan-banerjee":
        raise NotImplementedError(f"`method` `{confint_type}` is not implemented yet")
    else:
        newline = "\n"
        raise ValueError(
            f"""`method`
`{confint_type}` is not supported, """
            f"""choose one from {newline}{newline.join(_supported_types)}"""
        )


# all supported method names (aliases included); the commented-out entries
# are recognized in the dispatch above but raise NotImplementedError
_supported_types = [
    "wilson",
    "newcombe",
    "score",
    "wilson-cc",
    "newcombe-cc",
    "score-cc",
    "wald",
    "wald-cc",
    "haldane",
    "jeffreys-perks",
    "mee",
    "miettinen-nurminen",
    "true-profile",
    "hauck-anderson",
    "agresti-caffo",
    "carlin-louis",
    "brown-li",
    "brown-li-jeffrey",
    "miettinen-nurminen-brown-li",
    # "exact",
    # "mid-p",
    # "santner-snell",
    # "chan-zhang",
    # "agresti-min",
    # "wang",
    # "pradhan-banerjee",
]
_supported_methods = _supported_types
# none of the implemented methods involve randomness
_stochastic_types = []
_stochastic_methods = _stochastic_types
# method names that are synonyms of one another
_type_aliases = {
    "wilson": "newcombe",
    "wilson-cc": "newcombe-cc",
    "score": "newcombe",
    "score-cc": "newcombe-cc",
    "brown-li-jeffrey": "brown-li",
}
_method_aliases = _type_aliases


@deprecated(version="0.0.4", reason="Use `list_difference_confidence_interval_methods` instead.")
def list_difference_confidence_interval_types() -> None:
    print("\n".join(_supported_types))
def list_difference_confidence_interval_methods() -> None:
    """Print the supported difference-confidence-interval methods, one per line."""
    print(*_supported_types, sep="\n")
def _mee_mn_score_func(
    j: float,
    ratio: float,
    ref_ratio: float,
    n_total: int,
    ref_total: int,
    lamb: float,
) -> float:
    """Two-sided score (p-value style) at difference `j` for Mee/Miettinen-Nurminen.

    Used as the objective for root finding: the confidence bounds are the
    values of `j` where this score equals ``1 - conf_level``.
    """
    var = _mee_mn_var_func(j, ratio, ref_ratio, n_total, ref_total, lamb)
    return 2 * min(pnorm(var), 1 - pnorm(var))


def _mee_mn_var_func(j: float, ratio: float, ref_ratio: float, n_total: int, ref_total: int, lamb: float) -> float:
    """Standardized test statistic at difference `j` for Mee/Miettinen-Nurminen.

    Solves a cubic (via the trigonometric method) for the constrained MLE of the
    two proportions under the restriction ``p1 - p2 == j``, then standardizes
    ``delta_ratio - j`` by the restricted variance (inflated by `lamb`).
    """
    theta = ref_total / n_total
    delta_ratio = ratio - ref_ratio
    # cubic coefficients for the restricted MLE
    a = 1 + theta  # fixed: original had a duplicated assignment `a = a = 1 + theta`
    b = -(1 + theta + ratio + theta * ref_ratio + j * (theta + 2))
    c = j * (j + 2 * ratio + theta + 1) + ratio + theta * ref_ratio
    d = -ratio * j * (1 + j)
    tmp_b = b / 3 / a
    tmp_c = c / 3 / a
    v = tmp_b**3 - tmp_b * tmp_c * 3 / 2 + d / 2 / a
    # https://github.com/AndriSignorell/DescTools/blob/de9731c7d5640deff425e08e63e3aed2c5dc65aa/R/StatsAndCIs.r#L2509
    # u = np.sign(v) * np.sqrt(tmp_b**2 - tmp_c)
    if np.abs(v) < np.finfo(np.float64).eps:
        v = 0
    u = np.sqrt(tmp_b**2 - tmp_c)
    if v < 0:
        u = -u
    w = (np.pi + np.arccos(v / u**3)) / 3
    ratio_mle = 2 * u * np.cos(w) - tmp_b
    ref_ratio_mle = ratio_mle - j
    var = np.sqrt(lamb * (ratio_mle * (1 - ratio_mle) / n_total + ref_ratio_mle * (1 - ref_ratio_mle) / ref_total))
    var = (delta_ratio - j) / var
    return var


@accelerator.accelerator
def _mee_mn_lower_upper_bounds(
    ratio: float,
    ref_ratio: float,
    n_total: int,
    ref_total: int,
    lamb: float,
    z: float,
) -> np.ndarray:
    """Grid-scan the difference `j` over ``[-1, 1]`` for Mee/Miettinen-Nurminen.

    Returns the array of grid points whose standardized score statistic lies
    strictly inside ``(-z, z)``; min/max of the result give the CI bounds.
    """
    theta = ref_total / n_total
    delta_ratio = ratio - ref_ratio
    a = 1 + theta
    increment = 1e-5
    itv = []
    for j in np.arange(-1, 1 + increment, increment):
        b = -(1 + theta + ratio + theta * ref_ratio + j * (theta + 2))
        c = j * (j + 2 * ratio + theta + 1) + ratio + theta * ref_ratio
        d = -ratio * j * (1 + j)
        tmp_b = b / 3 / a
        tmp_c = c / 3 / a
        v = tmp_b**3 - tmp_b * tmp_c * 3 / 2 + d / 2 / a
        # https://github.com/AndriSignorell/DescTools/blob/de9731c7d5640deff425e08e63e3aed2c5dc65aa/R/StatsAndCIs.r#L2509
        # u = np.sign(v) * np.sqrt(tmp_b**2 - tmp_c)
        if np.abs(v) < np.finfo(np.float64).eps:
            v = 0
        u = np.sqrt(tmp_b**2 - tmp_c)
        if v < 0:
            u = -u
        u3 = u**3
        if u3 == 0:
            # skip this grid point: `v / u3` would raise ZeroDivisionError
            continue
        w = (np.pi + np.arccos(v / u3)) / 3
        ratio_mle = 2 * u * np.cos(w) - tmp_b
        ref_ratio_mle = ratio_mle - j
        var = np.sqrt(lamb * (ratio_mle * (1 - ratio_mle) / n_total + ref_ratio_mle * (1 - ref_ratio_mle) / ref_total))
        if var == 0:
            # https://github.com/DeepPSP/DBCI/issues/1
            var = np.finfo(np.float64).eps
        var = (delta_ratio - j) / var
        if -z < var < z:
            itv.append(j)
    return np.array(itv)


@accelerator.accelerator
def _true_profile_lower_upper_bounds(
    n_positive: int,
    n_total: int,
    ref_positive: int,
    ref_total: int,
    z: float,
) -> np.ndarray:
    """Grid-scan the difference `j` over ``[-1, 1]`` for the true-profile method.

    Returns the array of grid points whose profile log-likelihood ratio is at
    least ``-z**2 / 2``; min/max of the result give the CI bounds.
    """
    theta = ref_total / n_total
    ratio = n_positive / n_total
    ref_ratio = ref_positive / ref_total
    ref_neg_ratio = 1 - ref_ratio
    neg_ratio = 1 - ratio
    n_negative = n_total - n_positive
    ref_negative = ref_total - ref_positive
    a = 1 + theta
    increment = 1e-5
    itv = []
    for j in np.arange(-1, 1 + increment, increment):
        b = -(1 + theta + ratio + theta * ref_ratio + j * (theta + 2))
        c = j * (j + 2 * ratio + theta + 1) + ratio + theta * ref_ratio
        d = -ratio * j * (1 + j)
        tmp_b = b / 3 / a
        tmp_c = c / 3 / a
        v = tmp_b**3 - tmp_b * tmp_c * 3 / 2 + d / 2 / a
        # https://github.com/AndriSignorell/DescTools/blob/de9731c7d5640deff425e08e63e3aed2c5dc65aa/R/StatsAndCIs.r#L2509
        # u = np.sign(v) * np.sqrt(tmp_b**2 - tmp_c)
        if np.abs(v) < np.finfo(np.float64).eps:
            v = 0
        u = np.sqrt(tmp_b**2 - tmp_c)
        if v < 0:
            u = -u
        u3 = u**3
        if u3 == 0:
            # skip this grid point: `v / u3` would raise ZeroDivisionError
            continue
        w = (np.pi + np.arccos(v / u3)) / 3
        ratio_mle = 2 * u * np.cos(w) - tmp_b
        ref_ratio_mle = ratio_mle - j
        var = 0
        for num, r_m_, r_ in [
            (n_positive, ratio_mle, ratio),
            (ref_positive, ref_ratio_mle, ref_ratio),
            (n_negative, 1 - ratio_mle, neg_ratio),
            (ref_negative, 1 - ref_ratio_mle, ref_neg_ratio),
        ]:
            if num > 0:  # omitting any terms corresponding to empty cells
                var += num * np.log(r_m_ / r_)
        if var >= -(z**2) / 2:
            itv.append(j)
    return np.array(itv)