%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.stats as st
plt.style.use("seaborn")
sns.set_context("talk", font_scale=1.4)
plt.rcParams["figure.figsize"] = (12, 6)
x, T, G, N = 39, 1641, 74, 257
Note: the Scipy and R calling conventions are different. If we use the above values as the ones we'd use to call R's phyper
function directly (see below) then we need to convert them to the Scipy convention as follows:
M, n = T+G, G
rv_null = st.hypergeom(M, n, N)
x_null = np.arange(0, 40)
pmf_null = rv_null.pmf(x_null)
fig, ax = plt.subplots()
ax.plot(x_null, pmf_null, 'bo')
ax.vlines(x_null, 0, pmf_null, lw=2)
ax.set_xlabel('# of shifts with at least one death out of 257 randomly chosen shifts')
ax.set_ylabel('hypergeometric probabilities');
cdf_null = rv_null.sf(x_null)
fig, ax = plt.subplots()
ax.plot(x_null, cdf_null)
#ax.vlines(x_null, 0, cdf_null, lw=2)
ax.set_xlabel('# of shifts with at least one death out of 257 randomly chosen shifts')
ax.set_ylabel('hypergeometric CDF');
st.hypergeom.cdf(x, M, n, N)
st.hypergeom.sf(x, M, n, N)
%load_ext rpy2.ipython
%%R -i x,M,n,N
cat("x, M, n, N:", x, M, n, N, "\n")
cat("cdf:", phyper(x, n, M-n, N, lower.tail=TRUE), "\n")
cat("sf :", phyper(x, n, M-n, N, lower.tail=FALSE))
%%R -i x,M,n,N
cat("x, M, n, N:", x, M, n, N, "\n")
cdf <- phyper(x, n, M-n, N, lower.tail=TRUE)
sf <- phyper(x, n, M-n, N, lower.tail=FALSE)
cat("cdf:", sprintf("%.18e", cdf), "\n")
cat("sf :", sf, "\n" )
cat("diff:", cdf-sf )
Note that for these values:
x, M, n, N = 40, 1600, 50, 300
Scipy's logcdf
function returns non-sensical results:
x, M, n, N = 40, 1600, 50, 300
lcdf = st.hypergeom.logcdf(x, M, n, N)
print("logcdf:", lcdf)
print("probab:", repr(np.exp(lcdf)))
As a sanity check, let's look at R, which does the right thing:
%%R -i x,M,n,N -o lcdf
lcdf <- phyper(x, n, M-n, N, lower.tail=TRUE, log=TRUE)
cat("logcdf:", sprintf("%.12e", lcdf), "\n")
cat("probab:", exp(lcdf))
This is probably obvious for some reason, but not immediately to me... At these values, the survival function is minus the logcdf
to numerical precision:
%%R -i x,M,n,N
lcdf <- phyper(x, n, M-n, N, lower.tail=TRUE, log=TRUE)
sf <- phyper(x, n, M-n, N, lower.tail=FALSE, log=FALSE)
cat("logcdf:", sprintf("%.12e", lcdf), "\n")
cat("sf : ", sprintf("%.12e", sf), "\n")
cat("diff :", lcdf - sf)
This shows that SciPy does agree with R away from the tails.
import numpy as np
import scipy.stats as st
x, M, n, N = 3, 20, 7, 12
lcdf = st.hypergeom.logcdf(x, M, n, N)
print("logcdf:", lcdf)
print("probab:", repr(np.exp(lcdf)))
%%R -i x,M,n,N
lcdf <- phyper(x, n, M-n, N, lower.tail=TRUE, log=TRUE)
cat("logcdf:", sprintf("%.12e", lcdf), "\n")
cat("probab:", exp(lcdf))