Bootstrap

1
2
3
4
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

The canonical case

We simulate a linear model with normal errors and compare the finite-sample distribution of the estimator to its asymptotic distribution.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# Monte Carlo check of the asymptotic distribution of the OLS t-statistic
# in the simple linear model Y = beta*X + E with standard-normal X and E.
beta = 1.5   # true slope
N = 50000    # sample size per replication (large, so asymptotics should hold)
R = 2000     # number of Monte Carlo replications
X = np.random.normal(size=N)  # regressors drawn once, held fixed across replications

# construct the null distribution
tstats_asy = np.zeros((R))    # one t-statistic per replication
beta_vec_mc = np.zeros((R))   # one slope estimate per replication
for i in range(R):
    E = np.random.normal(size=N)
    Y = beta*X + E
    # OLS slope via the sample covariance: beta_hat = cov(Y,X)/var(X)
    cov = np.cov(Y,X)
    beta_hat = cov[0,1]/cov[1,1]
    beta_vec_mc[i] = beta_hat
    # standard error WITHOUT the 1/sqrt(N) factor; the sqrt(N) scaling
    # is applied explicitly in the t-statistic on the next line
    beta_se_asy  = (Y-beta_hat*X).std()/(X.std())
    tstats_asy[i] = np.sqrt(N)*(beta_hat-beta)/beta_se_asy 
1
# quick look: the Monte Carlo t-statistics should be approximately N(0,1)
plt.hist(tstats_asy)
1
2
3
4
5
(array([  5.,  33., 144., 306., 510., 486., 317., 141.,  47.,  11.]),
 array([-3.2595005 , -2.61418765, -1.9688748 , -1.32356195, -0.6782491 ,
        -0.03293625,  0.61237661,  1.25768946,  1.90300231,  2.54831516,
         3.19362801]),
 <a list of 10 Patch objects>)

png

1
2
3
4
5
6
7
# Quantile-quantile comparison: Monte Carlo t-statistic quantiles (red)
# against the standard-normal limit (blue).
levels = np.linspace(0.05, 0.95, num=19)
mc_quantiles = np.quantile(tstats_asy, levels)

plt.plot(levels, mc_quantiles, color="red", label="mc")
plt.plot(levels, norm.ppf(levels), color="blue", label="asy")
plt.legend()
1
<matplotlib.legend.Legend at 0x1a1b9b5310>

png

1
2
3
4
# Overlay the Monte Carlo distribution of beta_hat with its asymptotic
# normal approximation: beta + N(0, 1/(N*var(X))) draws.
asy_scale = 1/(np.sqrt(N) * X.std())
beta_vec_asy = beta + asy_scale * np.random.normal(size=R)
plt.hist(beta_vec_mc,alpha=0.5,label="mc",color="blue",density=True)
plt.hist(beta_vec_asy,alpha=0.5,label="asy",color="red",density=True)
plt.legend()
1
<matplotlib.legend.Legend at 0x1a1bbbed50>

png

Smaller sample size

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# Same Monte Carlo experiment with a small sample (N=20): the finite-sample
# distribution of the t-statistic starts to deviate from its N(0,1) limit.
beta = 1.5
N = 20       # small sample: the asymptotic approximation degrades
R = 10000
X = np.random.normal(size=N)

# construct the null distribution
tstats_asy = np.zeros((R))
beta_mc = np.zeros((R))
for i in range(R):
    E = np.random.normal(size=N)
    Y = beta*X + E
    # OLS slope via the sample covariance
    cov = np.cov(Y,X)
    beta_hat = cov[0,1]/cov[1,1]
    beta_mc[i] = beta_hat
    # here the 1/sqrt(N) factor is folded into the standard error
    # (unlike the large-N cell above, where sqrt(N) scales the t-stat)
    beta_se  = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std())
    tstats_asy[i] = (beta_hat-beta)/beta_se 
1
2
3
4
5
6
# Q-Q plot: finite-sample (N=20) t-statistic quantiles vs the N(0,1) reference.
levels = np.linspace(0.05, 0.95, num=19)

plt.plot(levels, np.quantile(tstats_asy, levels))
plt.plot(levels, norm.ppf(levels))
1
[<matplotlib.lines.Line2D at 0x1a1bcdc890>]

png

1
## Change the error distribution
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# Repeat the small-sample experiment with heavy-tailed Pareto errors:
# the finite-sample distribution of the t-statistic departs much further
# from the N(0,1) reference.
beta = 1.2
N = 20
R = 10000
X = np.random.normal(size=N)

# construct the null distribution
tstats_asy = np.zeros((R))
for i in range(R):
    # Pareto shape 1.0: extremely heavy-tailed draws (a stress test for
    # the CLT-based approximation); note the errors are also not centered
    E = np.random.pareto(1.0, N)
    Y = beta*X + E
    cov = np.cov(Y,X)
    beta_hat = cov[0,1]/cov[1,1]
    beta_se  = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std())
    tstats_asy[i] = (beta_hat-beta)/beta_se 
1
2
3
4
5
6
# Q-Q plot of the Pareto-error t-statistics against the N(0,1) reference.
probs = np.linspace(0.05, 0.95, num=19)
sample_q = np.quantile(tstats_asy, probs)

plt.plot(probs, sample_q)
plt.plot(probs, norm.ppf(probs))
1
[<matplotlib.lines.Line2D at 0x1a1ba64750>]

png

Delta method and C.I.

Let's now look at the square of \beta. From the delta method we have that

\sqrt{n} ( \beta_n^2 - \beta^2 ) \rightarrow \mathcal{N} \Big(0, \frac{\sigma_e^2}{\sigma_x^2} 4\beta^2 \Big)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
# Delta-method confidence intervals for beta^2, large sample (N=100000).
beta = 0.5
N = 100000
R = 2000
X = np.random.normal(size=N)

# we want to compare MC CI to realized CI

ci_asy = np.zeros((R,2))     # per-replication asymptotic 95% CI endpoints
beta2_mc = np.zeros((R)) # we store the MC realizations 

for i in range(R):
    E = np.random.normal(size=N)
    Y = beta*X + E

    cov = np.cov(Y,X)
    beta_hat = cov[0,1]/cov[1,1]
    beta_se  = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std())

    beta2_mc[i] = beta_hat ** 2
    # delta method: se(beta_hat^2) = 2*|beta_hat| * se(beta_hat)
    beta2_se = beta_se * 2*np.abs(beta_hat)
    # 1.96 = two-sided 95% standard-normal critical value
    ci_asy[i,:] = beta_hat ** 2 + beta2_se * np.array([-1.96,1.96])
1
2
3
4
5
6
7
# Histogram of each asymptotic CI endpoint across replications, with the
# corresponding Monte Carlo quantile of beta_hat^2 marked by a vertical line.
for panel, (col, q) in enumerate([(0, 0.025), (1, 0.975)], start=1):
    plt.subplot(1, 2, panel)
    plt.hist(ci_asy[:, col], alpha=0.3, label="mc", color="blue", density=True)
    plt.axvline(np.quantile(beta2_mc, q))
1
<matplotlib.lines.Line2D at 0x1a1fcb65d0>

png

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Same delta-method CI experiment at a small sample size (N=50), seeded so
# the run is reproducible.  Each replication records beta_hat^2 and the
# asymptotic 95% interval beta_hat^2 +/- 1.96 * 2|beta_hat| * se(beta_hat).
np.random.seed(10)
beta = 0.5
N = 50
R = 2000
X = np.random.normal(size=N)

# we want to compare MC CI to realized CI

ci_asy = np.zeros((R,2))
beta2_mc = np.zeros((R)) # we store the MC realizations 
crit = np.array([-1.96, 1.96])  # two-sided 95% normal critical values

for rep in range(R):
    noise = np.random.normal(size=N)
    Y = beta*X + noise

    c = np.cov(Y, X)
    slope = c[0, 1] / c[1, 1]
    slope_se = (Y - slope*X).std() / (np.sqrt(N) * X.std())

    beta2_mc[rep] = slope ** 2
    # delta method: se(beta^2) = 2|beta| * se(beta)
    ci_asy[rep, :] = slope ** 2 + slope_se * 2 * np.abs(slope) * crit
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# For each endpoint: histogram of the asymptotic CI bound, its mean (blue)
# and the Monte Carlo quantile of beta_hat^2 (red) for comparison.
for panel, (col, q) in enumerate([(0, 0.025), (1, 0.975)], start=1):
    plt.subplot(1, 2, panel)
    plt.hist(ci_asy[:, col], alpha=0.3, label="mc", color="blue", density=True)
    plt.axvline(ci_asy[:, col].mean(), color="blue", label="asy")
    plt.axvline(np.quantile(beta2_mc, q), color="red", label="mc")
    plt.legend()

plt.show()

png

Bootstrap distribution

We create the resampling distribution and compare it to the finite sample distribution

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def bootstrap_CI(Y,X,B=500):
    """Bias-corrected percentile bootstrap 95% CI for the squared OLS slope.

    Draws B nonparametric pair resamples of (Y, X), re-estimates the slope
    on each, and forms the interval from the 2.5% / 97.5% quantiles of the
    bootstrap beta_hat**2 draws, shifted by the estimated bootstrap bias
    mean_b(beta_hat_b**2) - beta_hat**2.

    Parameters
    ----------
    Y, X : 1-D arrays of equal length (response and regressor).
    B : number of bootstrap resamples.

    Returns
    -------
    (M, L, U) : bootstrap mean of beta_hat**2, lower and upper CI bounds.
    """
    N = len(Y)
    beta2_bs = np.zeros((B))  # bootstrap draws of beta_hat**2

    # point estimate on the original sample: beta_hat = cov(Y,X)/var(X)
    cov = np.cov(Y,X)
    beta_hat = cov[0,1]/cov[1,1]

    for i in range(B):
        # resample (Y, X) pairs with replacement
        I = np.random.randint(N,size=N)
        Ys = Y[I]
        Xs = X[I]
        cov = np.cov(Ys,Xs)
        beta_hat_r = cov[0,1]/cov[1,1]
        beta2_bs[i] = beta_hat_r**2

    M = np.mean(beta2_bs)
    bias = M - beta_hat**2  # bootstrap estimate of the bias of beta_hat**2

    # percentile interval shifted to undo the estimated bias
    U = np.quantile(beta2_bs,0.975)-bias
    L = np.quantile(beta2_bs,0.025)-bias

    return(M,L,U)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Full comparison run: for each Monte Carlo replication compute the
# asymptotic delta-method CI and the bootstrap CI (via bootstrap_CI).
# Note: `beta` is reused from the earlier cell (0.5).
np.random.seed(10)

# NOTE(review): `all` shadows the builtin and is never appended to — it
# looks like a leftover from a multi-N experiment (for N in [100]).
all = []
for N in [100]:

    R = 500   # Monte Carlo replications
    B = 500   # bootstrap resamples per replication
    X = np.random.normal(size=N)

    # construct the null distribution
    ci_bs = np.zeros((R,2))    # bootstrap CI endpoints per replication
    ci_asy = np.zeros((R,2))   # asymptotic CI endpoints per replication
    beta2_mc = np.zeros((R))   # beta_hat^2 per replication
    beta2_bs = np.zeros((R))   # bootstrap mean of beta_hat^2 per replication

    for i in range(R):
        E = np.random.normal(size=N)
        Y = beta*X + E
        cov = np.cov(Y,X)
        beta_hat = cov[0,1]/cov[1,1]
        beta2_mc[i] = beta_hat**2

        # delta method: se(beta^2) = 2|beta| * se(beta)
        beta_se  = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std())
        beta2_se = beta_se * 2 * np.abs(beta_hat)
        ci_asy[i,:] = beta_hat**2 + beta2_se * np.array([-1.96,1.96])
        beta2_bs[i], ci_bs[i,0], ci_bs[i,1] = bootstrap_CI(Y,X,B)        
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
# Asymptotic CI endpoints: histogram, mean endpoint (blue) and the
# Monte Carlo quantile of beta_hat^2 (red) that the endpoint targets.
panels = [(0, 0.025, "asymptotic CI 0.025"),
          (1, 0.975, "asymptotic CI 0.975")]
for panel, (col, q, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel)
    plt.hist(ci_asy[:, col], alpha=0.3, color="blue", density=True)
    plt.axvline(ci_asy[:, col].mean(), color="blue", label="asy")
    plt.axvline(np.quantile(beta2_mc, q), color="red", label="mc")
    plt.title(title)
    plt.legend()

plt.show()

png

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
# Bootstrap CI endpoints: histogram of each endpoint across replications,
# its mean (green) and the Monte Carlo quantile of beta_hat^2 (red).
plt.subplot(1,2,1)
plt.hist(ci_bs[:,0],alpha=0.3,label="mc",color="blue",density=True)
plt.axvline( ci_bs[:,0].mean(),color="green" , label="bs")
plt.axvline( np.quantile(beta2_mc,0.025) ,color="red" , label="mc")
plt.title("bootstrap CI 0.025")
plt.legend()

plt.subplot(1,2,2)
plt.hist(ci_bs[:,1],alpha=0.3,label="mc",color="blue",density=True)
plt.axvline( ci_bs[:,1].mean(),color="green", label="bs" )
plt.axvline( np.quantile(beta2_mc,0.975) ,color="red" , label="mc")
plt.title("bootstrap CI 0.975")  # fixed: was mislabeled "bootstrap CI 0.025"
plt.legend()

plt.show()

png

1
# average bootstrap CI endpoints across replications
ci_bs.mean(0)
1
array([0.03777446, 0.67763476])
1
# average asymptotic CI endpoints across replications
ci_asy.mean(0)
1
array([-0.03398326,  0.59821145])
1
# Monte Carlo 2.5%/97.5% quantiles of beta_hat^2 -- the targets the CIs should hit
np.quantile(beta2_mc,[0.025,0.975])
1
array([0.0306751 , 0.66861031])

Bias in the estimate itself

We compare \beta^2 to the Monte Carlo mean of \beta_n^2 and to the bootstrap mean \beta^{*2}_n. We notice for instance that \beta^2 < \beta_n^2 < \beta^{*2}_n.

1
# true value of beta^2
beta**2
1
0.25
1
# Monte Carlo mean of beta_hat^2: biased upward relative to beta^2 = 0.25
beta2_mc.mean()
1
0.28211409584476665
1
# mean of the bootstrap means of beta_hat^2: even larger than the MC mean
beta2_bs.mean()
1
0.308561455697164
1
# bootstrap bias correction 2*estimate - bootstrap mean: close to beta^2 = 0.25
2*beta2_mc.mean() - beta2_bs.mean()
1
0.25566673599236933
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# Left: raw MC distribution of beta_hat^2.  Right: bootstrap bias-corrected
# estimates 2*beta_hat^2 - mean_bootstrap(beta_hat^2).
# Blue vertical line = sample mean, red = true beta^2.
corrected = 2*beta2_mc - beta2_bs
for panel, (vals, title) in enumerate(
        [(beta2_mc, "OLS MC"), (corrected, "Bootstrap corrected MC")],
        start=1):
    plt.subplot(1, 2, panel)
    plt.hist(vals, alpha=0.3, label="mc", color="blue", density=True)
    plt.axvline(vals.mean(), color="blue")
    plt.axvline(beta**2, color="red")
    plt.title(title)

plt.show()

png

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def bootstrap_CI2(Y,X,B=500):
    """Percentile-t (bootstrap-t) 95% CI for the squared OLS slope.

    On each of B pair resamples the studentized statistic
    t_b = (beta_hat_b**2 - beta_hat**2) / se_b(beta_hat**2) is computed.
    The interval inverts the empirical 2.5%/97.5% quantiles of t_b around
    the original-sample estimate, scaled by its delta-method standard error.

    Parameters
    ----------
    Y, X : 1-D arrays of equal length (response and regressor).
    B : number of bootstrap resamples.

    Returns
    -------
    (M, L, U) : bootstrap mean of beta_hat**2, lower and upper CI bounds.
    """
    N = len(Y)
    beta2_bs = np.zeros((B))   # bootstrap draws of beta_hat**2
    tstat_bs = np.zeros((B))   # studentized bootstrap statistics

    # original-sample estimate and delta-method se of beta_hat**2
    cov = np.cov(Y,X)
    beta_hat =  cov[0,1]/cov[1,1]
    beta_se_r0  = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std())
    beta2_se_r0 = beta_se_r0 * 2 * np.abs(beta_hat)

    for i in range(B):
        I = np.random.randint(N,size=N)  # resample pairs with replacement
        Ys = Y[I]
        Xs = X[I]
        cov = np.cov(Ys,Xs)
        beta_hat_r = cov[0,1]/cov[1,1]
        beta2_bs[i] = beta_hat_r**2

        # each resample is studentized by its OWN standard error
        beta_se_r  = (Ys-beta_hat_r*Xs).std()/(np.sqrt(N)*Xs.std())
        beta2_se_r = beta_se_r * 2 * np.abs(beta_hat_r)
        tstat_bs[i] = (beta_hat_r**2 - beta_hat**2) / beta2_se_r

    M = np.mean(beta2_bs)

    # invert the bootstrap-t distribution; note the quantiles swap sides
    L = beta_hat**2 - np.quantile( tstat_bs, 0.975)*beta2_se_r0
    U = beta_hat**2 - np.quantile( tstat_bs, 0.025)*beta2_se_r0

    return(M,L,U)
1