Asymptotic distributions

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import seaborn as sns

The canonical case

We simulate a linear model with normal errors and compare the finite-sample distribution of the OLS estimator to its asymptotic distribution.
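
For this model Y = \beta X + \varepsilon, the benchmark we compare against is the standard OLS result

\sqrt{n} ( \beta_n - \beta ) \rightarrow \mathcal{N} \Big(0, \frac{\sigma_e^2}{\sigma_x^2} \Big)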

def ols_normal(beta,X,N,e_sd):
    """ Simulates the errors, estimates OLS coefficient, se, tstat and CI"""
    E = e_sd*np.random.normal(size=N)
    Y = beta*X[0:N] + E
    cov = np.cov(Y,X[0:N])
    beta_hat = cov[0,1]/cov[1,1]
    beta_se  = (Y-beta_hat*X[0:N]).std()/(np.sqrt(N)*X[0:N].std())    
    tstats_asy = (beta_hat-beta)/beta_se    
    ci_asy = beta_hat + beta_se * np.array([-1.96,1.96])
    return({
        'beta':beta_hat,
        'se':beta_se,
        'tstats_asy':tstats_asy,
        'ci_lb':ci_asy[0],
        'ci_ub':ci_asy[1], 
        'N':N})
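
Inside the function, the slope is the sample analogue \beta_n = \widehat{Cov}(Y,X) / \widehat{Var}(X), and beta_se is the plug-in version \hat{\sigma}_e / (\sqrt{N} \hat{\sigma}_x) of the asymptotic standard error above.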
beta = 1.2
R = 10000
e_sd = 1.0

X = np.random.normal(size=200)
res = [ols_normal(beta,X,N,e_sd) for _ in range(R) for N in [10,25,50,75,100,200]]
sim = pd.DataFrame(res)

# append the theoretical asymptotic CI: beta +/- 1.96*sigma_e/(sigma_x*sqrt(N)); here sigma_x = 1
sim['ci_lb_asy'] = beta - e_sd*1.96/np.sqrt(sim['N'])
sim['ci_ub_asy'] = beta + e_sd*1.96/np.sqrt(sim['N'])
sim_tmp = sim.groupby('N').mean()
sim_tmp['ub'] = sim.groupby('N')['beta'].quantile(0.975)
sim_tmp['lb'] = sim.groupby('N')['beta'].quantile(0.025)
sim_tmp = sim_tmp.reset_index()

plt.plot(sim_tmp['N'],sim_tmp['ci_lb'], color="green", label="asy_hat")
plt.plot(sim_tmp['N'],sim_tmp['ci_lb_asy'], color="red", label="asy")
plt.plot(sim_tmp['N'],sim_tmp['lb'], color="blue", label="mc")

plt.plot(sim_tmp['N'],sim_tmp['ci_ub'], color="green")
plt.plot(sim_tmp['N'],sim_tmp['ci_ub_asy'], color="red")
plt.plot(sim_tmp['N'],sim_tmp['ub'], color="blue")

plt.axhline(beta,linestyle=":",color="black")
plt.legend()
plt.title("Conf intervals for different sample sizes")

plt.xscale('log')
plt.show()

[figure: confidence intervals for different sample sizes, normal errors]

Change the error distribution to Pareto

from scipy.stats import pareto

def ols_pareto(beta,X,N,alpha):
    """ Simulates the errors, estimates OLS coefficient, se, tstat and CI"""
    E = np.random.pareto(alpha, N) # !!! new distribution here (these draws have mean 1/(alpha-1) > 0, but a constant shift in the errors does not affect the covariance-based slope)
    Y = beta*X[0:N] + E
    cov = np.cov(Y,X[0:N])
    beta_hat = cov[0,1]/cov[1,1]
    beta_se  = (Y-beta_hat*X[0:N]).std()/(np.sqrt(N)*X[0:N].std())    
    tstats_asy = (beta_hat-beta)/beta_se    
    ci_asy = beta_hat + beta_se * np.array([-1.96,1.96])
    return({'beta':beta_hat,'se':beta_se,'tstats_asy':tstats_asy,'ci_lb':ci_asy[0],'ci_ub':ci_asy[1],'N':N})
beta = 0.5
R = 10000
alpha = 3.0 # Pareto parameter

e_sd = np.sqrt(pareto.stats(alpha, moments='v'))

X = 0.2*np.random.normal(size=200)
res = [ols_pareto(beta,X,N,alpha) for _ in range(R) for N in [10,25,50,75,100,200]]
sim = pd.DataFrame(res)

# theoretical asymptotic CI; here sigma_x = 0.2, so it enters the denominator
sim['ci_lb_asy'] = beta - e_sd*1.96/(0.2*np.sqrt(sim['N']))
sim['ci_ub_asy'] = beta + e_sd*1.96/(0.2*np.sqrt(sim['N']))
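
As a quick sanity check on e_sd (a small sketch: per NumPy's documentation, np.random.pareto draws from the Lomax/Pareto II distribution, i.e. scipy's pareto shifted by one, so the two variances coincide):

draws = np.random.pareto(alpha, 1_000_000)
print(draws.std(), e_sd)  # both should be close to 0.866 for alpha = 3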
sim_tmp = sim.groupby('N').mean()
sim_tmp['ub'] = sim.groupby('N')['beta'].quantile(0.975)
sim_tmp['lb'] = sim.groupby('N')['beta'].quantile(0.025)
sim_tmp = sim_tmp.reset_index()

plt.plot(sim_tmp['N'],sim_tmp['ci_lb'], color="green", label="asy_hat")
plt.plot(sim_tmp['N'],sim_tmp['ci_lb_asy'], color="red", label="asy")
plt.plot(sim_tmp['N'],sim_tmp['lb'], color="blue", label="mc")

plt.plot(sim_tmp['N'],sim_tmp['ci_ub'], color="green")
plt.plot(sim_tmp['N'],sim_tmp['ci_ub_asy'], color="red")
plt.plot(sim_tmp['N'],sim_tmp['ub'], color="blue")

plt.axhline(beta,linestyle=":",color="black")
plt.legend()
plt.title("Conf intervals for different sample sizes")

plt.xscale('log')
plt.show()


[figure: confidence intervals for different sample sizes, Pareto errors]

We plotted three objects. The blue line shows the (p, 1-p) quantiles of the distribution of the generated estimates; we can think of it as the true finite-sample distribution for this particular draw of X. The red line is the asymptotic confidence interval computed with the true variances of X and of the error term. Finally, the green line is the average, across replications, of the confidence intervals built from the asymptotic normality assumption and the sample variance of the residuals.

Delta method and C.I.

Let's now look at the square of \beta. Applying the delta method with g(\beta) = \beta^2, so that g'(\beta) = 2\beta, we have

\sqrt{n} ( \beta_n^2 - \beta^2 ) \rightarrow \mathcal{N} \Big(0, 4 \beta^2 \frac{\sigma_e^2}{\sigma_x^2} \Big)
np.random.seed(10)
beta = 0.5
N = 50
R = 2000
X = np.random.normal(size=N)

# we want to compare MC CI to realized CI

ci_asy = np.zeros((R,2))
beta2_mc = np.zeros((R)) # we store the MC realizations 

for i in range(R):
    E = np.random.normal(size=N)
    Y = beta*X + E

    cov = np.cov(Y,X)
    beta_hat = cov[0,1]/cov[1,1]    
    beta_se  = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std())

    ## we construct the estimate and the CI using the delta method
    beta2_mc[i] = beta_hat ** 2
    beta2_se = beta_se * 2 * np.abs(beta_hat) # see delta method formula
    ci_asy[i,:] = beta_hat ** 2 + beta2_se * np.array([-1.96,1.96])
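
One quick diagnostic (a sketch using the arrays filled in the loop above) is the empirical coverage of the delta-method interval, to compare against the nominal 95%:

coverage = np.mean((ci_asy[:,0] <= beta**2) & (beta**2 <= ci_asy[:,1]))
print(coverage)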
plt.subplot(1,2,1)
plt.hist(ci_asy[:,0],alpha=0.3,label="mc asy CI",color="blue",density=True)
plt.axvline( ci_asy[:,0].mean(),color="blue" ,label="mc asy CI (mean)")
plt.axvline( np.quantile(beta2_mc,0.025) ,color="red" ,label="mc CI")
plt.title("lower bound CI 0.025")
plt.legend()

plt.subplot(1,2,2)
plt.hist(ci_asy[:,1],alpha=0.3,label="mc asy CI",color="blue",density=True)
plt.axvline( ci_asy[:,1].mean(),color="blue",label="mc asy CI (mean)" )
plt.axvline( np.quantile(beta2_mc,0.975) ,color="red",label="mc CI" )
plt.legend()
plt.title("upper bound CI 0.975")
plt.show()

[figure: lower bound CI 0.025 and upper bound CI 0.975, delta-method vs Monte Carlo]

In particular, we note that while the Monte Carlo CI does not include 0, and hence 0 should be rejected, the asymptotic confidence interval does include 0. In this case the asymptotic CI sits, on average, to the left of the true Monte Carlo distribution across replications.

Bootstrap distribution

We create the resampling distribution and compare it to the finite-sample distribution.

def bootstrap_CI(Y,X,B=500):
    """ Computes the 95% (0.05-level) CI for the squared OLS estimate """
    N = len(Y)
    beta_dm_vec_bs = np.zeros((B))
    tstat_bs = np.zeros((B))

    # compute the OLS estimate in the sample
    cov = np.cov(Y,X)
    beta_hat =  cov[0,1]/cov[1,1]
    beta_se_r0  = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std())
    beta2_se_r0 = beta_se_r0 * 2 * np.abs(beta_hat)

    # bootstrap: resample (Y,X) pairs with replacement
    for i in range(B):
        I = np.random.randint(N,size=N)
        Ys = Y[I]
        Xs = X[I]
        cov = np.cov(Ys,Xs)
        beta_hat_r = cov[0,1]/cov[1,1]     # OLS estimate for each replication
        beta_dm_vec_bs[i] = beta_hat_r**2  # transformed estimate for each replication

        beta_se_r  = (Ys-beta_hat_r*Xs).std()/(np.sqrt(N)*Xs.std())
        beta2_se_r = beta_se_r * 2 * np.abs(beta_hat_r)
        tstat_bs[i] = beta_hat_r**2 / beta2_se_r   # stored but not used below

    # compute the bias, if any, and shift the percentile bounds by it
    M = np.mean(beta_dm_vec_bs)
    bias = M - beta_hat**2

    U = np.quantile(beta_dm_vec_bs,0.975)-bias
    L = np.quantile(beta_dm_vec_bs,0.025)-bias

    return(M,L,U)
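
The function returns a bias-corrected percentile interval; the bootstrap t-statistics in tstat_bs end up unused. An alternative (not the method used below, just a minimal sketch of the standard percentile-t construction) studentizes each bootstrap draw around the sample estimate and inverts the resulting quantiles:

def bootstrap_CI_t(Y,X,B=500):
    """ Hypothetical percentile-t (studentized) 95% CI for the squared OLS estimate """
    N = len(Y)
    cov = np.cov(Y,X)
    beta_hat = cov[0,1]/cov[1,1]
    se0 = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std()) * 2 * np.abs(beta_hat)  # delta-method se
    t = np.zeros(B)
    for i in range(B):
        I = np.random.randint(N,size=N)
        Ys, Xs = Y[I], X[I]
        cov_r = np.cov(Ys,Xs)
        b_r = cov_r[0,1]/cov_r[1,1]
        se_r = (Ys-b_r*Xs).std()/(np.sqrt(N)*Xs.std()) * 2 * np.abs(b_r)
        t[i] = (b_r**2 - beta_hat**2)/se_r   # studentized, centered at the sample estimate
    t_lo, t_hi = np.quantile(t,[0.025,0.975])
    # the quantiles switch sides when inverting the t-statistic
    return(beta_hat**2 - t_hi*se0, beta_hat**2 - t_lo*se0)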
np.random.seed(10)
import tqdm

for N in [50]:

    R = 500 # number of replications
    B = 500 # number of bootstraps
    X = np.random.normal(size=N)

    # construct the null distribution
    ci_bs = np.zeros((R,2))
    ci_asy = np.zeros((R,2))
    beta2_mc = np.zeros((R))
    beta2_bs = np.zeros((R))

    for i in tqdm.tqdm(range(R)):
        E = np.random.normal(size=N)
        Y = beta*X + E
        cov = np.cov(Y,X)
        beta_hat = cov[0,1]/cov[1,1]
        beta2_mc[i] = beta_hat**2    # transformed estimate

        beta_se  = (Y-beta_hat*X).std()/(np.sqrt(N)*X.std())    
        beta2_se = beta_se * 2 * np.abs(beta_hat)

        # asymptotic confidence interval using delta method
        ci_asy[i,:] = beta_hat**2 + beta2_se * np.array([-1.96,1.96])

        # bootstrap estimates
        beta2_bs[i], ci_bs[i,0], ci_bs[i,1] = bootstrap_CI(Y,X,B)        
100%|██████████| 500/500 [00:23<00:00, 20.96it/s]
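
Before plotting, a compact summary (a sketch using the arrays just filled) is the empirical coverage of each interval against the nominal 95%:

print(np.mean((ci_asy[:,0] <= beta**2) & (beta**2 <= ci_asy[:,1])))  # asymptotic, delta method
print(np.mean((ci_bs[:,0] <= beta**2) & (beta**2 <= ci_bs[:,1])))    # bootstrap, bias-corrected percentile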
plt.subplot(1,2,1)
plt.hist(ci_asy[:,0],alpha=0.3,color="blue",density=True)
plt.axvline( ci_asy[:,0].mean(),color="blue" , label="asy")
plt.axvline( np.quantile(beta2_mc,0.025) ,color="red" , label="mc CI")
plt.title("asymptotic CI 0.025")
plt.legend()

plt.subplot(1,2,2)
plt.hist(ci_asy[:,1],alpha=0.3,color="blue",density=True)
plt.axvline( ci_asy[:,1].mean(),color="blue" , label="asy")
plt.axvline( np.quantile(beta2_mc,0.975) ,color="red", label="mc CI" )
plt.title("asymptotic CI 0.975")
plt.legend()

plt.show()

[figure: asymptotic CI 0.025 and 0.975 bounds vs Monte Carlo quantiles]

plt.subplot(1,2,1)
plt.hist(ci_bs[:,0],alpha=0.3,label="mc",color="blue",density=True)
plt.axvline( ci_bs[:,0].mean(),color="green" , label="bs")
plt.axvline( np.quantile(beta2_mc,0.025) ,color="red" , label="mc")
plt.title("bootstrap CI 0.025")
plt.legend()

plt.subplot(1,2,2)
plt.hist(ci_bs[:,1],alpha=0.3,label="mc",color="blue",density=True)
plt.axvline( ci_bs[:,1].mean(),color="green", label="bs" )
plt.axvline( np.quantile(beta2_mc,0.975) ,color="red" , label="mc")
plt.title("bootstrap CI 0.025")
plt.legend()

plt.show()

[figure: bootstrap CI 0.025 and 0.975 bounds vs Monte Carlo quantiles]

# Bootstrap CI on average
ci_bs.mean(0)
array([0.03916983, 0.63744695])
# Asy CI on average
ci_asy.mean(0)
array([-0.0254555 ,  0.56548982])
# Monte Carlo quantiles, for comparison
np.quantile(beta2_mc,[0.025,0.975])
array([0.03712143, 0.60464131])

Bias in the estimate itself

We compare \beta^2 to the Monte Carlo mean of \beta_n^2 and to the bootstrap mean \beta_n^{*2}. We notice that \beta^2 < \beta_n^2 < \beta_n^{*2}: squaring makes the estimate biased upward, and the bootstrap mean adds a further layer of the same bias.
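
Since the bootstrap bias \beta_n^{*2} - \beta_n^2 mimics the sampling bias \beta_n^2 - \beta^2, subtracting it gives the standard bootstrap bias correction

2 \beta_n^2 - \beta_n^{*2}

which the cells below compute.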

beta**2
0.25
beta2_mc.mean()
0.2700171565383394
beta2_bs.mean()
0.29472310724131867
2*beta2_mc.mean() - beta2_bs.mean()
0.24531120583536015
plt.subplot(1,2,1)
plt.hist( beta2_mc ,alpha=0.3,label="mc",color="blue",density=True)
plt.axvline( beta2_mc.mean(),color="blue" )
plt.axvline( beta**2 ,color="red" )
plt.title("OLS MC")

plt.subplot(1,2,2)
plt.hist( 2*beta2_mc - beta2_bs ,alpha=0.3,label="mc",color="blue",density=True)
plt.axvline( (2*beta2_mc - beta2_bs).mean(),color="blue" )
plt.axvline( beta**2 ,color="red" )
plt.title("Bootstrap corrected MC")

plt.show()

[figure: raw vs bootstrap-corrected Monte Carlo distributions]
