# !wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# !apt update -q
# !apt install cuda gcc-6 g++-6 -y -q
# !ln -s /usr/bin/gcc-6 /usr/local/cuda/bin/gcc
# !ln -s /usr/bin/g++-6 /usr/local/cuda/bin/g++
# !curl -sSL "https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.3-linux-x86_64.tar.gz" -o julia.tar.gz
# !tar -xzf julia.tar.gz -C /usr --strip-components 1
# !rm -rf julia.tar.gz*
# !julia -e 'using Pkg; pkg"add IJulia; precompile"'
24. Debiased ML for Partially Linear IV Model in Julia#
References:
https://arxiv.org/abs/1608.00060
https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778
The code is based on the book.
24.1. Partially Linear IV Model#
We consider the partially linear structural equation model: \begin{eqnarray} & Y - D\theta_0 = g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\ & Z = m_0(X) + V, & E[V \mid X] = 0. \end{eqnarray}
Note that this model is not a regression model unless
The idea of this model is that there is a structural or causal relation between
The causal DAG this model corresponds to is given by:
$
24.2. Example#
A simple contextual example is from biostatistics, where
24.3. PLIVM in Residualized Form#
The PLIV model above can be rewritten in the following residualized form:
$
24.4. DML for PLIV Model#
Given identification, DML proceeds as follows
Compute the estimates
Estimate
The reason we work with this residualized form is that it eliminates the bias arising when solving the prediction problem in stage 1. The role of cross-validation is to avoid another source of bias due to potential overfitting.
The estimator is adaptive, in the sense that the first stage estimation errors do not affect the second stage errors.
# Import relevant packages
# using Pkg
# Pkg.add("CSV")
# Pkg.add("DataFrames")
# Pkg.add("GLM")
# Pkg.add("FixedEffectModels")
# Pkg.add("DecisionTree")
# Pkg.add("PrettyTables")
# Pkg.add("CovarianceMatrices")
# Pkg.add("RegressionTables")
# Pkg.add("StatsFuns")
# Pkg.add("Plots")
# Pkg.add("RData")
# Pkg.add("MLBase")
using CSV, DataFrames, FixedEffectModels, DecisionTree, PrettyTables, CovarianceMatrices, RegressionTables, StatsFuns, Plots, RData, MLBase, GLM
# load data
url = "https://github.com/d2cml-ai/14.388_jl/raw/github_data/data/ajr.RData"
download(url, "data.RData")
rdata_read = RData.load("data.RData")
rm("data.RData")
AJR = rdata_read["AJR"]
names(data)
println("Number of Rows : ", size(AJR)[1],"\n","Number of Columns : ", size(AJR)[2],) #rows and columnsRows : ", size(AJR)[1],"\n","Number of Columns : ", size(AJR)[2],) #rows and columns
Number of Rows : 64
Number of Columns : 11
first(AJR, 5)
5 rows × 11 columns (omitted printing of 1 columns)
GDP | Exprop | Mort | Latitude | Neo | Africa | Asia | Namer | Samer | logMort | |
---|---|---|---|---|---|---|---|---|---|---|
Float64 | Float64 | Float64 | Float64 | Int32 | Int32 | Int32 | Int32 | Int32 | Float64 | |
1 | 8.39 | 6.5 | 78.2 | 0.3111 | 0 | 1 | 0 | 0 | 0 | 4.35927 |
2 | 7.77 | 5.36 | 280.0 | 0.1367 | 0 | 1 | 0 | 0 | 0 | 5.63479 |
3 | 9.13 | 6.39 | 68.9 | 0.3778 | 0 | 0 | 0 | 0 | 1 | 4.23266 |
4 | 9.9 | 9.32 | 8.55 | 0.3 | 1 | 0 | 0 | 0 | 0 | 2.14593 |
5 | 9.29 | 7.5 | 85.0 | 0.2683 | 0 | 0 | 0 | 1 | 0 | 4.44265 |
function DML2_for_PLIVM(x , d , z, y, dreg , yreg , zreg, nfold)
# Num ob observations
nobser = size(x,1)
# Define folds indices
foldid = collect(Kfold(size(x)[1], nfold))
# Create array to save errors
ytil = ones(nobser)
dtil = ones(nobser)
ztil = ones(nobser);
println("Folds: " )
for i in 1:nfold
# Lasso regression, excluding folds selected
dfit = dreg(x[foldid[i],:], d[foldid[i]])
zfit = zreg(x[foldid[i],:], z[foldid[i]])
yfit = yreg(x[foldid[i],:], y[foldid[i]])
# Predict estimates using the
dhat = apply_forest(dfit,x[Not(foldid[i]),:])
zhat = apply_forest(zfit,x[Not(foldid[i]),:])
yhat = apply_forest(yfit,x[Not(foldid[i]),:])
# Save errors
dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
ztil[Not(foldid[i])] = (z[Not(foldid[i])] - zhat)
ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
println(i)
end
data = DataFrame(ytil = ytil, dtil = dtil, ztil = ztil)
ivfit = reg(data, @formula(ytil ~ + (dtil ~ ztil)))
# OLS clustering at the County level
coef_est = ivfit.coef[2]
se = FixedEffectModels.coeftable(ivfit).cols[2][1]
println( "\n Coefficient (se) =", coef_est, "(",se,")" )
Final_result = ("coef_est" => coef_est , "se" => se , "dtil" => dtil , "ytil" => ytil , "ztil" => ztil);
end
DML2_for_PLIVM (generic function with 1 method)
24.5. Emprical Example: Acemoglu, Jonsohn, Robinson (AER).#
Y is log GDP;
D is a measure of Protection from Expropriation, a proxy for quality of insitutions;
Z is the log of Settler’s mortality;
W are geographical variables (latitude, latitude squared, continent dummies as well as interactions)
y = AJR[!,"GDP"]
d = AJR[!,"Exprop"]
z = AJR[!,"logMort"];
Y = DataFrame(GDP = y)
D = DataFrame(Exprop = d)
Z = DataFrame(logMort = z);
xraw_formula = @formula(GDP ~ Latitude+ Africa+Asia + Namer + Samer)
xraw_dframe = ModelFrame(xraw_formula, AJR)
xraw1 = ModelMatrix(xraw_dframe)
xraw = xraw1.m
size(xraw)
(64, 6)
x_formula = @formula(GDP ~ -1 + Latitude + Latitude2 + Africa + Asia + Namer + Samer
+ Latitude*Latitude2 + Latitude*Africa + Latitude*Asia + Latitude*Namer + Latitude*Samer
+ Latitude2*Africa + Latitude2*Asia + Latitude2*Namer + Latitude2*Samer
+ Africa*Asia + Africa*Namer + Africa*Samer
+ Asia*Namer + Asia*Samer
+ Namer*Samer)
x_dframe = ModelFrame( x_formula, AJR)
x1 = ModelMatrix(x_dframe)
x = x1.m
size(x1)
(64, 21)
X = DataFrame(Latitude = x_dframe.data[2], Latitude2 = x_dframe.data[3], Africa = x_dframe.data[4],
Asia = Africa = x_dframe.data[5], Namer = Africa = x_dframe.data[6], Samer = Africa = x_dframe.data[7])
size(X)
(64, 6)
Information from random Forest link1, link2
y_model = DataFrame(y_model = y);
function dreg( x_1 , d_1 )
if d_1 != nothing && ( typeof(d_1) != String )
mtry1 = convert(Int64,findmax(round( ( size(x_1)[ 2 ]/3 ), digits = 0))[1])
else
mtry1 = convert(Int64,round(sqrt( size(x)[ 2 ] ), digits = 0))
end
if d_1 != nothing && ( typeof(d_1) != String )
nodesize1 = 5
else
nodesize1 = 1
end
n_subfeatures=-1; n_trees=10; partial_sampling=0.7; max_depth=-1
min_samples_leaf= nodesize1; min_samples_split=2; min_purity_increase=0.0; seed=0
RFmodel = build_forest(d_1, x_1, n_subfeatures, n_trees, partial_sampling, max_depth, min_samples_leaf, min_samples_split, min_purity_increase; rng = seed)
end
function yreg( x_1, y_1 )
if y_1 != nothing && ( typeof(y_1) != String )
mtry1 = convert(Int64,findmax(round( ( size(x_1)[ 2 ]/3 ), digits = 0))[1])
else
mtry1 = convert(Int64,round(sqrt( size(x_1)[ 2 ] ), digits = 0))
end
if y_1 != nothing && ( typeof(y_1) != String )
nodesize1 = 5
else
nodesize1 = 1
end
n_subfeatures=-1; n_trees=10; partial_sampling=0.7; max_depth=-1
min_samples_leaf= nodesize1; min_samples_split=2; min_purity_increase=0.0; seed=0
RFmodel = build_forest(y_1, x_1, n_subfeatures, n_trees, partial_sampling, max_depth, min_samples_leaf, min_samples_split, min_purity_increase; rng = seed)
end
function zreg( x_1, z_1 )
if z_1 != nothing && ( typeof(z_1) != String )
mtry1 = convert(Int64,findmax(round( ( size(x_1)[ 2 ]/3 ), digits = 0))[1])
else
mtry1 = convert(Int64,round(sqrt( size(x_1)[ 2 ] ), digits = 0))
end
if z_1 != nothing && ( typeof(z_1) != String )
nodesize1 = 5
else
nodesize1 = 1
end
n_subfeatures=-1; n_trees=10; partial_sampling=0.7; max_depth=-1
min_samples_leaf= nodesize1; min_samples_split=2; min_purity_increase=0.0; seed=0
RFmodel = build_forest(z_1, x_1, n_subfeatures, n_trees, partial_sampling, max_depth, min_samples_leaf, min_samples_split, min_purity_increase; rng = seed)
end
DML2_RF = DML2_for_PLIVM(xraw, d, z, y, dreg, yreg, zreg, 20)
Folds:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
Coefficient (se) =0.7524130698167707(0.23630541190311558)
("coef_est" => 0.7524130698167707, "se" => 0.23630541190311558, "dtil" => [0.15565634920634874, -0.2791861111111107, -0.9777500000000012, 2.29963852813853, -0.31843055555555644, -2.6909138888888897, -1.139750000000002, 1.1712214285714273, -1.5625142857142862, -0.36003214285714336 … 0.797359523809523, 0.9724035714285719, 1.1681678571428575, -0.18468412698412617, -1.2077369047619042, -0.4859154761904758, 2.753840151515152, 0.2909972222222228, -0.30180833333333457, -2.4970499999999998], "ytil" => [0.2812396825396828, 0.8693749999999998, 0.12426785714285771, 1.230942424242425, 0.20469837662337653, -2.2161714646464654, -0.4307873015873014, 0.2022126984126995, -0.3882476190476183, -1.2033507936507934 … -1.0349730158730157, 0.18190039682539627, 0.5981027777777772, 0.20425714285714314, -0.0898416666666666, -0.04134920634920647, 1.2455204545454546, 0.6519388888888891, -0.9406249999999998, -0.2312666666666674], "ztil" => [-0.04162269159816123, -0.47678027991012506, 0.5216786273234608, -1.6768161192286897, 0.84380642810238, 0.8852341028025945, -0.030204424243000183, 0.10552145021519266, -0.6319344143374961, 1.7776519182284276 … -0.9613266827204523, 0.7370634824889502, -0.04842881237565244, -0.24884581462637723, 0.3457469675255931, 0.5720750978085158, -1.0069785997445133, 0.01417775285878431, 0.8127966969644644, -0.3488015282100294])
print( "\n DML with Post-Lasso \n" )
DML with Post-Lasso
include("E:/causal_ml/hdmjl/hdmjl.jl")
#include("hdmjl/hdmjl.jl")
function DML2_lasso_cv(x , d , z, y, dreg , yreg , zreg, nfold)
# Num ob observations
nobser = size(x,1)
# Define folds indices
foldid = collect(Kfold(size(x)[1], nfold))
# Create array to save errors
ytil = ones(nobser)
dtil = ones(nobser)
ztil = ones(nobser);
println("Folds: " )
for i in 1:nfold
# Save errors
# Lasso regression, excluding folds selected
dtil= dreg(X[foldid[i],:], D[foldid[i],:])
ztil= dreg(X[foldid[i],:], Z[foldid[i],:])
ytil= dreg(X[foldid[i],:], Y[foldid[i],:])
println(i)
end
data = DataFrame(ytil = ytil, dtil = dtil, ztil = ztil)
ivfit = reg(data, @formula(ytil ~ + (dtil ~ ztil)))
# OLS clustering at the County level
coef_est = ivfit.coef[2]
se = FixedEffectModels.coeftable(ivfit).cols[2][1]
println( "\n Coefficient (se) =", coef_est, "(",se,")" )
Final_result = ("coef_est" => coef_est , "se" => se , "dtil" => dtil , "ytil" => ytil , "ztil" => ztil);
end
DML2_lasso_cv (generic function with 1 method)
function dreg(x_1, d_1)
res_Y_0 = rlasso_arg( x_1, d_1, nothing, true, true, true, false, false,
nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )
res_Y = rlasso(res_Y_0)["residuals"]
end
function yreg(x_1, y_1)
res_D_0 = rlasso_arg( x_1, y_1, nothing, true, true, true, false, false,
nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )
res_D = rlasso(res_D_0)["residuals"]
end
function yreg(x_1, z_1)
res_D_0 = rlasso_arg( x_1, z_1, nothing, true, true, true, false, false,
nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )
res_D = rlasso(res_D_0)["residuals"]
end
DML2_lasso = DML2_lasso_cv(x, d, z, y, dreg, yreg, zreg, 20)
Folds:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
Coefficient (se) =0.8025198007404852(0.25504605869117214)
("coef_est" => 0.8025198007404852, "se" => 0.25504605869117214, "dtil" => [0.32193926336371426, -0.3770827830132363, -1.070075755625701, 2.1576850079023595, 0.43940896035100363, -1.9157575580162984, -1.2156103112423475, 1.1861811155183757, -1.2993046591692252, -1.6914123930204885 … 0.9833133752652194, 1.233808244104633, 0.7115585348636041, 0.012480510286280333, -1.1822612803084596, -0.4134126124732038, 2.339357342980089, 0.4412519781926516, -0.4226265978857042, -2.1315655638692483], "ytil" => [0.8507245602499757, 0.5201871241327402, 0.23278092961681263, 1.198234224563612, 0.6550068961725938, -1.7518203556789185, -0.5704415991463694, 0.3160676786406481, -0.4078354438403389, -1.3280401038135867 … -0.9470399925282902, 0.010156634359231087, 0.3464692630007332, 0.7704131075558607, -0.21100693971176332, 0.16341110725897146, 1.19112670738053, 0.6725244564201832, -1.2053548481169798, -0.3105502641283808], "ztil" => [-0.7785546325051316, 0.0745990650429309, 0.6425336548870126, -1.632609215400903, 0.5873389096268885, 0.4092506488987838, 0.215074635162109, 0.026656659945137162, 0.09324711657468787, 1.3581662483976231 … -0.7529845369035337, 0.8283344064209794, 0.23351029549835928, -0.8331538345521134, -0.22958213916338854, 0.6456750978044945, -0.7745432976135063, 0.06820232754102012, 0.8671549246258765, -0.4106150750948676])
# Compare Forest vs Lasso
mods = [DML2_RF, DML2_lasso];
mods_name = ["DML2_RF", "DML2_lasso"];
RMSE_Y = []
RMSE_D = []
RMSE_Z = []
for i in mods
push!(RMSE_Y, sqrt(sum(i[4][2].^2)/length(i[4][2])))
push!(RMSE_D, sqrt(sum(i[3][2].^2)/length(i[3][2])))
push!(RMSE_Z, sqrt(sum(i[5][2].^2)/length(i[5][2])))
end
result = DataFrame([mods_name RMSE_Y RMSE_D RMSE_Z], [:Models, :RMSE_Y, :RMSE_D, :RMSE_Z])
pretty_table(result; formatters = ft_printf("%5.10f"))
┌────────────┬──────────────┬──────────────┬──────────────┐
│ Models │ RMSE_Y │ RMSE_D │ RMSE_Z │
│ Any │ Any │ Any │ Any │
├────────────┼──────────────┼──────────────┼──────────────┤
│ DML2_RF │ 0.8072244996 │ 1.2993424184 │ 0.9519269381 │
│ DML2_lasso │ 0.7736230839 │ 1.2684038714 │ 0.9581213967 │
└────────────┴──────────────┴──────────────┴──────────────┘
24.6. Examine if we have weak instruments#
data1 = DataFrame(ytil = DML2_lasso[4][2], dtil = DML2_lasso[3][2], ztil = DML2_lasso[5][2]);
ols1 = fit(LinearModel, @formula(dtil ~ 0 + ztil), data1)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}
dtil ~ 0 + ztil
Coefficients:
──────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
──────────────────────────────────────────────────────────────────
ztil -0.429901 0.161645 -2.66 0.0100 -0.753239 -0.106562
──────────────────────────────────────────────────────────────────
stderror(HC1(), ols1)
1-element Vector{Float64}:
0.17430705743059816
data2 = DataFrame(ytil = DML2_RF[4][2], dtil = DML2_RF[3][2], ztil = DML2_RF[5][2]);
ols2 = fit(LinearModel, @formula(dtil ~ 0 + ztil), data2)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}
dtil ~ 0 + ztil
Coefficients:
──────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
──────────────────────────────────────────────────────────────────
ztil -0.41886 0.163672 -2.56 0.0129 -0.745932 -0.0917875
──────────────────────────────────────────────────────────────────
stderror(HC1(), ols2)
1-element Vector{Float64}:
0.18547948275293047
24.7. We do have weak instruments, because t-stats in regression are less than 4 in absolute value#
function DML_AR_PLIV(rY, rD, rZ, grid, alpha)
n = size(rY)[1]
Cstat = zeros(size(grid))
for i in 1:length(grid)
Cstat[i] = n * ((mean((rY - grid[i] * rD).* rZ))^2) / var((rY - grid[i] * rD).* rZ)
end
data_ar = DataFrame(grid = grid, Cstat = Cstat)
LB = minimum(data_ar[data_ar[:,2] .< chisqinvcdf(1, 1 - alpha), :][!,1])
UB = maximum(data_ar[data_ar[:,2] .< chisqinvcdf(1, 1 - alpha), :][!,1])
println( "UB =" , UB, "LB =" ,LB)
Plots.plot(grid, Cstat, color=:black)
xlabel!("Effect of institutions")
ylabel!("Statistic")
vline!([LB], color=:red, label="1", linestyle=:dash)
vline!([UB], color=:red, label="2", linestyle=:dash)
hline!([chisqinvcdf(1, 1 - alpha)], color=:skyblue, linestyle=:dash)
end
DML_AR_PLIV (generic function with 1 method)
DML_AR_PLIV(data1[!,1], data1[!,2], data1[!,3], collect(range( -2, 2.001, step = 0.01 )), 0.05 )
UB =2.0LB =0.39
DML_AR_PLIV(data2[!,1], data2[!,2], data2[!,3], collect(range( -2, 2.001, step = 0.01 )), 0.05 )
UB =2.0LB =0.34