# !wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# !apt update -q
# !apt install cuda gcc-6 g++-6 -y -q
# !ln -s /usr/bin/gcc-6 /usr/local/cuda/bin/gcc
# !ln -s /usr/bin/g++-6 /usr/local/cuda/bin/g++
# !curl -sSL "https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.3-linux-x86_64.tar.gz" -o julia.tar.gz
# !tar -xzf julia.tar.gz -C /usr --strip-components 1
# !rm -rf julia.tar.gz*
# !julia -e 'using Pkg; pkg"add IJulia; precompile"'
18. DML inference for gun ownership#
We consider the problem of estimating the effect of gun ownership on the homicide rate. For this purpose, we estimate the following partially linear model:

$$Y_{j,t} = \beta D_{j,t-1} + g(Z_{j,t}) + \epsilon_{j,t}.$$
18.1. Data#
\(Y_{j,t}\) is log homicide rate in county \(j\) at time \(t\), \(D_{j, t-1}\) is log fraction of suicides committed with a firearm in county \(j\) at time \(t-1\), which we use as a proxy for gun ownership, and \(Z_{j,t}\) is a set of demographic and economic characteristics of county \(j\) at time \(t\). The parameter \(\beta\) is the effect of gun ownership on the homicide rates, controlling for county-level demographic and economic characteristics.
The sample covers 195 large United States counties from 1980 through 1999, giving us 3900 observations.
using Pkg
# Pkg.add("CSV"), using CSV
# Pkg.add("DataFrames"), using DataFrames
# Pkg.add("StatsModels"), using StatsModels
# Pkg.add("GLM"), using GLM
# Pkg.add("Random"), using Random
using Pkg, CSV, RData, DataFrames, StatsModels, GLM, Random
url = "https://github.com/d2cml-ai/14.388_jl/raw/github_data/data/gun_clean.RData"
download(url, "data.RData")
rdata_read = load("data.RData")
rm("data.RData")
data = rdata_read["data"]
names(data)
println("Number of rows: ",size(data,1))
println("Number of columns: ",size(data,2))
Number of rows: 3900
Number of columns: 198
18.1.1. Preprocessing#
To account for heterogeneity across counties and for time trends in all variables, we remove county-specific and time-specific effects from them in the following preprocessing step.
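Because the panel is balanced (each of the 195 counties appears in all 20 years), residualizing a variable on county and year dummies is equivalent to a two-way within-transformation. The sketch below illustrates this for a single variable; the function name `demean_two_way` and the generic `unit`/`time` arguments are our own illustration, not part of this notebook's pipeline.

using DataFrames, Statistics
# Two-way demeaning for a balanced panel: subtract unit means and time means,
# then add back the grand mean. For a balanced panel this reproduces the
# residuals from regressing the variable on unit and time dummies.
function demean_two_way(df::DataFrame, var::Symbol, unit::Symbol, time::Symbol)
    grand = mean(df[!, var])                                  # grand mean
    g = select(df, unit, time, var)
    transform!(groupby(g, unit), var => mean => :unit_mean)   # per-unit means
    transform!(groupby(g, time), var => mean => :time_mean)   # per-period means
    g[!, var] .- g.unit_mean .- g.time_mean .+ grand
end
# e.g. demean_two_way(panel, :logghomr, :CountyCode, :year) for a hypothetical
# `panel` with an explicit :year column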
################################# Find Variable Names from Dataset ########################
using CategoricalArrays   # needed for the CategoricalArray check below
function varlist(df = nothing, type_dataframe = ["numeric", "categorical", "string"], pattern = "", exclude = nothing)
    varrs = String[]
    if "numeric" in type_dataframe
        append!(varrs, [i for i in names(df) if eltype(df[!, i]) <: Number])
    end
    if "categorical" in type_dataframe
        append!(varrs, [i for i in names(df) if df[!, i] isa CategoricalArray])
    end
    if "string" in type_dataframe
        append!(varrs, [i for i in names(df) if eltype(df[!, i]) <: AbstractString])
    end
    # Keep the names matching `pattern`, dropping any listed in `exclude`
    matched = filter(x -> contains(x, pattern), varrs)
    exclude === nothing ? matched : setdiff(matched, exclude)
end
varlist (generic function with 5 methods)
################################# Create Variables ###############################
# Dummy Variables for Year and County Fixed Effects
fixed = filter(x->contains(x, "X_Jfips"), names(data));
year = filter(x->contains(x, "X_Tyear"), names(data));
census = []
census_var = ["AGE", "BN", "BP", "BZ", "ED", "EL", "HI", "HS", "INC", "LF", "LN", "PI", "PO", "PP", "PV", "SPR", "VS"]
for i in 1:size(census_var,1)
append!(census, filter(x->contains(x, census_var[i]), names(data)))
end
################################ Variables ##################################
# Treatment Variable
d = ["logfssl"];
# Outcome Variable
y = ["logghomr"];
# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"];
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"];
################################# Partial out Fixed Effects ########################
# Variables to be partialled out
variable = [y, d, X1, X2, census]
varlis = []
# Flatten the variable groups into a single list of names
for i in variable
    append!(varlis, i)
end
# Running the following lines takes approx. 10 minutes (depending on your CPU)
example = DataFrame(CountyCode = data[:,"CountyCode"]);
rdata = DataFrame(CountyCode = data[:,"CountyCode"]);
for i in 1:size(varlis,1)
rdata[!,varlis[i]]= residuals(lm(term(Symbol(varlis[i])) ~ sum(term.(Symbol.(year))) + sum(term.(Symbol.(fixed))), data))
end
first(rdata, 6)
6 rows × 198 columns (omitted printing of 191 columns)

| | CountyCode | logghomr | logfssl | logrobr | logburg | burg_missing | robrate_missing |
|---|---|---|---|---|---|---|---|
| | Int64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 |
| 1 | 1073 | -0.134778 | 0.0961271 | 0.150893 | -0.124395 | 0.0104613 | -0.021229 |
| 2 | 1073 | -0.239622 | 0.0808094 | 0.0401683 | -0.134781 | 0.0104613 | -0.0194181 |
| 3 | 1073 | -0.0786772 | 0.0573399 | -0.017679 | -0.167909 | 0.0104613 | -0.0220374 |
| 4 | 1073 | -0.331465 | 0.0816945 | -0.00963344 | -0.22925 | 0.0104613 | -0.0194181 |
| 5 | 1073 | -0.31664 | 0.0253655 | -0.0267151 | -0.176635 | 0.00324793 | -0.0208037 |
| 6 | 1073 | 0.105132 | -0.00677726 | -0.151487 | -0.189069 | 0.0104613 | 0.016953 |
18.2. We check that our results match the R results at 6 decimals#
# load dataset
rdata_read = CSV.File("../data/gun_clean2.csv") |> DataFrame
data_1 = rdata_read[!, names(rdata)]
n = size(data_1,1)
3900
column_names = names(data_1)
# Round both datasets to 6 decimals before comparing
for i in 1:size(data_1, 1)
    for j in 1:size(data_1, 2)
        data_1[i, j] = round(data_1[i, j], digits = 6)
        rdata[i, j] = round(rdata[i, j], digits = 6)
    end
end
# Report each column in which every rounded entry agrees
for col in column_names
    result = sum(data_1[!, col] .== rdata[!, col])
    if result == n
        println("Column ", col, " are equal at 6 decimals")
    end
end
Column CountyCode are equal at 6 decimals
Column logghomr are equal at 6 decimals
Column logfssl are equal at 6 decimals
Column logrobr are equal at 6 decimals
Column logburg are equal at 6 decimals
Column burg_missing are equal at 6 decimals
Column robrate_missing are equal at 6 decimals
Column newblack are equal at 6 decimals
Column newfhh are equal at 6 decimals
Column newmove are equal at 6 decimals
Column newdens are equal at 6 decimals
Column newmal are equal at 6 decimals
Column AGE010D are equal at 6 decimals
Column AGE050D are equal at 6 decimals
Column AGE110D are equal at 6 decimals
Column AGE170D are equal at 6 decimals
Column AGE180D are equal at 6 decimals
Column AGE270D are equal at 6 decimals
Column AGE310D are equal at 6 decimals
Column AGE320D are equal at 6 decimals
Column AGE350D are equal at 6 decimals
Column AGE380D are equal at 6 decimals
Column AGE410D are equal at 6 decimals
Column AGE470D are equal at 6 decimals
Column AGE570D are equal at 6 decimals
Column AGE640D are equal at 6 decimals
Column AGE670D are equal at 6 decimals
Column AGE760D are equal at 6 decimals
Column BNK010D are equal at 6 decimals
Column BNK050D are equal at 6 decimals
Column BPS030D are equal at 6 decimals
Column BPS130D are equal at 6 decimals
Column BPS230D are equal at 6 decimals
Column BPS020D are equal at 6 decimals
Column BPS120D are equal at 6 decimals
Column BPS220D are equal at 6 decimals
Column BPS820D are equal at 6 decimals
Column BZA010D are equal at 6 decimals
Column BZA110D are equal at 6 decimals
Column BZA210D are equal at 6 decimals
Column EDU100D are equal at 6 decimals
Column EDU200D are equal at 6 decimals
Column EDU600D are equal at 6 decimals
Column EDU610D are equal at 6 decimals
Column EDU620D are equal at 6 decimals
Column EDU630D are equal at 6 decimals
Column EDU635D are equal at 6 decimals
Column EDU640D are equal at 6 decimals
Column EDU650D are equal at 6 decimals
Column EDU680D are equal at 6 decimals
Column EDU685D are equal at 6 decimals
Column ELE010D are equal at 6 decimals
Column ELE020D are equal at 6 decimals
Column ELE025D are equal at 6 decimals
Column ELE030D are equal at 6 decimals
Column ELE035D are equal at 6 decimals
Column ELE060D are equal at 6 decimals
Column ELE065D are equal at 6 decimals
Column ELE210D are equal at 6 decimals
Column ELE220D are equal at 6 decimals
Column HIS010D are equal at 6 decimals
Column HIS020D are equal at 6 decimals
Column HIS030D are equal at 6 decimals
Column HIS040D are equal at 6 decimals
Column HIS110D are equal at 6 decimals
Column HIS120D are equal at 6 decimals
Column HIS130D are equal at 6 decimals
Column HIS140D are equal at 6 decimals
Column HIS200D are equal at 6 decimals
Column HIS300D are equal at 6 decimals
Column HIS500D are equal at 6 decimals
Column HIS700D are equal at 6 decimals
Column HSD010D are equal at 6 decimals
Column HSD020D are equal at 6 decimals
Column HSD030D are equal at 6 decimals
Column HSD110D are equal at 6 decimals
Column HSD120D are equal at 6 decimals
Column HSD130D are equal at 6 decimals
Column HSD140D are equal at 6 decimals
Column HSD150D are equal at 6 decimals
Column HSD170D are equal at 6 decimals
Column HSD200D are equal at 6 decimals
Column HSD210D are equal at 6 decimals
Column HSD230D are equal at 6 decimals
Column HSD300D are equal at 6 decimals
Column HSD310D are equal at 6 decimals
Column HSG030D are equal at 6 decimals
Column HSG195D are equal at 6 decimals
Column HSG200D are equal at 6 decimals
Column HSG220D are equal at 6 decimals
Column HSG440D are equal at 6 decimals
Column HSG445D are equal at 6 decimals
Column HSG460D are equal at 6 decimals
Column HSG680D are equal at 6 decimals
Column HSG700D are equal at 6 decimals
Column HSD410D are equal at 6 decimals
Column HSD500D are equal at 6 decimals
Column HSD510D are equal at 6 decimals
Column HSD520D are equal at 6 decimals
Column HSD530D are equal at 6 decimals
Column HSD540D are equal at 6 decimals
Column HSD550D are equal at 6 decimals
Column HSD560D are equal at 6 decimals
Column HSD570D are equal at 6 decimals
Column HSD580D are equal at 6 decimals
Column HSD590D are equal at 6 decimals
Column HSD610D are equal at 6 decimals
Column HSD620D are equal at 6 decimals
Column HSD710D are equal at 6 decimals
Column HSD720D are equal at 6 decimals
Column HSD730D are equal at 6 decimals
Column HSD740D are equal at 6 decimals
Column HSD750D are equal at 6 decimals
Column HSD760D are equal at 6 decimals
Column HSD770D are equal at 6 decimals
Column HSD780D are equal at 6 decimals
Column HSG040D are equal at 6 decimals
Column HSG045D are equal at 6 decimals
Column HSG050D are equal at 6 decimals
Column HSG182D are equal at 6 decimals
Column HSG210D are equal at 6 decimals
Column HSG230D are equal at 6 decimals
Column HSG240D are equal at 6 decimals
Column HSG250D are equal at 6 decimals
Column HSG310D are equal at 6 decimals
Column HSG315D are equal at 6 decimals
Column HSG320D are equal at 6 decimals
Column HSG325D are equal at 6 decimals
Column HSG335D are equal at 6 decimals
Column HSG350D are equal at 6 decimals
Column HSG370D are equal at 6 decimals
Column HSG375D are equal at 6 decimals
Column HSG380D are equal at 6 decimals
Column HSG450D are equal at 6 decimals
Column HSG490D are equal at 6 decimals
Column HSG500D are equal at 6 decimals
Column HSG510D are equal at 6 decimals
Column HSG520D are equal at 6 decimals
Column HSG530D are equal at 6 decimals
Column HSG540D are equal at 6 decimals
Column HSG550D are equal at 6 decimals
Column HSG560D are equal at 6 decimals
Column HSG570D are equal at 6 decimals
Column HSG650D are equal at 6 decimals
Column HSG690D are equal at 6 decimals
Column HSG710D are equal at 6 decimals
Column HSG730D are equal at 6 decimals
Column INC110D are equal at 6 decimals
Column INC650D are equal at 6 decimals
Column INC670D are equal at 6 decimals
Column INC680D are equal at 6 decimals
Column INC690D are equal at 6 decimals
Column INC700D are equal at 6 decimals
Column INC710D are equal at 6 decimals
Column INC720D are equal at 6 decimals
Column INC730D are equal at 6 decimals
Column INC760D are equal at 6 decimals
Column INC790D are equal at 6 decimals
Column LFE020D are equal at 6 decimals
Column LFE023D are equal at 6 decimals
Column LFE030D are equal at 6 decimals
Column LFE080D are equal at 6 decimals
Column LFE090D are equal at 6 decimals
Column LFE210D are equal at 6 decimals
Column LFE220D are equal at 6 decimals
Column LND110D are equal at 6 decimals
Column PIN020D are equal at 6 decimals
Column POP110D are equal at 6 decimals
Column POP210D are equal at 6 decimals
Column POP240D are equal at 6 decimals
Column POP440D are equal at 6 decimals
Column POP450D are equal at 6 decimals
Column POP470D are equal at 6 decimals
Column POP480D are equal at 6 decimals
Column POP540D are equal at 6 decimals
Column POP550D are equal at 6 decimals
Column POP570D are equal at 6 decimals
Column POP580D are equal at 6 decimals
Column POP700D are equal at 6 decimals
Column POP710D are equal at 6 decimals
Column POP720D are equal at 6 decimals
Column POP740D are equal at 6 decimals
Column PPQ010D are equal at 6 decimals
Column PPQ100D are equal at 6 decimals
Column PPQ110D are equal at 6 decimals
Column PPQ120D are equal at 6 decimals
Column PVY020D are equal at 6 decimals
Column PVY120D are equal at 6 decimals
Column PVY210D are equal at 6 decimals
Column PVY310D are equal at 6 decimals
Column PVY420D are equal at 6 decimals
Column PVY520D are equal at 6 decimals
Column SPR030D are equal at 6 decimals
Column SPR130D are equal at 6 decimals
Column SPR230D are equal at 6 decimals
Column SPR330D are equal at 6 decimals
Column SPR440D are equal at 6 decimals
Column VST020D are equal at 6 decimals
Now, we can construct the treatment variable, the outcome variable and the matrix \(Z\) that includes the control variables.
# Treatment variable
D = rdata[!,d]
# Outcome variable
Y = rdata[!,y];
# Construct matrix Z
Z = rdata[!, varlis[3:end]];
We have in total 195 control variables. The control variables \(Z_{j,t}\) are from the U.S. Census Bureau and contain demographic and economic characteristics of the counties, such as the age distribution, the income distribution, crime rates, federal spending, home ownership rates, house prices, educational attainment, voting patterns, employment statistics, and migration rates.
clu = select(rdata,:CountyCode)
data = hcat(Y,D,Z,clu);
first(data, 6)
6 rows × 198 columns (omitted printing of 192 columns)

| | logghomr | logfssl | logrobr | logburg | burg_missing | robrate_missing |
|---|---|---|---|---|---|---|
| | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 |
| 1 | -0.134778 | 0.0961271 | 0.150893 | -0.124395 | 0.0104613 | -0.021229 |
| 2 | -0.239622 | 0.0808094 | 0.0401683 | -0.134781 | 0.0104613 | -0.0194181 |
| 3 | -0.0786772 | 0.0573399 | -0.017679 | -0.167909 | 0.0104613 | -0.0220374 |
| 4 | -0.331465 | 0.0816945 | -0.00963344 | -0.22925 | 0.0104613 | -0.0194181 |
| 5 | -0.31664 | 0.0253655 | -0.0267151 | -0.176635 | 0.00324793 | -0.0208037 |
| 6 | 0.105132 | -0.00677726 | -0.151487 | -0.189069 | 0.0104613 | 0.016953 |
size(data), size(rdata)
((3900, 198), (3900, 198))
18.3. The effect of gun ownership#
18.3.1. OLS#
After preprocessing the data, we first look at a simple regression of \(Y_{j,t}\) on \(D_{j,t-1}\) without controls as a baseline model.
using FixedEffectModels
# OLS clustering at the County level
fm_1 = @formula(logghomr ~ 0 + logfssl + fe(CountyCode))
baseline_ols = reg(data, fm_1, Vcov.cluster(:CountyCode))
Fixed Effect Model
==================================================================
Number of obs: 3900 Degrees of freedom: 2
R2: 0.006 R2 Adjusted: 0.006
F-Stat: 18.9732 p-value: 0.000
R2 within: 0.006 Iterations: 1
==================================================================
logghomr | Estimate Std.Error t value Pr(>|t|) Lower 95% Upper 95%
------------------------------------------------------------------
logfssl | 0.282304 0.0648108 4.35582 0.000 0.155238 0.409371
==================================================================
println("2.5% : ", GLM.coeftable(baseline_ols).cols[5])
println("97.5% : " , GLM.coeftable(baseline_ols).cols[6])
println("Estimate: ", GLM.coeftable(baseline_ols).cols[1])
println("Cluster s.e. : " , GLM.r2(baseline_ols))
println("T-value : ", GLM.coeftable(baseline_ols).cols[3])
println("Pr(>|t|) : " , GLM.coeftable(baseline_ols).cols[4])
2.5% : [0.15523818202545506]
97.5% : [0.40937075124096944]
Estimate: [0.28230446663321224]
R2 : 0.006193263836883567
T-value : [4.355824585575914]
Pr(>|t|) : [1.3597670123818405e-5]
The point estimate is \(0.282\), with the 95% confidence interval ranging from 0.155 to 0.409. This suggests that increases in gun ownership rates are related to gun homicide rates: if gun ownership increases by 1% relative to a trend, then the predicted gun homicide rate goes up by 0.28%, without controlling for counties’ characteristics.
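As a quick sanity check, these bounds are approximately the usual two-standard-error band around the point estimate; the sketch below recomputes them from the printed estimate and clustered standard error (the regression output uses a t critical value, so its bounds differ slightly from the normal-approximation ones here).

# Back-of-the-envelope 95% CI from the printed estimate and clustered s.e.
est, se = 0.282304, 0.0648108
println((est - 1.96 * se, est + 1.96 * se))   # ≈ (0.155, 0.409)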
Since our goal is to estimate the effect of gun ownership after controlling for a rich set of county characteristics, we next include the controls. First, we estimate the model by OLS, and then by an array of modern regression methods using the double machine learning approach.
control_formula = term(:logghomr) ~ term(:logfssl) + sum(term.(Symbol.(names(Z)))) + fe(:CountyCode)
control_ols = reg(data, control_formula)
Fixed Effect Model
===================================================================================
Number of obs: 3900 Degrees of freedom: 375
R2: 0.203 R2 Adjusted: 0.118
F-Stat: 4.987 p-value: 0.000
R2 within: 0.203 Iterations: 1
===================================================================================
logghomr | Estimate Std.Error t value Pr(>|t|) Lower 95% Upper 95%
-----------------------------------------------------------------------------------
logfssl | 0.190645 0.0578919 3.29312 0.001 0.0771399 0.30415
logrobr | 0.189046 0.0439522 4.30118 0.000 0.102872 0.275221
logburg | 0.219263 0.062191 3.52564 0.000 0.0973288 0.341197
burg_missing | 1.52961 0.452901 3.37737 0.001 0.641639 2.41759
robrate_missing | 1.13308 0.258481 4.3836 0.000 0.62629 1.63987
newblack | -4.34622 1.01668 -4.27492 0.000 -6.33956 -2.35288
newfhh | 7.71732 6.39769 1.20627 0.228 -4.82622 20.2609
newmove | 42.1106 28.0383 1.5019 0.133 -12.8624 97.0836
newdens | -2.40488 1.53413 -1.56758 0.117 -5.41276 0.602998
newmal | 0.61169 0.2731 2.2398 0.025 0.076239 1.14714
AGE010D | -31.0998 46.3537 -0.670924 0.502 -121.983 59.783
AGE050D | 10.427 5.60406 1.86062 0.063 -0.560503 21.4146
AGE110D | -6.45862 4.34915 -1.48503 0.138 -14.9857 2.06849
AGE170D | -6.44631 4.56503 -1.41211 0.158 -15.3967 2.50406
AGE180D | -11.5315 14.482 -0.79626 0.426 -39.9255 16.8625
AGE270D | 35.655 20.0702 1.77651 0.076 -3.69538 75.0054
AGE310D | 19.3837 25.0502 0.773796 0.439 -29.7306 68.4981
AGE320D | 1.58818 1.50379 1.05612 0.291 -1.36021 4.53658
AGE350D | -9.33085 12.9579 -0.720089 0.472 -34.7366 16.0749
AGE380D | 27.1384 26.3214 1.03104 0.303 -24.4684 78.7452
AGE410D | -9.26452 12.4292 -0.745381 0.456 -33.6337 15.1047
AGE470D | 5.1716 2.36612 2.18568 0.029 0.53249 9.8107
AGE570D | 3.81806 1.91895 1.98966 0.047 0.0556927 7.58042
AGE640D | 1.96507 1.38588 1.41792 0.156 -0.752135 4.68228
AGE670D | -1.72923 1.22096 -1.4163 0.157 -4.12309 0.664618
AGE760D | 0.811396 3.00505 0.270011 0.787 -5.08042 6.70321
BNK010D | -0.0756996 0.0801789 -0.944134 0.345 -0.232901 0.0815022
BNK050D | 0.0185195 0.0748159 0.247534 0.805 -0.128167 0.165206
BPS030D | -0.0670493 0.13632 -0.491852 0.623 -0.334323 0.200225
BPS130D | 0.0281751 0.141942 0.198497 0.843 -0.250122 0.306472
BPS230D | -0.0730225 0.0672526 -1.08579 0.278 -0.204881 0.0588355
BPS020D | -0.0832099 0.0854561 -0.973716 0.330 -0.250758 0.0843385
BPS120D | 0.0354868 0.0734394 0.483212 0.629 -0.108501 0.179475
BPS220D | 0.0676879 0.0405164 1.67063 0.095 -0.01175 0.147126
BPS820D | 0.0314011 0.0195249 1.60826 0.108 -0.0068801 0.0696823
BZA010D | 0.341185 0.306873 1.11181 0.266 -0.260481 0.942851
BZA110D | -0.135647 0.128622 -1.05462 0.292 -0.387828 0.116534
BZA210D | -0.0971895 0.175951 -0.552366 0.581 -0.442166 0.247787
EDU100D | 0.0 NaN NaN NaN NaN NaN
EDU200D | 0.0 NaN NaN NaN NaN NaN
EDU600D | -50.6628 58.3172 -0.868745 0.385 -165.002 63.6761
EDU610D | -0.428207 0.41455 -1.03294 0.302 -1.24099 0.384575
EDU620D | 1.10019 0.633071 1.73787 0.082 -0.141027 2.34142
EDU630D | 28.5177 54.5575 0.522708 0.601 -78.4499 135.485
EDU635D | -22.7577 54.2392 -0.419581 0.675 -129.101 83.5857
EDU640D | -1.00389 1.09485 -0.916915 0.359 -3.1505 1.14272
EDU650D | -2.01891 0.978975 -2.06227 0.039 -3.93833 -0.0994954
EDU680D | -8.05494 15.3721 -0.523999 0.600 -38.194 22.0841
EDU685D | 7.22624 15.337 0.471164 0.638 -22.844 37.2965
ELE010D | 16.0534 18.9389 0.847645 0.397 -21.0788 53.1857
ELE020D | 2.23734 13.631 0.164137 0.870 -24.4881 28.9628
ELE025D | -2.56733 13.6226 -0.188462 0.851 -29.2762 24.1416
ELE030D | -18.6684 13.0853 -1.42667 0.154 -44.3239 6.98717
ELE035D | 18.6815 13.0753 1.42877 0.153 -6.95434 44.3174
ELE060D | 0.00555417 0.0178076 0.311898 0.755 -0.0293602 0.0404685
ELE065D | -0.014298 0.0210801 -0.678268 0.498 -0.0556284 0.0270325
ELE210D | -0.329838 0.131981 -2.49913 0.012 -0.588606 -0.0710706
ELE220D | -0.00283513 0.0551682 -0.0513907 0.959 -0.111 0.10533
HIS010D | 0.0 NaN NaN NaN NaN NaN
HIS020D | 18.9167 10.7819 1.75448 0.079 -2.22272 40.0561
HIS030D | -9.10581 5.23129 -1.74064 0.082 -19.3625 1.15084
HIS040D | -9.42998 5.59639 -1.68501 0.092 -20.4025 1.54252
HIS110D | -0.835275 0.42122 -1.98299 0.047 -1.66114 -0.00941463
HIS120D | 0.0609434 1.18955 0.0512323 0.959 -2.27133 2.39322
HIS130D | -0.0268448 0.0991369 -0.270785 0.787 -0.221216 0.167527
HIS140D | -0.143565 0.13609 -1.05492 0.292 -0.410388 0.123259
HIS200D | 0.0 NaN NaN NaN NaN NaN
HIS300D | 0.0 NaN NaN NaN NaN NaN
HIS500D | 0.0 NaN NaN NaN NaN NaN
HIS700D | 0.0 NaN NaN NaN NaN NaN
HSD010D | 76.4943 110.246 0.69385 0.488 -139.658 292.647
HSD020D | -0.153102 0.0594808 -2.57397 0.010 -0.269722 -0.0364813
HSD030D | 5.6202 3.39774 1.6541 0.098 -1.04152 12.2819
HSD110D | 76.8705 69.7297 1.10241 0.270 -59.8442 213.585
HSD120D | -17.7899 10.6873 -1.66459 0.096 -38.7437 3.16397
HSD130D | -78.271 32.262 -2.4261 0.015 -141.525 -15.0169
HSD140D | 12.5449 8.17146 1.53521 0.125 -3.47632 28.5662
HSD150D | -2.97057 2.16737 -1.37059 0.171 -7.22 1.27886
HSD170D | -6.81635 5.60026 -1.21715 0.224 -17.7964 4.16372
HSD200D | -58.9814 20.7961 -2.83617 0.005 -99.7551 -18.2078
HSD210D | 33.4968 11.8303 2.83143 0.005 10.3018 56.6918
HSD230D | -3.18941 3.03845 -1.04968 0.294 -9.14671 2.76789
HSD300D | 11.7346 106.498 0.110186 0.912 -197.07 220.539
HSD310D | -20.4372 20.0619 -1.01871 0.308 -59.7714 18.897
HSG030D | -67.0834 81.6017 -0.822082 0.411 -227.075 92.9081
HSG195D | -0.0667794 0.364696 -0.18311 0.855 -0.781817 0.648258
HSG200D | 0.0 NaN NaN NaN NaN NaN
HSG220D | -1.36752 1.80298 -0.758478 0.448 -4.90252 2.16747
HSG440D | -398.106 237.504 -1.67621 0.094 -863.765 67.5536
HSG445D | 64.7986 59.6396 1.0865 0.277 -52.133 181.73
HSG460D | 0.39361 0.819571 0.480263 0.631 -1.21327 2.00049
HSG680D | -80.9783 114.06 -0.709961 0.478 -304.609 142.652
HSG700D | 1.19998 1.00046 1.19942 0.230 -0.761566 3.16152
HSD410D | -33.7818 74.2015 -0.455271 0.649 -179.264 111.7
HSD500D | -86.7182 70.9099 -1.22293 0.221 -225.747 52.3105
HSD510D | 96.7617 30.9344 3.12796 0.002 36.1106 157.413
HSD520D | -8.16829 6.54038 -1.2489 0.212 -20.9916 4.65502
HSD530D | -0.803146 6.52354 -0.123115 0.902 -13.5934 11.9872
HSD540D | 4.7532 2.64694 1.79574 0.073 -0.436486 9.94288
HSD550D | -0.836034 1.01653 -0.822438 0.411 -2.82908 1.15701
HSD560D | -0.343998 1.37207 -0.250714 0.802 -3.03414 2.34614
HSD570D | 0.0 NaN NaN NaN NaN NaN
HSD580D | 2.34596 2.04351 1.148 0.251 -1.66062 6.35253
HSD590D | 2.71956 1.44161 1.88647 0.059 -0.106914 5.54604
HSD610D | 28.2381 23.6197 1.19553 0.232 -18.0716 74.5478
HSD620D | -31.3837 12.891 -2.43454 0.015 -56.6583 -6.10909
HSD710D | 41.6004 95.7404 0.434513 0.664 -146.112 229.313
HSD720D | 0.0 NaN NaN NaN NaN NaN
HSD730D | -7.50721 8.94448 -0.839312 0.401 -25.0441 10.0297
HSD740D | -14.6973 6.09608 -2.41094 0.016 -26.6495 -2.74511
HSD750D | 0.865632 0.399115 2.16888 0.030 0.0831119 1.64815
HSD760D | 12.9136 4.98484 2.59058 0.010 3.14016 22.6871
HSD770D | 14.3236 7.30444 1.96095 0.050 0.00226319 28.645
HSD780D | -2.01614 0.862821 -2.33668 0.020 -3.70782 -0.324459
HSG040D | 61.085 82.7939 0.737796 0.461 -101.244 223.414
HSG045D | 0.225821 0.108449 2.08228 0.037 0.0131921 0.438451
HSG050D | 0.0 NaN NaN NaN NaN NaN
HSG182D | 331.042 220.275 1.50286 0.133 -100.838 762.922
HSG210D | 5.42601 1.90796 2.84388 0.004 1.68519 9.16683
HSG230D | 0.0 NaN NaN NaN NaN NaN
HSG240D | 13.3649 6.53496 2.04513 0.041 0.552168 26.1775
HSG250D | 0.117331 0.332084 0.353315 0.724 -0.533766 0.768428
HSG310D | 0.182678 0.33501 0.545291 0.586 -0.474155 0.839511
HSG315D | 0.0783361 0.156528 0.500461 0.617 -0.228558 0.38523
HSG320D | 0.1786 0.219242 0.814625 0.415 -0.251254 0.608453
HSG325D | -0.0160287 0.0874834 -0.18322 0.855 -0.187552 0.155495
HSG335D | 0.077291 0.105222 0.734549 0.463 -0.129012 0.283594
HSG350D | 0.0525904 0.0497627 1.05682 0.291 -0.0449763 0.150157
HSG370D | -1.06479 0.618502 -1.72156 0.085 -2.27745 0.147868
HSG375D | -0.843125 1.53396 -0.54964 0.583 -3.85066 2.16441
HSG380D | -2.10192 2.36916 -0.887201 0.375 -6.74699 2.54314
HSG450D | 0.202061 0.848067 0.238261 0.812 -1.46069 1.86481
HSG490D | 0.711425 0.433281 1.64195 0.101 -0.138081 1.56093
HSG500D | 0.0765713 0.161625 0.473759 0.636 -0.240316 0.393459
HSG510D | -0.0546356 0.254196 -0.214934 0.830 -0.553023 0.443751
HSG520D | 0.339298 0.326444 1.03938 0.299 -0.30074 0.979337
HSG530D | -0.489419 0.249349 -1.96278 0.050 -0.978302 -0.000535244
HSG540D | 0.391536 0.186473 2.09969 0.036 0.0259303 0.757142
HSG550D | -0.178454 0.176789 -1.00942 0.313 -0.525073 0.168165
HSG560D | 0.0440896 0.212525 0.207456 0.836 -0.372596 0.460775
HSG570D | -0.029629 0.157865 -0.187685 0.851 -0.339146 0.279888
HSG650D | 0.680951 0.830388 0.820039 0.412 -0.947139 2.30904
HSG690D | -0.45285 1.06089 -0.426858 0.670 -2.53287 1.62717
HSG710D | 79.6322 113.831 0.699564 0.484 -143.549 302.814
HSG730D | -0.923085 0.607052 -1.5206 0.128 -2.11329 0.267124
INC110D | -1.23532 1.05831 -1.16725 0.243 -3.31028 0.839646
INC650D | -0.644222 0.999919 -0.644275 0.519 -2.6047 1.31626
INC670D | -0.170282 0.557046 -0.305688 0.760 -1.26245 0.921883
INC680D | -0.591852 0.710938 -0.832494 0.405 -1.98574 0.80204
INC690D | 0.584595 0.780246 0.749244 0.454 -0.945185 2.11437
INC700D | 0.117036 0.740572 0.158035 0.874 -1.33496 1.56903
INC710D | 0.347172 0.888027 0.390947 0.696 -1.39393 2.08827
INC720D | -1.70807 0.844535 -2.0225 0.043 -3.3639 -0.0522468
INC730D | 0.148319 0.951026 0.155957 0.876 -1.7163 2.01294
INC760D | 1.01734 0.74768 1.36067 0.174 -0.448585 2.48327
INC790D | -0.665296 0.472569 -1.40783 0.159 -1.59183 0.261241
LFE020D | -11.1221 6.71988 -1.6551 0.098 -24.2973 2.05319
LFE023D | 5.50736 5.57333 0.988163 0.323 -5.41992 16.4346
LFE030D | 0.192258 0.0815242 2.3583 0.018 0.0324188 0.352097
LFE080D | -0.193538 0.682271 -0.283667 0.777 -1.53122 1.14415
LFE090D | 0.992654 3.18155 0.312003 0.755 -5.24521 7.23052
LFE210D | 2.53462 1.92188 1.31882 0.187 -1.2335 6.30274
LFE220D | 0.192318 0.556457 0.345611 0.730 -0.898692 1.28333
LND110D | -0.953834 1.25919 -0.757499 0.449 -3.42265 1.51498
PIN020D | 0.238888 0.322375 0.741025 0.459 -0.393173 0.870949
POP110D | 0.0 NaN NaN NaN NaN NaN
POP210D | -0.97205 0.862226 -1.12737 0.260 -2.66256 0.718462
POP240D | 0.0 NaN NaN NaN NaN NaN
POP440D | -3.881 2.36789 -1.63901 0.101 -8.52357 0.761582
POP450D | 11.6363 8.56405 1.35874 0.174 -5.1547 28.4273
POP470D | -3.53761 1.20799 -2.92852 0.003 -5.90604 -1.16919
POP480D | -1.77262 1.34222 -1.32067 0.187 -4.40422 0.858975
POP540D | -1.48071 2.19712 -0.673931 0.500 -5.78847 2.82705
POP550D | -28.1907 10.8632 -2.59506 0.009 -49.4895 -6.89191
POP570D | 0.166538 1.84645 0.0901933 0.928 -3.45369 3.78677
POP580D | 2.25075 1.7154 1.31208 0.190 -1.11253 5.61403
POP700D | 44.3169 33.8678 1.30853 0.191 -22.0856 110.719
POP710D | -13.1278 8.3321 -1.57557 0.115 -29.464 3.20843
POP720D | 0.891964 0.817457 1.09114 0.275 -0.710773 2.4947
POP740D | 0.406846 0.386029 1.05393 0.292 -0.350017 1.16371
PPQ010D | -0.31431 0.402496 -0.780902 0.435 -1.10346 0.474839
PPQ100D | 0.490539 0.261388 1.87667 0.061 -0.0219473 1.00303
PPQ110D | 0.0 NaN NaN NaN NaN NaN
PPQ120D | 0.0 NaN NaN NaN NaN NaN
PVY020D | -1.14716 2.74035 -0.418619 0.676 -6.52 4.22568
PVY120D | 2.03161 1.93462 1.05014 0.294 -1.76147 5.82469
PVY210D | 0.307278 0.5058 0.607508 0.544 -0.684413 1.29897
PVY310D | -0.428341 0.849478 -0.50424 0.614 -2.09386 1.23718
PVY420D | 0.611491 1.82103 0.335794 0.737 -2.9589 4.18188
PVY520D | -0.326862 0.690404 -0.473436 0.636 -1.68049 1.02677
SPR030D | -0.500169 0.443742 -1.12716 0.260 -1.37019 0.369848
SPR130D | -1.65748 0.831641 -1.99303 0.046 -3.28803 -0.0269365
SPR230D | 1.44839 0.797224 1.8168 0.069 -0.114674 3.01146
SPR330D | -0.0336548 0.322708 -0.104289 0.917 -0.666369 0.599059
SPR440D | -0.431825 0.293288 -1.47236 0.141 -1.00686 0.143207
VST020D | 0.532202 0.264856 2.00941 0.045 0.0129168 1.05149
===================================================================================
println("For <<logfssl>> variable: ")
println("2.5% : ", GLM.coeftable(control_ols).cols[5][1])
println("97.5% : " , GLM.coeftable(control_ols).cols[6][1])
println("Estimate: ", GLM.coeftable(control_ols).cols[1][1])
println("Cluster s.e. : " , GLM.r2(control_ols))
println("T-value : ", GLM.coeftable(control_ols).cols[3][1])
println("Pr(>|t|) : " , GLM.coeftable(control_ols).cols[4][1])
For <<logfssl>> variable:
2.5% : 0.07713994895431024
97.5% : 0.3041501481690573
Estimate: 0.19064504856168377
R2 : 0.20296825192995305
T-value : 3.2931187658598318
Pr(>|t|) : 0.0010006039635811058
18.4. DML algorithm#
Here we perform inference on the predictive coefficient \(\beta\) in our partially linear statistical model using the double machine learning approach.
For \(\tilde Y = Y - E(Y|Z)\) and \(\tilde D = D - E(D|Z)\), we can write

$$\tilde Y = \beta \tilde D + \epsilon, \quad E(\epsilon | \tilde D) = 0.$$

Using cross-fitting, we employ modern regression methods to build estimators \(\hat \ell(Z)\) and \(\hat m(Z)\) of \(\ell(Z):=E(Y|Z)\) and \(m(Z):=E(D|Z)\), and obtain the estimates of the residualized quantities:

$$\tilde Y_i = Y_i - \hat \ell(Z_i), \quad \tilde D_i = D_i - \hat m(Z_i).$$
Finally, using ordinary least squares of \(\tilde Y_i\) on \(\tilde D_i\), we obtain the estimate of \(\beta\).
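In closed form, this final stage is the univariate least squares formula applied to the residualized data; a standard way to write the resulting estimator (shown for reference, not taken from this notebook's code) is

$$\hat \beta = \frac{\sum_{i=1}^{n} \tilde D_i \tilde Y_i}{\sum_{i=1}^{n} \tilde D_i^{2}},$$

with standard errors computed from this final-stage regression, clustered at the county level.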
using MLDataUtils, MLBase
function DML2_for_PLM(z, d, y, dreg, yreg, nfold, clu)
    # Number of observations
    nobser = size(z, 1)
    # Define folds indices (each entry holds the training indices for one fold)
    foldid = collect(Kfold(size(z)[1], nfold))
    # Create arrays to save the cross-fitted residuals
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: ")
    # loop to save results
    for i in 1:nfold
        # Fit the ML models on the training folds
        dfit = dreg(z[foldid[i], :], d[foldid[i]])
        yfit = yreg(z[foldid[i], :], y[foldid[i]])
        # Predict on the held-out fold
        dhat = GLM.predict(dfit, z[Not(foldid[i]), :])
        yhat = GLM.predict(yfit, z[Not(foldid[i]), :])
        # Save the residuals for the held-out observations
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    # Create dataframe
    data = DataFrame(ytil = ytil, dtil = dtil, clu = clu)
    # Final-stage OLS with county fixed effects
    rfit = reg(data, @formula(ytil ~ dtil + fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]
    println(" coef (se) = ", coef_est, "(", se, ")")
    return rfit, data
end
DML2_for_PLM (generic function with 1 method)
Now, we apply the Double Machine Learning (DML) approach with different machine learning methods; we load the relevant library at the start of each section.
Let us construct the input matrices.
# Create main variables
z = Matrix(Z);
d = D[!,1];
y = Y[!,1];
clu = rdata[!, :CountyCode];
first(DataFrame(logghomr = y,logfssl = d,CountyCode = clu ),6)
6 rows × 3 columns

| | logghomr | logfssl | CountyCode |
|---|---|---|---|
| | Float64 | Float64 | Int64 |
| 1 | -0.134778 | 0.0961271 | 1073 |
| 2 | -0.239622 | 0.0808094 | 1073 |
| 3 | -0.0786772 | 0.0573399 | 1073 |
| 4 | -0.331465 | 0.0816945 | 1073 |
| 5 | -0.31664 | 0.0253655 | 1073 |
| 6 | 0.105132 | -0.00677726 | 1073 |
In the following, we apply the DML approach with different versions of lasso.
18.5. Lasso#
using Lasso
Random.seed!(123)
dreg(z,d) = fit(LassoModel, z, d, standardize = false)
yreg(z,y) = fit(LassoModel, z, y, standardize = false)
DML2_lasso = DML2_for_PLM(z, d, y, dreg, yreg, 10, clu);
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.21585895494601096([0.056923806335367456])
18.6. HDM.JL#
We are going to replicate the above regressions using the hdm library.
include("hdmjl/hdmjl.jl")
function DML2_lasso_cv_hdm(z, d, y, dreg, yreg, nfold, clu)
    # Number of observations (clu is kept for a uniform signature with the other DML routines)
    nobser = size(z, 1)
    # Define folds indices
    foldid = collect(Kfold(size(z)[1], nfold))
    # Create arrays to save the residuals
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: ")
    # loop to save results
    for i in 1:nfold
        # rlasso returns in-sample residuals for the observations it was fit on,
        # so we store them for the fold's training indices
        dtil[foldid[i]] = dreg(z[foldid[i], :], d[foldid[i]])
        ytil[foldid[i]] = yreg(z[foldid[i], :], y[foldid[i]])
        println(i)
    end
    # Create dataframe
    data = DataFrame(ytil = ytil, dtil = dtil)
    # Final-stage OLS of the residualized outcome on the residualized treatment
    rfit = reg(data, @formula(ytil ~ 0 + dtil))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]
    println(" coef (se) = ", coef_est, "(", se, ")")
    return rfit, data
end
DML2_lasso_cv_hdm (generic function with 1 method)
function dreg(z, d)
    # rlasso residualizes the treatment on the controls
    res_d_arg = rlasso_arg(z, d, nothing, true, true, true, false, false,
        nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true)
    rlasso(res_d_arg)["residuals"]
end
function yreg(z, y)
    # rlasso residualizes the outcome on the controls
    res_y_arg = rlasso_arg(z, y, nothing, true, true, true, false, false,
        nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true)
    rlasso(res_y_arg)["residuals"]
end
DML2_lasso_hdm = DML2_lasso_cv_hdm(z, d, y, dreg, yreg, 10, clu);
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.22759871882516988([0.06080798929481985])
18.7. GLMNET#
using GLMNet
function DML2_lasso_cv(z, d, y, dreg, yreg, nfold, clu)
    # Number of observations
    nobser = size(z, 1)
    # Define folds indices
    foldid = collect(Kfold(size(z)[1], nfold))
    # Create arrays to save the cross-fitted residuals
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: ")
    # loop to save results
    for i in 1:nfold
        # Fit the cross-validated glmnet models on the training folds
        dfit = dreg(z[foldid[i], :], d[foldid[i]])
        yfit = yreg(z[foldid[i], :], y[foldid[i]])
        # Predict on the held-out fold
        dhat = GLMNet.predict(dfit, z[Not(foldid[i]), :])
        yhat = GLMNet.predict(yfit, z[Not(foldid[i]), :])
        # Save the residuals for the held-out observations
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    # Create dataframe
    data = DataFrame(ytil = ytil, dtil = dtil, clu = clu)
    # Final-stage OLS with county fixed effects
    rfit = reg(data, @formula(ytil ~ dtil + fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]
    println(" coef (se) = ", coef_est, "(", se, ")")
    return rfit, data
end
DML2_lasso_cv (generic function with 1 method)
18.7.1. ML method = lasso from glmnet#
##ML method = lasso from glmnet
dreg(z, d) = glmnetcv(z, d, alpha = 1)
yreg(z, y) = glmnetcv(z, y, alpha = 1)
yreg (generic function with 1 method)
DML2_lasso_cv_1 = DML2_lasso_cv(z, d, y, dreg, yreg, 10, clu);
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.20125677024540278([0.05721047723253721])
18.7.2. ML method = Elastic net from glmnet#
##ML method = elastic net from glmnet
dreg(z, d) = glmnetcv(z, d, alpha = 0.5)
yreg(z, y) = glmnetcv(z, y, alpha = 0.5)
DML2_elnet = DML2_lasso_cv(z, d, y, dreg, yreg, 10, clu);
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.21993346167089925([0.05666642136725058])
18.7.3. ML method = Ridge from glmnet#
##ML method = ridge from glmnet
dreg(z, d) = glmnetcv(z, d, alpha = 0)
yreg(z, y) = glmnetcv(z, y, alpha = 0)
DML2_ridge = DML2_lasso_cv(z, d, y, dreg, yreg, 10, clu);
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.2226512719332019([0.056771317887477434])
Here we also compute DML with OLS used as the ML method.
Random.seed!(123)
dreg(z,d) = lm(z,d)
yreg(z,y) = lm(z,y)
DML2_ols = DML2_for_PLM(z, d, y, dreg, yreg, 10, clu);
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.1923864674102141([0.05591060950811912])
Next, we also apply Random Forest for comparison purposes.
18.7.4. Random Forest#
import Pkg; Pkg.add("MLJ")
import Pkg; Pkg.add("DecisionTree")
using DecisionTree, MLJ
function DML2_RF(z, d, y, dreg, yreg, nfold, clu)
    # Number of observations
    nobser = size(z, 1)
    # Define folds indices
    foldid = collect(Kfold(size(z)[1], nfold))
    # Create arrays to save the cross-fitted residuals
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: ")
    # loop to save results
    for i in 1:nfold
        # Fit the forests on the training folds
        dfit = dreg(z[foldid[i], :], d[foldid[i]])
        yfit = yreg(z[foldid[i], :], y[foldid[i]])
        # Predict on the held-out fold
        dhat = apply_forest(dfit, z[Not(foldid[i]), :])
        yhat = apply_forest(yfit, z[Not(foldid[i]), :])
        # Save the residuals for the held-out observations
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    # Create dataframe
    data = DataFrame(ytil = ytil, dtil = dtil, clu = clu)
    # Final-stage OLS with county fixed effects
    rfit = reg(data, @formula(ytil ~ dtil + fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]
    println(" coef (se) = ", coef_est, "(", se, ")")
    return rfit, data
end
DML2_RF (generic function with 1 method)
function dreg(z, d)
    # Random forest for the treatment equation (DecisionTree.jl defaults)
    RFmodel = build_forest(d, z)
end
function yreg(z, y)
    # Random forest for the outcome equation (DecisionTree.jl defaults)
    RFmodel = build_forest(y, z)
end
DML2_RF_1 = DML2_RF(z, d, y, dreg, yreg, 10, clu);
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.3175570504921465([0.05941831382579539])
We conclude that gun ownership rates are related to gun homicide rates: if gun ownership increases by 1% relative to a trend, then the predicted gun homicide rate goes up by about 0.20%, controlling for counties’ characteristics.
Finally, let’s see which method actually performs best: we compute the RMSE for predicting D and Y.
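Since the residuals stored by each DML routine are out-of-sample prediction errors, the RMSEs we want are

$$\mathrm{RMSE}_Y = \sqrt{\frac{1}{n}\sum_{i=1}^{n} \tilde Y_i^{2}}, \qquad \mathrm{RMSE}_D = \sqrt{\frac{1}{n}\sum_{i=1}^{n} \tilde D_i^{2}}.$$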
using PrettyTables, Statistics
mods = [DML2_ols, DML2_lasso, DML2_lasso_cv_1, DML2_ridge, DML2_elnet, DML2_RF_1];
mods_name = ["DML2_ols", "DML2_lasso", "DML2_lasso_cv", "DML2_ridge", "DML2_elnet", "DML2_RF"];
RMSE_Y = []
RMSE_D = []
for i in mods
    # The stored residuals are out-of-sample prediction errors,
    # so the RMSE is the square root of their mean square
    push!(RMSE_Y, sqrt(mean(i[2][!, 1] .^ 2)))
    push!(RMSE_D, sqrt(mean(i[2][!, 2] .^ 2)))
end
result = DataFrame([mods_name RMSE_Y RMSE_D], [:Models, :RMSE_Y, :RMSE_D])
pretty_table(result; formatters = ft_printf("%5.10f"))
┌───────────────┬──────────────┬──────────────┐
│ Models │ RMSE_Y │ RMSE_D │
│ Any │ Any │ Any │
├───────────────┼──────────────┼──────────────┤
│ DML2_ols │ 0.0011135761 │ 0.0002411937 │
│ DML2_lasso │ 0.0008511270 │ 0.0000111794 │
│ DML2_lasso_cv │ 0.0010020822 │ 0.0001238803 │
│ DML2_ridge │ 0.0015664215 │ 0.0000376254 │
│ DML2_elnet │ 0.0005612559 │ 0.0002353224 │
│ DML2_RF │ 0.0005524424 │ 0.0005975433 │
└───────────────┴──────────────┴──────────────┘
Based on this comparison, we use Lasso to predict D and CV Ridge to predict Y, and combine the two in a final estimator.
# DML with cross-validated Lasso for D and cross-validated Ridge for Y:
dreg(z,d) = glmnetcv(z,d, alpha = 1)
yreg(z,y) = glmnetcv(z,y, alpha = 0)
DML2_best = DML2_lasso_cv(z, d, y, dreg, yreg, 10, clu);
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.22358046307058665([0.0566348880807125])
ols_coef = GLM.coeftable(baseline_ols).cols[1][1]
ols_std = GLM.coeftable(baseline_ols).cols[2][1]
control_ols_coef = GLM.coeftable(control_ols).cols[1][1]
control_ols_std = GLM.coeftable(control_ols).cols[2][1]
lasso_coef = GLM.coeftable(DML2_lasso[1]).cols[1][1]
lasso_std = GLM.coeftable(DML2_lasso[1]).cols[2][1]
DML2_lasso_cv_1_coef = GLM.coeftable(DML2_lasso_cv_1[1]).cols[1][1]
DML2_lasso_cv_1_std = GLM.coeftable(DML2_lasso_cv_1[1]).cols[2][1]
DML2_elnet_coef = GLM.coeftable(DML2_elnet[1]).cols[1][1]
DML2_elnet_std = GLM.coeftable(DML2_elnet[1]).cols[2][1]
DML2_ridge_coef = GLM.coeftable(DML2_ridge[1]).cols[1][1]
DML2_ridge_std = GLM.coeftable(DML2_ridge[1]).cols[2][1]
DML2_RF_1_coef = GLM.coeftable(DML2_RF_1[1]).cols[1][1]
DML2_RF_1_std = GLM.coeftable(DML2_RF_1[1]).cols[2][1]
DML2_best_coef = GLM.coeftable(DML2_best[1]).cols[1][1]
DML2_best_std = GLM.coeftable(DML2_best[1]).cols[2][1];
tabla = DataFrame(modelos = ["Baseline OLS", "Least Squares with controls", "Lasso", "CV Lasso", "CV Elnet", "CV Ridge", "Random Forest", "Best"],
Estimate = [ols_coef, control_ols_coef, lasso_coef, DML2_lasso_cv_1_coef, DML2_elnet_coef, DML2_ridge_coef, DML2_RF_1_coef, DML2_best_coef],
StdError = [ols_std, control_ols_std, lasso_std, DML2_lasso_cv_1_std, DML2_elnet_std, DML2_ridge_std, DML2_RF_1_std, DML2_best_std])
8 rows × 3 columns

| | modelos | Estimate | StdError |
|---|---|---|---|
| | String | Float64 | Float64 |
| 1 | Baseline OLS | 0.282304 | 0.0648108 |
| 2 | Least Squares with controls | 0.190645 | 0.0578919 |
| 3 | Lasso | 0.215859 | 0.0569238 |
| 4 | CV Lasso | 0.201257 | 0.0572105 |
| 5 | CV Elnet | 0.219933 | 0.0566664 |
| 6 | CV Ridge | 0.222651 | 0.0567713 |
| 7 | Random Forest | 0.317557 | 0.0594183 |
| 8 | Best | 0.22358 | 0.0566349 |