# Environment setup (run once in a fresh notebook): install CUDA 9.0, GCC 6, Julia 1.7.3, and IJulia
# !wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# !apt update -q
# !apt install cuda gcc-6 g++-6 -y -q
# !ln -s /usr/bin/gcc-6 /usr/local/cuda/bin/gcc
# !ln -s /usr/bin/g++-6 /usr/local/cuda/bin/g++
# !curl -sSL "https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.3-linux-x86_64.tar.gz" -o julia.tar.gz
# !tar -xzf julia.tar.gz -C /usr --strip-components 1
# !rm -rf julia.tar.gz*
# !julia -e 'using Pkg; pkg"add IJulia; precompile"'
19. DML inference using NN for gun ownership
In this lab, we estimate the effect of gun ownership on the homicide rate using a neural network.
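The analysis rests on the partially linear model behind double/debiased machine learning (DML): with \(Y\) the (log) homicide rate, \(D\) the gun-ownership measure, and \(Z\) a rich set of controls,

\[
Y = D\beta + g(Z) + \varepsilon, \qquad E[\varepsilon \mid D, Z] = 0,
\]

where \(g\) is an unknown nuisance function that we will estimate with a neural network.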
# Uncomment to install the packages used below:
# using Pkg
# Pkg.add(["CSV", "DataFrames", "StatsModels", "GLM", "Random"])
using DataFrames, StatsModels, GLM
using Statistics, Plots, FixedEffectModels, MLDataUtils, MLBase, RData, Downloads
WARNING: using Downloads.download in module Main conflicts with an existing identifier.
First, we need to load and preprocess the data.
url = "https://github.com/d2cml-ai/14.388_jl/raw/github_data/data/gun_clean.RData"
download(url, "data.RData")
rdata_read = load("data.RData")
rm("data.RData")
data = rdata_read["data"]
names(data)
println("Number of rows: ",size(data,1))
println("Number of columns: ",size(data,2))
Number of rows: 3900
Number of columns: 198
################################# Create Variables ###############################
# Dummy variables for year and county fixed effects
fixed = filter(x -> contains(x, "X_Jfips"), names(data));
year = filter(x -> contains(x, "X_Tyear"), names(data));
# Census control variables, collected by prefix
census = String[]
census_var = ["AGE", "BN", "BP", "BZ", "ED", "EL", "HI", "HS", "INC", "LF", "LN", "PI", "PO", "PP", "PV", "SPR", "VS"]
for prefix in census_var
    append!(census, filter(x -> contains(x, prefix), names(data)))
end
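Equivalently, the same census list can be built in one comprehension (a sketch over the same census_var prefixes and data defined above):

census = [n for v in census_var for n in names(data) if contains(n, v)]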
################################ Variables ##################################
# Treatment Variable
d = ["logfssl"];
# Outcome Variable
y = ["logghomr"];
# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"];
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"];
################################# Partial out Fixed Effects ########################
# Variables to be partialled out
variable = [y, d, X1, X2, census]
# Flatten the variable groups into a single list
varlis = []
for i in variable
    append!(varlis, i)
end
# Partial out the year and county fixed effects from every variable in varlis.
# Running the following lines takes approx. 10 minutes (depending on your CPU).
example = DataFrame(CountyCode = data[:, "CountyCode"]);
rdata = DataFrame(CountyCode = data[:, "CountyCode"]);
for i in 1:size(varlis, 1)
    # Keep the residual from an OLS regression of the variable on all year and county dummies
    rdata[!, varlis[i]] = residuals(lm(term(Symbol(varlis[i])) ~ sum(term.(Symbol.(year))) + sum(term.(Symbol.(fixed))), data))
end
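This loop implements the Frisch–Waugh–Lovell step: every variable \(V\) in varlis is replaced by the residual from an OLS regression on the full sets of year and county dummies,

\[
\tilde{V}_{it} = V_{it} - \hat{\lambda}_t - \hat{\alpha}_i ,
\]

so the fixed effects are partialled out of the outcome, the treatment, and all controls before any machine learning is applied.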
19.1. DML for neural nets
The following algorithm consumes \(Y\), \(D\) and \(Z\), and learns the residuals \(\tilde{Y}\) and \(\tilde{D}\) via a neural network, where the residuals are obtained by cross-fitting. It then prints the estimated coefficient \(\beta\) and the standard error from the final OLS regression with county fixed effects.
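Concretely, the networks estimate the conditional expectations \(\ell(Z) = E[Y \mid Z]\) and \(m(Z) = E[D \mid Z]\) on the training folds, the residuals \(\tilde{Y}_i = Y_i - \hat{\ell}(Z_i)\) and \(\tilde{D}_i = D_i - \hat{m}(Z_i)\) are formed on the held-out folds, and the final stage is just OLS of \(\tilde{Y}\) on \(\tilde{D}\):

\[
\hat{\beta} = \left( \sum_{i=1}^{n} \tilde{D}_i^{2} \right)^{-1} \sum_{i=1}^{n} \tilde{D}_i \tilde{Y}_i .
\]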
using Flux
using Flux: crossentropy, throttle, @epochs
using Flux.Data: DataLoader
using Flux: onehotbatch, onecold
using StatsBase
A quick look at the residualized control matrix (the DataFrame Z constructed in Section 19.2):

Z
First 30 rows and 6 of the 195 columns of Z (all shown columns are Float64):

| Row | logrobr | logburg | burg_missing | robrate_missing | newblack | newfhh |
|---|---|---|---|---|---|---|
| 1 | 0.150893 | -0.124395 | 0.0104613 | -0.021229 | 0.0309471 | -0.0204832 |
| 2 | 0.0401683 | -0.134781 | 0.0104613 | -0.0194181 | 0.0309471 | -0.0204832 |
| 3 | -0.017679 | -0.167909 | 0.0104613 | -0.0220374 | 0.0309471 | -0.0204832 |
| 4 | -0.00963344 | -0.22925 | 0.0104613 | -0.0194181 | 0.0309471 | -0.0204832 |
| 5 | -0.0267151 | -0.176635 | 0.00324793 | -0.0208037 | 0.0309471 | -0.0204832 |
| 6 | -0.151487 | -0.189069 | 0.0104613 | 0.016953 | 0.0309471 | -0.0204832 |
| 7 | -0.166729 | -0.117739 | 0.0104613 | 0.0245505 | 0.0309471 | -0.0204832 |
| 8 | -0.0996453 | -0.0833094 | 0.00448964 | 0.021457 | 0.0309471 | -0.0204832 |
| 9 | 0.151557 | 0.319282 | -0.0448348 | -0.0366629 | 0.0309471 | -0.0204832 |
| 10 | 0.0476034 | -0.0144728 | -0.00233214 | 0.00765442 | 0.0309471 | -0.0204832 |
| 11 | 0.00814297 | -0.0349694 | 0.0104613 | 0.0101673 | -0.0309471 | 0.0204832 |
| 12 | -0.0811847 | 0.0441466 | 0.00848568 | 0.0169169 | -0.0309471 | 0.0204832 |
| 13 | -0.153504 | 0.393015 | -0.0484441 | 0.0230577 | -0.0309471 | 0.0204832 |
| 14 | -0.148069 | -0.0146316 | 0.00886135 | 0.0161103 | -0.0309471 | 0.0204832 |
| 15 | 0.0247091 | 0.0531618 | 0.00677168 | 0.0155368 | -0.0309471 | 0.0204832 |
| 16 | 0.235093 | 0.178299 | -0.00374318 | 0.000594213 | -0.0309471 | 0.0204832 |
| 17 | 0.294984 | 0.146047 | -0.00460173 | -0.0205562 | -0.0309471 | 0.0204832 |
| 18 | 0.0237644 | 0.0162414 | 0.00588019 | 0.00584345 | -0.0309471 | 0.0204832 |
| 19 | -0.116843 | -0.0173146 | 0.00543154 | 0.0053948 | -0.0309471 | 0.0204832 |
| 20 | -0.00542654 | 0.154285 | -0.0124411 | -0.00411101 | -0.0309471 | 0.0204832 |
| 21 | 0.106903 | -0.117615 | 0.00797198 | -0.0161775 | 0.0425486 | -0.0222253 |
| 22 | 0.0906478 | -0.0953381 | 0.00797198 | -0.0147974 | 0.0425486 | -0.0222253 |
| 23 | 0.0844482 | -0.115481 | 0.00797198 | -0.0167935 | 0.0425486 | -0.0222253 |
| 24 | 0.0297359 | -0.187202 | 0.00797198 | -0.0147974 | 0.0425486 | -0.0222253 |
| 25 | -0.0270093 | -0.187794 | 0.00247507 | -0.0158534 | 0.0425486 | -0.0222253 |
| 26 | -0.220044 | -0.148477 | 0.00797198 | 0.0129189 | 0.0425486 | -0.0222253 |
| 27 | -0.2442 | -0.0289042 | 0.00797198 | 0.0187086 | 0.0425486 | -0.0222253 |
| 28 | -0.217045 | 0.127496 | 0.00342131 | 0.0163512 | 0.0425486 | -0.0222253 |
| 29 | 0.10424 | 0.393619 | -0.0341662 | -0.0279388 | 0.0425486 | -0.0222253 |
| 30 | -0.172381 | -0.00919521 | -0.0017772 | 0.00583301 | 0.0425486 | -0.0222253 |
| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
mean_1 = mean.(eachcol(z))
std_1 = std.(eachcol(z))
# Standardize each column of z to mean zero and unit variance.
# Assigning by column name into an empty DataFrame avoids the
# "Cannot assign to non-existent column" error that an integer index raises.
df = DataFrame()
for i in 1:size(z)[2]
    p = (z[:, i] .- mean_1[i]) / std_1[i]
    df[!, names(Z)[i]] = p
end
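For reference, the same column-wise standardization can be written as a single broadcasted expression (a sketch, assuming z is the design matrix constructed in Section 19.2):

z_std = (z .- mean(z, dims = 1)) ./ std(z, dims = 1)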
# Standardize the first column manually as a check
p = (z[:, 1] .- mean_1[1]) / std_1[1]
3900-element Vector{Float64}:
0.19297177786972372
0.051369841595354765
-0.02260902583397552
-0.012319878683044443
-0.034165055650964275
-0.1937313645019607
-0.21322361685221175
-0.12743291758859554
0.19382086474044877
0.06087839527047379
0.010413768831547246
-0.10382424886614387
-0.19631106003544865
⋮
-0.1519666509001525
-0.028157516507789738
0.10057946371092834
0.07184702189210973
0.267696062135722
-0.1097948884357846
-0.1257385988841877
0.02330696441580686
0.3215097989527482
0.23017669648906897
-1.7589560388446375
0.430706036235847
# The mean of each residualized column should be (numerically) zero
[names(Z) mean_1]
195×2 Matrix{Any}:
 "logrobr"          -2.84804e-15
 "logburg"           8.66521e-15
 "burg_missing"     -1.38787e-17
 "robrate_missing"  -1.22089e-17
 "newblack"          3.02021e-15
 "newfhh"           -5.07913e-16
 "newmove"           4.74788e-15
 "newdens"          -8.50898e-15
 "newmal"           -1.01486e-17
 "AGE010D"          -2.37094e-14
 "AGE050D"           8.20774e-15
 "AGE110D"           2.68134e-14
 "AGE170D"           1.65919e-14
 ⋮
 "PVY020D"          -2.85392e-14
 "PVY120D"          -3.9999e-14
 "PVY210D"          -2.53837e-15
 "PVY310D"           5.4832e-14
 "PVY420D"          -4.56684e-14
 "PVY520D"           2.98352e-14
 "SPR030D"          -9.37837e-15
 "SPR130D"          -1.04047e-14
 "SPR230D"           1.05672e-14
 "SPR330D"           5.74424e-15
 "SPR440D"          -9.52036e-15
 "VST020D"           3.76667e-15
function DML2_for_NN(z, d, y, nfold, clu, num_epochs)
    # Number of observations
    nobser = size(z, 1)
    # Fold indices (each entry of foldid holds the training indices of one fold)
    foldid = collect(Kfold(nobser, nfold))
    # Arrays to store the cross-fitted residuals
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: ")
    # Loop over folds
    for i in 1:nfold
        ##############################################
        ################| MODEL Y |###################
        model_y = Chain(Dense(size(z, 2), 16, relu),
                        Dense(16, 16, relu),
                        Dense(16, 1))
        opt = RMSProp()
        loss_y(x, y) = Flux.Losses.mse(model_y(x), y)
        metrics_y(x, y) = Flux.mae(model_y(x), y)
        ps_y = Flux.params(model_y)
        ##############################################
        ################| MODEL D |###################
        model_d = Chain(Dense(size(z, 2), 16, relu),
                        Dense(16, 16, relu),
                        Dense(16, 1))
        opt = RMSProp()
        loss_d(x, y) = Flux.Losses.mse(model_d(x), y)
        metrics_d(x, y) = Flux.mae(model_d(x), y)
        ps_d = Flux.params(model_d)
        data_d = DataLoader((z[foldid[i], :]', d[foldid[i]]'))
        data_y = DataLoader((z[foldid[i], :]', y[foldid[i]]'))
        # Train both networks on the fold's training indices
        for epoch in 1:num_epochs
            time = @elapsed Flux.train!(loss_y, ps_y, data_y, opt)
        end
        for epoch in 1:num_epochs
            time = @elapsed Flux.train!(loss_d, ps_d, data_d, opt)
        end
        # Predict on the held-out observations
        yhat = model_y(z[Not(foldid[i]), :]')';
        dhat = model_d(z[Not(foldid[i]), :]')';
        # Save the out-of-fold residuals
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    # Collect the cross-fitted residuals in a dataframe
    data = DataFrame(ytil = ytil, dtil = dtil, clu = clu)
    # Final stage: OLS of ytil on dtil with county fixed effects
    rfit = reg(data, @formula(ytil ~ dtil + fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]
    println(" coef (se) = ", coef_est, "(", se, ")")
    # return rfit, data;
end
DML2_for_NN (generic function with 1 method)
19.2. Estimating the effect with DML for neural nets
# Treatment variable
D = rdata[!,d]
# Outcome variable
Y = rdata[!,y];
# Construct the control DataFrame Z (everything in varlis except y and d)
Z = rdata[!, varlis[3:end]];
# Create main variables
z = Matrix(Z);
d = D[!,1];
y = Y[!,1];
clu = rdata[!, :CountyCode];
first(DataFrame(logghomr = y, logfssl = d, CountyCode = clu), 6)
6×3 DataFrame (logghomr, logfssl: Float64; CountyCode: Int64):

| Row | logghomr | logfssl | CountyCode |
|---|---|---|---|
| 1 | -0.134778 | 0.0961271 | 1073 |
| 2 | -0.239622 | 0.0808094 | 1073 |
| 3 | -0.0786772 | 0.0573399 | 1073 |
| 4 | -0.331465 | 0.0816945 | 1073 |
| 5 | -0.31664 | 0.0253655 | 1073 |
| 6 | 0.105132 | -0.00677726 | 1073 |
# Run DML with 10 cross-fitting folds and 100 training epochs per fold
DML2_for_NN(z, d, y, 10, clu, 100)
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.08016811689720804([0.059424888983297536])
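The cross-fitted point estimate is thus \(\hat{\beta} \approx 0.080\) with a standard error of about \(0.059\), so \(t \approx 0.080 / 0.059 \approx 1.35\): under this neural-network specification, the estimated effect of gun ownership on the homicide rate is positive but not statistically significant at conventional levels.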