# !wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# !apt update -q
# !apt install cuda gcc-6 g++-6 -y -q
# !ln -s /usr/bin/gcc-6 /usr/local/cuda/bin/gcc
# !ln -s /usr/bin/g++-6 /usr/local/cuda/bin/g++
# !curl -sSL "https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.3-linux-x86_64.tar.gz" -o julia.tar.gz
# !tar -xzf julia.tar.gz -C /usr --strip-components 1
# !rm -rf julia.tar.gz*
# !julia -e 'using Pkg; pkg"add IJulia; precompile"'

19. DML inference using NN for gun ownership#

In this lab, we estimate the effect of gun ownership on the homicide rate by a neural network.

# using Pkg
# Pkg.add("CSV"), using CSV
# Pkg.add("DataFrames"), using DataFrames
# Pkg.add("StatsModels"), using StatsModels
# Pkg.add("GLM"), using GLM
# Pkg.add("Random"), using Random
using Statistics, Plots, FixedEffectModels, MLDataUtils, MLBase, RData, Downloads
WARNING: using Downloads.download in module Main conflicts with an existing identifier.

First, we need to load and preprocess the data.

# Fetch the pre-cleaned gun data set, read the bundled DataFrame, and
# remove the temporary download afterwards.
url = "https://github.com/d2cml-ai/14.388_jl/raw/github_data/data/gun_clean.RData"
download(url, "data.RData")
rdata_read = load("data.RData")
rm("data.RData")
data = rdata_read["data"]
names(data)
nrows, ncols = size(data)
println("Number of rows: ", nrows)
println("Number of columns: ", ncols)
Number of rows: 3900
Number of columns: 198
################################# Create Variables ###############################

# Dummy Variables for Year and County Fixed Effects
fixed = filter(x->contains(x, "X_Jfips"), names(data));
year = filter(x->contains(x, "X_Tyear"), names(data));

# Census control columns, collected by variable-name prefix.
# Typed String[] (instead of untyped []) so `census` is a Vector{String},
# not a Vector{Any}; iterate the prefixes directly rather than by index.
census = String[]
census_var = ["AGE", "BN", "BP", "BZ", "ED", "EL", "HI", "HS", "INC", "LF", "LN", "PI", "PO", "PP", "PV", "SPR", "VS"]

for prefix in census_var
    append!(census, filter(x->contains(x, prefix), names(data)))
end
################################ Variables ##################################

# Treatment Variable — "logfssl" (log firearm-sales proxy; confirm against the data codebook)
d = ["logfssl"];

# Outcome Variable — "logghomr" (log gun-homicide rate; confirm against the data codebook)
y = ["logghomr"];

# Other Control Variables: crime measures (and their missingness flags), then demographics
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"];
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"];
#################################  Partial out Fixed Effects ########################

# Variables to be Partialled-out: outcome, treatment, controls, census columns.
# NOTE: downstream code relies on the first two entries being y and d
# (it slices varlis[3:end] for the control matrix), so keep this order.
variable = [y, d, X1, X2, census]

# Flatten the groups into one list. reduce(vcat, ...) replaces the manual
# append! loop and avoids growing an untyped Vector{Any} element-by-element.
varlis = reduce(vcat, variable)
# Running the following lines takes approx. 10 minutes (depends on your CPU)

example = DataFrame(CountyCode = data[:,"CountyCode"]);
rdata = DataFrame(CountyCode = data[:,"CountyCode"]);

# Partial out the year and county dummies from every variable in varlis:
# regress each column on the fixed-effect dummies and keep the residuals.
# The right-hand-side term sum is loop-invariant, so build it once instead
# of reconstructing it on every iteration.
fe_rhs = sum(term.(Symbol.(year))) + sum(term.(Symbol.(fixed)))
for v in varlis
    rdata[!, v] = residuals(lm(term(Symbol(v)) ~ fe_rhs, data))
end

19.1. DML for neural nets#

The following algorithm consumes \(Y\), \(D\) and \(Z\), and learns the residuals \(\tilde{Y}\) and \(\tilde{D}\) via a neural network, where the residuals are obtained by cross-validation (cross-fitting). Then, it prints the estimated coefficient β and the clustered standard error from the final OLS regression.

using Flux
using Flux: crossentropy, @epochs
using Flux.Data: DataLoader
using Flux: throttle
using Flux: onehotbatch, onecold, @epochs
using StatsBase
Z

3,900 rows × 195 columns (omitted printing of 189 columns)

logrobrlogburgburg_missingrobrate_missingnewblacknewfhh
Float64Float64Float64Float64Float64Float64
10.150893-0.1243950.0104613-0.0212290.0309471-0.0204832
20.0401683-0.1347810.0104613-0.01941810.0309471-0.0204832
3-0.017679-0.1679090.0104613-0.02203740.0309471-0.0204832
4-0.00963344-0.229250.0104613-0.01941810.0309471-0.0204832
5-0.0267151-0.1766350.00324793-0.02080370.0309471-0.0204832
6-0.151487-0.1890690.01046130.0169530.0309471-0.0204832
7-0.166729-0.1177390.01046130.02455050.0309471-0.0204832
8-0.0996453-0.08330940.004489640.0214570.0309471-0.0204832
90.1515570.319282-0.0448348-0.03666290.0309471-0.0204832
100.0476034-0.0144728-0.002332140.007654420.0309471-0.0204832
110.00814297-0.03496940.01046130.0101673-0.03094710.0204832
12-0.08118470.04414660.008485680.0169169-0.03094710.0204832
13-0.1535040.393015-0.04844410.0230577-0.03094710.0204832
14-0.148069-0.01463160.008861350.0161103-0.03094710.0204832
150.02470910.05316180.006771680.0155368-0.03094710.0204832
160.2350930.178299-0.003743180.000594213-0.03094710.0204832
170.2949840.146047-0.00460173-0.0205562-0.03094710.0204832
180.02376440.01624140.005880190.00584345-0.03094710.0204832
19-0.116843-0.01731460.005431540.0053948-0.03094710.0204832
20-0.005426540.154285-0.0124411-0.00411101-0.03094710.0204832
210.106903-0.1176150.00797198-0.01617750.0425486-0.0222253
220.0906478-0.09533810.00797198-0.01479740.0425486-0.0222253
230.0844482-0.1154810.00797198-0.01679350.0425486-0.0222253
240.0297359-0.1872020.00797198-0.01479740.0425486-0.0222253
25-0.0270093-0.1877940.00247507-0.01585340.0425486-0.0222253
26-0.220044-0.1484770.007971980.01291890.0425486-0.0222253
27-0.2442-0.02890420.007971980.01870860.0425486-0.0222253
28-0.2170450.1274960.003421310.01635120.0425486-0.0222253
290.104240.393619-0.0341662-0.02793880.0425486-0.0222253
30-0.172381-0.00919521-0.00177720.005833010.0425486-0.0222253
# Per-column mean and standard deviation of the control matrix z.
mean_1 = mean.(eachcol(z))


std_1 = std.(eachcol(z))


# Standardize each column of z. DataFrames cannot CREATE a new column through
# an integer index (`df[!, i] = p` throws "Cannot assign to non-existent
# column" on an empty DataFrame), so assign through the column's NAME taken
# from Z — this is the bug the original cell hit.
for i in 1:size(z)[2]
    p = (z[:, i] .- mean_1[i]) / std_1[i]
    colname = names(Z)[i]
    df[!, colname] = p
end
    
  
ArgumentError: Cannot assign to non-existent column: 1

Stacktrace:
 [1] insert_single_column!(df::DataFrame, v::Vector{Float64}, col_ind::Int64)
   @ DataFrames C:\Users\PC\.julia\packages\DataFrames\3mEXm\src\dataframe\dataframe.jl:512
 [2] setindex!(df::DataFrame, v::Vector{Float64}, #unused#::typeof(!), col_ind::Int64)
   @ DataFrames C:\Users\PC\.julia\packages\DataFrames\3mEXm\src\dataframe\dataframe.jl:541
 [3] top-level scope
   @ .\In[54]:11
 [4] eval
   @ .\boot.jl:360 [inlined]
 [5] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base .\loading.jl:1116
df[:,1] = p
ArgumentError: Cannot assign to non-existent column: 1

Stacktrace:
 [1] insert_single_column!(df::DataFrame, v::Vector{Float64}, col_ind::Int64)
   @ DataFrames C:\Users\PC\.julia\packages\DataFrames\3mEXm\src\dataframe\dataframe.jl:512
 [2] setindex!
   @ C:\Users\PC\.julia\packages\DataFrames\3mEXm\src\dataframe\dataframe.jl:541 [inlined]
 [3] setindex!(df::DataFrame, v::Vector{Float64}, row_inds::Colon, col_ind::Int64)
   @ DataFrames C:\Users\PC\.julia\packages\DataFrames\3mEXm\src\dataframe\dataframe.jl:592
 [4] top-level scope
   @ In[67]:1
 [5] eval
   @ .\boot.jl:360 [inlined]
 [6] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base .\loading.jl:1116
# Sanity check: standardize the first column of z by hand.
p = (z[:,1].-mean_1[1]) / std_1[1]
3900-element Vector{Float64}:
  0.19297177786972372
  0.051369841595354765
 -0.02260902583397552
 -0.012319878683044443
 -0.034165055650964275
 -0.1937313645019607
 -0.21322361685221175
 -0.12743291758859554
  0.19382086474044877
  0.06087839527047379
  0.010413768831547246
 -0.10382424886614387
 -0.19631106003544865
  ⋮
 -0.1519666509001525
 -0.028157516507789738
  0.10057946371092834
  0.07184702189210973
  0.267696062135722
 -0.1097948884357846
 -0.1257385988841877
  0.02330696441580686
  0.3215097989527482
  0.23017669648906897
 -1.7589560388446375
  0.430706036235847
# Inspect column names alongside their post-residualization means
# (all numerically zero, as expected after partialling out).
[names(Z) mean_1]
195×3 Matrix{Any}:
 "logrobr"          "logrobr"          -2.84804e-15
 "logburg"          "logburg"           8.66521e-15
 "burg_missing"     "burg_missing"     -1.38787e-17
 "robrate_missing"  "robrate_missing"  -1.22089e-17
 "newblack"         "newblack"          3.02021e-15
 "newfhh"           "newfhh"           -5.07913e-16
 "newmove"          "newmove"           4.74788e-15
 "newdens"          "newdens"          -8.50898e-15
 "newmal"           "newmal"           -1.01486e-17
 "AGE010D"          "AGE010D"          -2.37094e-14
 "AGE050D"          "AGE050D"           8.20774e-15
 "AGE110D"          "AGE110D"           2.68134e-14
 "AGE170D"          "AGE170D"           1.65919e-14
 ⋮                                     
 "PVY020D"          "PVY020D"          -2.85392e-14
 "PVY120D"          "PVY120D"          -3.9999e-14
 "PVY210D"          "PVY210D"          -2.53837e-15
 "PVY310D"          "PVY310D"           5.4832e-14
 "PVY420D"          "PVY420D"          -4.56684e-14
 "PVY520D"          "PVY520D"           2.98352e-14
 "SPR030D"          "SPR030D"          -9.37837e-15
 "SPR130D"          "SPR130D"          -1.04047e-14
 "SPR230D"          "SPR230D"           1.05672e-14
 "SPR330D"          "SPR330D"           5.74424e-15
 "SPR440D"          "SPR440D"          -9.52036e-15
 "VST020D"          "VST020D"           3.76667e-15
"""
    DML2_for_NN(z, d, y, nfold, clu, num_epochs)

Double/debiased ML with neural-net nuisance estimators: cross-fit the
residuals of the outcome `y` and the treatment `d` on the controls `z`,
then regress the outcome residuals on the treatment residuals with
standard errors clustered by `clu`.

# Arguments
- `z`: control matrix (observations × features).
- `d`: treatment vector.
- `y`: outcome vector.
- `nfold`: number of cross-fitting folds.
- `clu`: cluster identifier (county code) for the final OLS.
- `num_epochs`: training epochs for each network within each fold.

Prints the fold index as each fold finishes, then the estimated
coefficient and its clustered standard error.
"""
function DML2_for_NN(z , d , y, nfold, clu, num_epochs)
    
    # Number of observations
    nobser = size(z,1)
    
    # Define fold indices (Kfold yields the TRAINING index set of each fold)
    foldid = collect(Kfold(size(z)[1], nfold))
    
    # Arrays for the cross-fitted residuals
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        ##############################################
        ################| MODEL Y |###################
        model_y= Chain(Dense(size(z,2), 16, relu), 
        Dense(16, 16, relu),
        Dense(16, 1))

        opt = RMSProp()
        loss_y(x, y) = Flux.Losses.mse(model_y(x), y)
        metrics_y(x, y) = Flux.mae(model_y(x), y)
        ps_y = Flux.params(model_y)

        ##############################################
        ################| MODEL D |###################
        model_d= Chain(Dense(size(z,2), 16, relu), 
        Dense(16, 16, relu),
        Dense(16, 1))

        # NOTE(review): this rebinds `opt`, so BOTH training loops below share
        # this single RMSProp instance (its state is keyed per-parameter, but
        # the first RMSProp created above is discarded).
        opt = RMSProp()
        loss_d(x, y) = Flux.Losses.mse(model_d(x), y)
        metrics_d(x, y) = Flux.mae(model_d(x), y)
        ps_d = Flux.params(model_d)

        # Loaders over the fold's TRAINING rows (features and targets are
        # transposed because Flux expects features in columns).
        data_d = DataLoader((z[foldid[i],:]', d[foldid[i]]'))
        data_y = DataLoader((z[foldid[i],:]', y[foldid[i]]'))

    # Train each network on the current fold's training subset
    for epoch in 1:num_epochs
        time = @elapsed Flux.train!(loss_y, ps_y, data_y, opt)
    end

    for epoch in 1:num_epochs
        time = @elapsed Flux.train!(loss_d, ps_d, data_d, opt)
    end

    # Predict on the held-out complement of the fold (cross-fitting)
    yhat = model_y(z[Not(foldid[i]),:]')';
    ###############################################################################
    dhat = model_d(z[Not(foldid[i]),:]')';
    
        
    # Save the held-out residuals
    dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
    ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)

    println(i)
    end
    
    # Assemble the cross-fitted residuals with the cluster variable
    data = DataFrame(ytil = ytil, dtil = dtil, clu=clu)
    
    # Final-stage OLS of ytil on dtil, clustering at the County level
    rfit = reg(data, @formula(ytil ~ dtil +fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    #return rfit, data;
    
end
DML2_for_NN (generic function with 1 method)

19.2. Estimating the effect with DML for neural nets#

# Treatment variable (here `d` is still the NAME list; it is rebound to the
# actual data column a few lines below, so order matters in this cell)
D = rdata[!,d]

# Outcome variable

Y = rdata[!,y];

# Construct matrix Z: every partialled-out column except the outcome and
# treatment (varlis begins with y and d, hence the 3:end slice)
Z = rdata[!, varlis[3:end]];
# Create main variables as plain vectors/matrix for the NN routine
z = Matrix(Z);
d = D[!,1];
y = Y[!,1];
clu = rdata[!, :CountyCode];
first(DataFrame(logghomr = y,logfssl = d,CountyCode = clu ),6)

6 rows × 3 columns

logghomrlogfsslCountyCode
Float64Float64Int64
1-0.1347780.09612711073
2-0.2396220.08080941073
3-0.07867720.05733991073
4-0.3314650.08169451073
5-0.316640.02536551073
60.105132-0.006777261073
## Run DML with 10 cross-fitting folds and 100 training epochs per network
DML2_for_NN(z,d,y,10,clu,100)
Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.08016811689720804([0.059424888983297536])