# Environment setup (run once in a fresh notebook): install CUDA 9.0, GCC 6, Julia 1.7.3, and IJulia
# !wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# !apt update -q
# !apt install cuda gcc-6 g++-6 -y -q
# !ln -s /usr/bin/gcc-6 /usr/local/cuda/bin/gcc
# !ln -s /usr/bin/g++-6 /usr/local/cuda/bin/g++
# !curl -sSL "https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.3-linux-x86_64.tar.gz" -o julia.tar.gz
# !tar -xzf julia.tar.gz -C /usr --strip-components 1
# !rm -rf julia.tar.gz*
# !julia -e 'using Pkg; pkg"add IJulia; precompile"'
19. DML inference using NN for gun ownership
In this lab, we estimate the effect of gun ownership on the homicide rate using a neural network.
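The analysis rests on the partially linear model behind double/debiased machine learning (DML): with \(Y\) the (log) homicide rate, \(D\) the gun-ownership measure, and \(Z\) a rich set of controls,

\[
Y = D\beta + g(Z) + \varepsilon, \qquad E[\varepsilon \mid D, Z] = 0,
\]

where \(g\) is an unknown nuisance function that we will estimate with a neural network.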
# Uncomment to install the packages used below:
# using Pkg
# Pkg.add(["CSV", "DataFrames", "StatsModels", "GLM", "Random"])
using DataFrames, StatsModels, GLM
using Statistics, Plots, FixedEffectModels, MLDataUtils, MLBase, RData, Downloads
WARNING: using Downloads.download in module Main conflicts with an existing identifier.
First, we need to load and preprocess the data.
url = "https://github.com/d2cml-ai/14.388_jl/raw/github_data/data/gun_clean.RData"
download(url, "data.RData")
rdata_read = load("data.RData")
rm("data.RData")
data = rdata_read["data"]
names(data)
println("Number of rows: ",size(data,1))
println("Number of columns: ",size(data,2))
Number of rows: 3900
Number of columns: 198
################################# Create Variables ###############################
# Dummy variables for year and county fixed effects
fixed = filter(x -> contains(x, "X_Jfips"), names(data));
year = filter(x -> contains(x, "X_Tyear"), names(data));
# Census control variables, collected by prefix
census = String[]
census_var = ["AGE", "BN", "BP", "BZ", "ED", "EL", "HI", "HS", "INC", "LF", "LN", "PI", "PO", "PP", "PV", "SPR", "VS"]
for prefix in census_var
    append!(census, filter(x -> contains(x, prefix), names(data)))
end
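Equivalently, the same census list can be built in one comprehension (a sketch over the same census_var prefixes and data defined above):

census = [n for v in census_var for n in names(data) if contains(n, v)]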
################################ Variables ##################################
# Treatment Variable
d = ["logfssl"];
# Outcome Variable
y = ["logghomr"];
# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"];
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"];
################################# Partial out Fixed Effects ########################
# Variables to be partialled out
variable = [y, d, X1, X2, census]
# Flatten the variable groups into a single list
varlis = []
for i in variable
    append!(varlis, i)
end
# Partial out the year and county fixed effects from every variable in varlis.
# Running the following lines takes approx. 10 minutes (depending on your CPU).
example = DataFrame(CountyCode = data[:, "CountyCode"]);
rdata = DataFrame(CountyCode = data[:, "CountyCode"]);
for i in 1:size(varlis, 1)
    # Keep the residual from an OLS regression of the variable on all year and county dummies
    rdata[!, varlis[i]] = residuals(lm(term(Symbol(varlis[i])) ~ sum(term.(Symbol.(year))) + sum(term.(Symbol.(fixed))), data))
end
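This loop implements the Frisch–Waugh–Lovell step: every variable \(V\) in varlis is replaced by the residual from an OLS regression on the full sets of year and county dummies,

\[
\tilde{V}_{it} = V_{it} - \hat{\lambda}_t - \hat{\alpha}_i ,
\]

so the fixed effects are partialled out of the outcome, the treatment, and all controls before any machine learning is applied.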
19.1. DML for neural nets
The following algorithm consumes \(Y\), \(D\) and \(Z\), and learns the residuals \(\tilde{Y}\) and \(\tilde{D}\) via a neural network, where the residuals are obtained by cross-fitting. It then prints the estimated coefficient \(\beta\) and the standard error from the final OLS regression with county fixed effects.
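Concretely, the networks estimate the conditional expectations \(\ell(Z) = E[Y \mid Z]\) and \(m(Z) = E[D \mid Z]\) on the training folds, the residuals \(\tilde{Y}_i = Y_i - \hat{\ell}(Z_i)\) and \(\tilde{D}_i = D_i - \hat{m}(Z_i)\) are formed on the held-out folds, and the final stage is just OLS of \(\tilde{Y}\) on \(\tilde{D}\):

\[
\hat{\beta} = \left( \sum_{i=1}^{n} \tilde{D}_i^{2} \right)^{-1} \sum_{i=1}^{n} \tilde{D}_i \tilde{Y}_i .
\]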
using Flux
using Flux: crossentropy, throttle, @epochs
using Flux.Data: DataLoader
using Flux: onehotbatch, onecold
using StatsBase
A quick look at the residualized control matrix (the DataFrame Z constructed in Section 19.2):

Z
First 30 rows and 6 of the 195 columns of Z (all shown columns are Float64):

| Row | logrobr | logburg | burg_missing | robrate_missing | newblack | newfhh |
|---|---|---|---|---|---|---|
| 1 | 0.150893 | -0.124395 | 0.0104613 | -0.021229 | 0.0309471 | -0.0204832 |
| 2 | 0.0401683 | -0.134781 | 0.0104613 | -0.0194181 | 0.0309471 | -0.0204832 |
| 3 | -0.017679 | -0.167909 | 0.0104613 | -0.0220374 | 0.0309471 | -0.0204832 |
| 4 | -0.00963344 | -0.22925 | 0.0104613 | -0.0194181 | 0.0309471 | -0.0204832 |
| 5 | -0.0267151 | -0.176635 | 0.00324793 | -0.0208037 | 0.0309471 | -0.0204832 |
| 6 | -0.151487 | -0.189069 | 0.0104613 | 0.016953 | 0.0309471 | -0.0204832 |
| 7 | -0.166729 | -0.117739 | 0.0104613 | 0.0245505 | 0.0309471 | -0.0204832 |
| 8 | -0.0996453 | -0.0833094 | 0.00448964 | 0.021457 | 0.0309471 | -0.0204832 |
| 9 | 0.151557 | 0.319282 | -0.0448348 | -0.0366629 | 0.0309471 | -0.0204832 |
| 10 | 0.0476034 | -0.0144728 | -0.00233214 | 0.00765442 | 0.0309471 | -0.0204832 |
| 11 | 0.00814297 | -0.0349694 | 0.0104613 | 0.0101673 | -0.0309471 | 0.0204832 |
| 12 | -0.0811847 | 0.0441466 | 0.00848568 | 0.0169169 | -0.0309471 | 0.0204832 |
| 13 | -0.153504 | 0.393015 | -0.0484441 | 0.0230577 | -0.0309471 | 0.0204832 |
| 14 | -0.148069 | -0.0146316 | 0.00886135 | 0.0161103 | -0.0309471 | 0.0204832 |
| 15 | 0.0247091 | 0.0531618 | 0.00677168 | 0.0155368 | -0.0309471 | 0.0204832 |
| 16 | 0.235093 | 0.178299 | -0.00374318 | 0.000594213 | -0.0309471 | 0.0204832 |
| 17 | 0.294984 | 0.146047 | -0.00460173 | -0.0205562 | -0.0309471 | 0.0204832 |
| 18 | 0.0237644 | 0.0162414 | 0.00588019 | 0.00584345 | -0.0309471 | 0.0204832 |
| 19 | -0.116843 | -0.0173146 | 0.00543154 | 0.0053948 | -0.0309471 | 0.0204832 |
| 20 | -0.00542654 | 0.154285 | -0.0124411 | -0.00411101 | -0.0309471 | 0.0204832 |
| 21 | 0.106903 | -0.117615 | 0.00797198 | -0.0161775 | 0.0425486 | -0.0222253 |
| 22 | 0.0906478 | -0.0953381 | 0.00797198 | -0.0147974 | 0.0425486 | -0.0222253 |
| 23 | 0.0844482 | -0.115481 | 0.00797198 | -0.0167935 | 0.0425486 | -0.0222253 |
| 24 | 0.0297359 | -0.187202 | 0.00797198 | -0.0147974 | 0.0425486 | -0.0222253 |
| 25 | -0.0270093 | -0.187794 | 0.00247507 | -0.0158534 | 0.0425486 | -0.0222253 |
| 26 | -0.220044 | -0.148477 | 0.00797198 | 0.0129189 | 0.0425486 | -0.0222253 |
| 27 | -0.2442 | -0.0289042 | 0.00797198 | 0.0187086 | 0.0425486 | -0.0222253 |
| 28 | -0.217045 | 0.127496 | 0.00342131 | 0.0163512 | 0.0425486 | -0.0222253 |
| 29 | 0.10424 | 0.393619 | -0.0341662 | -0.0279388 | 0.0425486 | -0.0222253 |
| 30 | -0.172381 | -0.00919521 | -0.0017772 | 0.00583301 | 0.0425486 | -0.0222253 |
| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
mean_1 = mean.(eachcol(z))
std_1 = std.(eachcol(z))
# Standardize each column of z to mean zero and unit variance.
# Assigning by column name into an empty DataFrame avoids the
# "Cannot assign to non-existent column" error that an integer index raises.
df = DataFrame()
for i in 1:size(z)[2]
    p = (z[:, i] .- mean_1[i]) / std_1[i]
    df[!, names(Z)[i]] = p
end
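For reference, the same column-wise standardization can be written as a single broadcasted expression (a sketch, assuming z is the design matrix constructed in Section 19.2):

z_std = (z .- mean(z, dims = 1)) ./ std(z, dims = 1)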
# Standardize the first column manually as a check
p = (z[:, 1] .- mean_1[1]) / std_1[1]
3900-element Vector{Float64}:
0.19297177786972372
0.051369841595354765
-0.02260902583397552
-0.012319878683044443
-0.034165055650964275
-0.1937313645019607
-0.21322361685221175
-0.12743291758859554
0.19382086474044877
0.06087839527047379
0.010413768831547246
-0.10382424886614387
-0.19631106003544865
⋮
-0.1519666509001525
-0.028157516507789738
0.10057946371092834
0.07184702189210973
0.267696062135722
-0.1097948884357846
-0.1257385988841877
0.02330696441580686
0.3215097989527482
0.23017669648906897
-1.7589560388446375
0.430706036235847
# The mean of each residualized column should be (numerically) zero
[names(Z) mean_1]
195×2 Matrix{Any}:
 "logrobr"          -2.84804e-15
 "logburg"           8.66521e-15
 "burg_missing"     -1.38787e-17
 "robrate_missing"  -1.22089e-17
 "newblack"          3.02021e-15
 "newfhh"           -5.07913e-16
 "newmove"           4.74788e-15
 "newdens"          -8.50898e-15
 "newmal"           -1.01486e-17
 "AGE010D"          -2.37094e-14
 "AGE050D"           8.20774e-15
 "AGE110D"           2.68134e-14
 "AGE170D"           1.65919e-14
 ⋮
 "PVY020D"          -2.85392e-14
 "PVY120D"          -3.9999e-14
 "PVY210D"          -2.53837e-15
 "PVY310D"           5.4832e-14
 "PVY420D"          -4.56684e-14
 "PVY520D"           2.98352e-14
 "SPR030D"          -9.37837e-15
 "SPR130D"          -1.04047e-14
 "SPR230D"           1.05672e-14
 "SPR330D"           5.74424e-15
 "SPR440D"          -9.52036e-15
 "VST020D"           3.76667e-15
function DML2_for_NN(z, d, y, nfold, clu, num_epochs)
    # Number of observations
    nobser = size(z, 1)
    # Fold indices (each entry of foldid holds the training indices of one fold)
    foldid = collect(Kfold(nobser, nfold))
    # Arrays to store the cross-fitted residuals
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: ")
    # Loop over folds
    for i in 1:nfold
        ##############################################
        ################| MODEL Y |###################
        model_y = Chain(Dense(size(z, 2), 16, relu),
                        Dense(16, 16, relu),
                        Dense(16, 1))
        opt = RMSProp()
        loss_y(x, y) = Flux.Losses.mse(model_y(x), y)
        metrics_y(x, y) = Flux.mae(model_y(x), y)
        ps_y = Flux.params(model_y)
        ##############################################
        ################| MODEL D |###################
        model_d = Chain(Dense(size(z, 2), 16, relu),
                        Dense(16, 16, relu),
                        Dense(16, 1))
        opt = RMSProp()
        loss_d(x, y) = Flux.Losses.mse(model_d(x), y)
        metrics_d(x, y) = Flux.mae(model_d(x), y)
        ps_d = Flux.params(model_d)
        data_d = DataLoader((z[foldid[i], :]', d[foldid[i]]'))
        data_y = DataLoader((z[foldid[i], :]', y[foldid[i]]'))
        # Train both networks on the fold's training indices
        for epoch in 1:num_epochs
            time = @elapsed Flux.train!(loss_y, ps_y, data_y, opt)
        end
        for epoch in 1:num_epochs
            time = @elapsed Flux.train!(loss_d, ps_d, data_d, opt)
        end
        # Predict on the held-out observations
        yhat = model_y(z[Not(foldid[i]), :]')';
        dhat = model_d(z[Not(foldid[i]), :]')';
        # Save the out-of-fold residuals
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    # Collect the cross-fitted residuals in a dataframe
    data = DataFrame(ytil = ytil, dtil = dtil, clu = clu)
    # Final stage: OLS of ytil on dtil with county fixed effects
    rfit = reg(data, @formula(ytil ~ dtil + fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]
    println(" coef (se) = ", coef_est, "(", se, ")")
    # return rfit, data;
end
DML2_for_NN (generic function with 1 method)
19.2. Estimating the effect with DML for neural nets
# Treatment variable
D = rdata[!,d]
# Outcome variable
Y = rdata[!,y];
# Construct the control DataFrame Z (everything in varlis except y and d)
Z = rdata[!, varlis[3:end]];
# Create main variables
z = Matrix(Z);
d = D[!,1];
y = Y[!,1];
clu = rdata[!, :CountyCode];
first(DataFrame(logghomr = y, logfssl = d, CountyCode = clu), 6)
6×3 DataFrame (logghomr, logfssl: Float64; CountyCode: Int64):

| Row | logghomr | logfssl | CountyCode |
|---|---|---|---|
| 1 | -0.134778 | 0.0961271 | 1073 |
| 2 | -0.239622 | 0.0808094 | 1073 |
| 3 | -0.0786772 | 0.0573399 | 1073 |
| 4 | -0.331465 | 0.0816945 | 1073 |
| 5 | -0.31664 | 0.0253655 | 1073 |
| 6 | 0.105132 | -0.00677726 | 1073 |
# Run DML with 10 cross-fitting folds and 100 training epochs per fold
DML2_for_NN(z, d, y, 10, clu, 100)
Folds:
1
2
3
4
5
6
7
8
9
10
coef (se) = 0.08016811689720804([0.059424888983297536])
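The cross-fitted point estimate is thus \(\hat{\beta} \approx 0.080\) with a standard error of about \(0.059\), so \(t \approx 0.080 / 0.059 \approx 1.35\): under this neural-network specification, the estimated effect of gun ownership on the homicide rate is positive but not statistically significant at conventional levels.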