18. DML inference for gun ownership#
# Import relevant packages
import os
import random
import itertools
from itertools import compress
from urllib.request import urlopen

import pandas as pd
import numpy as np
from numpy import loadtxt
import pyreadr
import patsy
import hdmpy

import statsmodels.api as sm
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
from matplotlib import colors

from sklearn import preprocessing, linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV, LinearRegression
from sklearn.feature_selection import SelectFromModel

from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype

from keras.models import Sequential
from keras.layers import Dense

import warnings
warnings.filterwarnings('ignore')
This notebook contains an example for teaching.
18.1. A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates#
We consider the problem of estimating the effect of gun ownership on the homicide rate. For this purpose, we estimate the following partially linear model
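$$
Y_{j,t} = \beta D_{j,t-1} + g(Z_{j,t}) + \epsilon_{j,t}.
$$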
18.2. Data#
Here, \(Y_{j,t}\) is the log homicide rate in county \(j\) at time \(t\), \(D_{j, t-1}\) is the log fraction of suicides committed with a firearm in county \(j\) at time \(t-1\), which we use as a proxy for gun ownership, and \(Z_{j,t}\) is a set of demographic and economic characteristics of county \(j\) at time \(t\). The parameter \(\beta\) is the effect of gun ownership on the homicide rate, controlling for county-level demographic and economic characteristics.
The sample covers 195 large United States counties from 1980 through 1999, giving us 3900 observations.
data1 = pd.read_csv("https://raw.githubusercontent.com/d2cml-ai/14.388_py/main/data/gun_clean.csv")
data1.shape[1]
415
18.2.1. Preprocessing#
To account for heterogeneity across counties and for time trends, we remove county-specific and time-specific effects from all variables in the following preprocessing step.
import re
################################# Find Variable Names from Dataset ########################
def varlist( df = None, type_dataframe = [ 'numeric' , 'categorical' , 'string' ], pattern = "", exclude = None ):
varrs = []
if ('numeric' in type_dataframe):
varrs = varrs + df.columns[ df.apply( is_numeric_dtype , axis = 0 ).to_list() ].tolist()
if ('categorical' in type_dataframe):
varrs = varrs + df.columns[ df.apply( is_categorical_dtype , axis = 0 ).to_list() ].tolist()
if ('string' in type_dataframe):
varrs = varrs + df.columns[ df.apply( is_string_dtype , axis = 0 ).to_list() ].tolist()
grepl_result = np.array( [ re.search( pattern , variable ) is not None for variable in df.columns.to_list() ] )
if exclude is None:
result = list(compress( varrs, grepl_result ) )
else:
varrs_excluded = np.array( [var in exclude for var in varrs ] )
and_filter = np.logical_and( ~varrs_excluded , grepl_result )
result = list(compress( varrs, and_filter ) )
return result
################################# Create Variables ###############################
# Dummy Variables for Year and County Fixed Effects
r = re.compile("X_Jfips")
fixed = list( filter( r.match, data1.columns.to_list() ) )
year = varlist(data1, pattern="X_Tyear")
census = []
census_var = ["^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS"]
for variable in census_var:
r = re.compile( variable )
census = census + list( filter( r.match, data1.columns.to_list() ) )
################################ Variables ##################################
# Treatment Variable
d = "logfssl"
# Outcome Variable
y = "logghomr"
# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"]
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"]
################################# Partial out Fixed Effects ########################
rdata = data1[['CountyCode']].copy()  # copy so the residualized columns can be added safely
# Variables to be Partialled-out
varlist2 = [y] + [d] + X1 + X2 + census
# Partial out year and county fixed effects from each variable in varlist2
for var_name in varlist2:
form = var_name + " ~ " + " + ".join( year ) + "+" + " + ".join( fixed )
rdata[f'{var_name}'] = smf.ols( formula = form , data = data1 ).fit().resid
link="https://raw.githubusercontent.com/d2cml-ai/14.388_py/main/data/gun_clean.RData"
response = urlopen(link)
content = response.read()
fhandle = open( 'gun_clean.RData', 'wb')
fhandle.write(content)
fhandle.close()
result = pyreadr.read_r("gun_clean.RData")
os.remove("gun_clean.RData")
# Extract the data frame from the RData file read above
data = result[ 'data' ]
n = data.shape[0]
18.2.2. We check that our results match the R results to 6 decimal places#
column_names = data.columns.to_list()
for col in column_names:
result = (data[f'{col}'].round(6) == rdata[f'{col}'].round(6)).sum()
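If one wanted a stricter check, a minimal sketch (assuming every residualized column should match the R output on all \(n\) observations) would be:
# Hypothetical stricter check: every column should agree with the R output
# on all n observations after rounding to 6 decimal places.
for col in data.columns:
    n_match = (data[col].round(6) == rdata[col].round(6)).sum()
    assert n_match == n, f"{col}: only {n_match} of {n} values match"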
Now, we can construct the treatment variable, the outcome variable and the matrix \(Z\) that includes the control variables.
# Treatment Variable
D = rdata[ f'{d}']
# Outcome Variable
Y = rdata[ f'{y}']
# Construct matrix Z
Z = rdata[ X1 + X2 + census ]
Z.shape
(3900, 195)
We have in total 195 control variables. The control variables \(Z_{j,t}\) are from the U.S. Census Bureau and contain demographic and economic characteristics of the counties such as the age distribution, the income distribution, crime rates, federal spending, home ownership rates, house prices, educational attainment, voting patterns, employment statistics, and migration rates.
clu = rdata[['CountyCode']]
data = pd.concat([Y, D, Z, clu], axis=1)
data.to_csv( r"../data/gun_clean2.csv" , index = False )
18.3. The effect of gun ownership#
18.3.1. OLS#
After preprocessing the data, we first look at a simple regression of \(Y_{j,t}\) on \(D_{j,t-1}\) without controls as a baseline model.
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
# # Run this line to avoid all the lines of code above
# data = pd.read_csv( r"../data/gun_clean2.csv" )
# OLS clustering at the County level
model = "logghomr ~ logfssl"
baseline_ols = smf.ols(model , data=data).fit().get_robustcov_results(cov_type = "cluster", groups= data['CountyCode'])
baseline_ols_table = baseline_ols.summary2().tables[1]
print( baseline_ols_table.iloc[ 1 , 4:] )
baseline_ols_table.iloc[1, :]
[0.025 0.154480
0.975] 0.410129
Name: logfssl, dtype: float64
Coef. 0.282304
Std.Err. 0.064811
t 4.355825
P>|t| 0.000021
[0.025 0.154480
0.975] 0.410129
Name: logfssl, dtype: float64
The point estimate is \(0.282\), with a confidence interval ranging from 0.154 to 0.410. This suggests that increases in gun ownership are related to gun homicide rates: if gun ownership increases by 1% relative to a trend, then the predicted gun homicide rate goes up by 0.28%, without controlling for counties’ characteristics.
Since our goal is to estimate the effect of gun ownership after controlling for a rich set of county characteristics, we next include the controls. First, we estimate the model by OLS and then by an array of modern regression methods using the double machine learning approach.
# define the variables
y = 'logghomr'
data_columns = list(data)
no_relev_col = ['logfssl', 'CountyCode', 'logghomr']
# Control variables: all columns except the treatment, the outcome, and the county code
z = [col for col in data_columns if col not in no_relev_col]
control_formula = "logghomr" + ' ~ ' + 'logfssl + ' + ' + '.join( Z.columns.to_list() )
control_ols = smf.ols( control_formula , data=data).fit().get_robustcov_results(cov_type = "cluster", groups= data['CountyCode'])
control_ols_table = control_ols.summary2().tables[1]
print( control_ols_table.iloc[ 1 , 4:] )
control_ols_table.iloc[1, :]
[0.025 0.087545
0.975] 0.296430
Name: logfssl, dtype: float64
Coef. 0.191987
Std.Err. 0.052956
t 3.625435
P>|t| 0.000369
[0.025 0.087545
0.975] 0.296430
Name: logfssl, dtype: float64
In R, the coefficients of the variables below are NA. In Python, however, we get very large values.
control_ols_table.loc[['PPQ110D', 'PPQ120D'], :]
|   | Coef. | Std.Err. | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| PPQ110D | 1.590771e+11 | 7.334932e+11 | 0.216876 | 0.828533 | -1.287568e+12 | 1.605722e+12 |
| PPQ120D | -4.875189e+10 | 6.826842e+11 | -0.071412 | 0.943143 | -1.395188e+12 | 1.297684e+12 |
After controlling for a rich set of characteristics, the point estimate of the gun ownership effect falls to \(0.19\).
18.3.2. DML algorithm#
Here we perform inference on the predictive coefficient \(\beta\) in our partially linear statistical model, using the double machine learning approach.
For \(\tilde Y = Y- E(Y|Z)\) and \(\tilde D= D- E(D|Z)\), we can write
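$$
\tilde Y = \beta \tilde D + \epsilon, \qquad E[\epsilon \, \tilde D] = 0.
$$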
Using cross-fitting, we employ modern regression methods to build estimators \(\hat \ell(Z)\) and \(\hat m(Z)\) of \(\ell(Z):=E(Y|Z)\) and \(m(Z):=E(D|Z)\) to obtain the estimates of the residualized quantities:
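$$
\tilde Y_i = Y_i - \hat \ell(Z_i), \qquad \tilde D_i = D_i - \hat m(Z_i),
$$
where the predictions for observation \(i\) come from models fit on the folds that exclude \(i\).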
Finally, using ordinary least squares of \(\tilde Y_i\) on \(\tilde D_i\), we obtain the estimate of \(\beta\).
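Explicitly, ignoring the intercept, the final-stage estimator is
$$
\hat \beta = \Big( \sum_{i=1}^n \tilde D_i^{\,2} \Big)^{-1} \sum_{i=1}^n \tilde D_i \, \tilde Y_i ,
$$
and the implementation below clusters the standard errors at the county level.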
The following algorithm consumes \(Y, D, Z\) and a machine learning method for learning the residuals \(\tilde Y\) and \(\tilde D\), where the residuals are obtained by cross-validation (cross-fitting). It then prints the estimated coefficient \(\beta\) and the corresponding standard error from the final OLS regression.
from sklearn.model_selection import KFold
def DML2_for_PLM(z, d, y, dreg, yreg, nfold, clu):
kf = KFold(n_splits = nfold, shuffle=True) #Here we use kfold to generate kfolds
I = np.arange(0, len(d)) #To have a id vector from data
train_id, test_id = [], [] #arrays to store kfold's ids
#generate and store kfold's id
for kfold_index in kf.split(I):
train_id.append(kfold_index[0])
test_id.append(kfold_index[1])
# Create array to save errors
dtil = np.zeros( len(z) ).reshape( len(z) , 1 )
ytil = np.zeros( len(z) ).reshape( len(z) , 1 )
# loop to save results
for b in range(0,len(train_id)):
        # fit the ML methods on the training folds (all folds except fold b)
dfit = dreg(z[train_id[b],], d[train_id[b],])
yfit = yreg(z[train_id[b],], y[train_id[b],])
        # predict on the held-out fold
dhat = dfit.predict( z[test_id[b],] )
yhat = yfit.predict( z[test_id[b],] )
# save errors
dtil[test_id[b]] = d[test_id[b],] - dhat.reshape( -1 , 1 )
ytil[test_id[b]] = y[test_id[b],] - yhat.reshape( -1 , 1 )
print(b, " ")
# Create dataframe
data_2 = pd.DataFrame(np.concatenate( ( ytil, dtil,clu ), axis = 1), columns = ['ytil','dtil','CountyCode'])
# OLS clustering at the County level
model = "ytil ~ dtil"
baseline_ols = smf.ols(model , data = data_2 ).fit().get_robustcov_results(cov_type = "cluster", groups= data_2['CountyCode'])
coef_est = baseline_ols.summary2().tables[1]['Coef.']['dtil']
se = baseline_ols.summary2().tables[1]['Std.Err.']['dtil']
Final_result = { 'coef_est' : coef_est , 'se' : se , 'dtil' : dtil , 'ytil' : ytil }
print("Coefficient is {}, SE is equal to {}".format(coef_est, se))
return Final_result
Now, we apply the Double Machine Learning (DML) approach with different machine learning methods; the relevant libraries were loaded above.
Let us construct the input matrices.
# Create main variables
Y = data['logghomr']
D = data['logfssl']
Z = data.drop(['logghomr', 'logfssl', 'CountyCode'], axis=1)
CLU = data['CountyCode']
# as matrix
y = Y.to_numpy().reshape( len(Y) , 1 )
d = D.to_numpy().reshape( len(Y) , 1 )
z = Z.to_numpy()
clu = CLU.to_numpy().reshape( len(Y) , 1 )
18.3.3. Lasso#
def dreg(z,d):
alpha=0.00000001
result = linear_model.Lasso(alpha = alpha).fit(z, d)
return result
def yreg(z,y):
alpha=0.00000001
result = linear_model.Lasso(alpha = alpha).fit(z, y)
return result
DML2_lasso = DML2_for_PLM(z, d, y, dreg, yreg, 10, clu)
0
1
2
3
4
5
6
7
8
9
Coefficient is 0.2173388201108385, SE is equal to 0.05589284340861423
18.3.3.1. We use SelectFromModel to select the variables with nonzero coefficients and then refit, reducing dimensionality as rlasso( post = T ) does#
class Lasso_post:
def __init__(self, alpha ):
self.alpha = alpha
def fit( self, X, Y ):
self.X = X
self.Y = Y
lasso = linear_model.Lasso( alpha = self.alpha ).fit( X , Y )
model = SelectFromModel( lasso , prefit = True )
X_new = model.transform( X )
        # get the indices of the columns selected by the Lasso
index_X = model.get_support()
self.index = index_X
new_x = X[ : , index_X ]
lasso2 = linear_model.Lasso( alpha = self.alpha ).fit( new_x , Y )
self.model = lasso2
return self
def predict( self , X ):
dropped_X = X[ : , self.index ]
predictions = self.model.predict( dropped_X )
return predictions
Run the code below to verify that Lasso_post works as expected.
def dreg(z,d):
alpha=0.00000001
result = Lasso_post( alpha = alpha ).fit( z , d )
return result
def yreg( z , y ):
alpha = 0.00000001
result = Lasso_post( alpha = alpha ).fit( z , y )
return result
DML2_lasso_post = DML2_for_PLM(z, d, y, dreg, yreg, 10, clu)
0
1
2
3
4
5
6
7
8
9
Coefficient is 0.2248307753410121, SE is equal to 0.05648034432412721
18.3.4. hdmpy#
We are going to replicate the above regressions using the hdmpy library.
import hdmpy
from statsmodels.tools import add_constant
class rlasso_sklearn:
def __init__(self, post ):
self.post = post
def fit( self, X, Y ):
self.X = X
self.Y = Y
        # fit rlasso from hdmpy on X and Y
self.rlasso_model = hdmpy.rlasso( X , Y , post = self.post )
return self
def predict( self , X_1 ):
self.X_1 = X_1
beta = self.rlasso_model.est['coefficients'].to_numpy()
if beta.sum() == 0:
prediction = np.repeat( self.rlasso_model.est['intercept'] , self.X_1.shape[0] )
else:
prediction = ( add_constant( self.X_1 , has_constant = 'add') @ beta ).flatten()
return prediction
# Post = false
def dreg(x, d):
result = rlasso_sklearn( post = False ).fit( x , d )
return result
def yreg(x,y):
result = rlasso_sklearn( post = False ).fit( x , y )
return result
DML2_lasso_hdmpy = DML2_for_PLM(z, d, y, dreg, yreg, 10, clu)
0
1
2
3
4
5
6
7
8
9
Coefficient is 0.22355861068775723, SE is equal to 0.056959040205356325
# Post = True
def dreg(x, d):
result = rlasso_sklearn( post = True ).fit( x , d )
return result
def yreg(x,y):
result = rlasso_sklearn( post = True ).fit( x , y )
return result
DML2_lasso_post_hdmpy = DML2_for_PLM(z, d, y, dreg, yreg, 10, clu)
0
1
2
3
4
5
6
7
8
9
Coefficient is 0.22778980282980787, SE is equal to 0.055391060681506045
We find that the results are close: there are no big differences between the sklearn and hdmpy lasso implementations.
18.3.5. cv.glmnet curiosities#
According to link1 and link2, cv.glmnet standardizes the X variables before estimation. In sklearn there is an option named normalize, which normalizes X by subtracting the mean and dividing by the l2-norm. Based on link3, cv.glmnet (standardize = TRUE) and sklearn (normalize = True) are not the same. We therefore use StandardScaler, which matches the suggestions made in link3, via the wrapper class below.
class standard_skl_model:
def __init__(self, model ):
self.model = model
def fit( self, X, Y ):
# Standarization of X and Y
self.scaler_X = StandardScaler()
self.scaler_X.fit( X )
std_X = self.scaler_X.transform( X )
self.model.fit( std_X , Y )
return self
    def predict( self , X ):
        # use the scaler fitted on the training data (do not refit it on the test fold)
        std_X = self.scaler_X.transform( X )
        prediction = self.model.predict( std_X )
        return prediction
Run the code below to verify that standard_skl_model works as expected.
#DML with cross-validated Lasso:
def dreg(z,d):
result = standard_skl_model( LassoCV(cv = 10 , random_state = 0 ) ).fit( z, d )
return result
def yreg(z,y):
result = standard_skl_model( LassoCV(cv = 10 , random_state = 0 ) ).fit( z, y )
return result
DML2_lasso_cv = DML2_for_PLM(z, d, y, dreg, yreg, 2 , clu)
0
1
Coefficient is 0.20674588093019308, SE is equal to 0.05530116123651258
# DML with cross-validated Elastic Net:
def dreg(z,d):
result = standard_skl_model( ElasticNetCV( cv = 10 , random_state = 0 , l1_ratio = 0.5, max_iter = 100000 ) ).fit( z, d )
return result
def yreg(z,y):
result = standard_skl_model( ElasticNetCV( cv = 10 , random_state = 0 , l1_ratio = 0.5, max_iter = 100000 ) ).fit( z, y )
return result
DML2_elnet = DML2_for_PLM(z, d, y, dreg, yreg, 2 , clu)
0
1
Coefficient is 0.22254221301343585, SE is equal to 0.053879833505691754
# DML with cross-validated Ridge (Elastic Net with l1_ratio close to zero):
def dreg(z,d):
result = standard_skl_model( ElasticNetCV( cv = 10 , random_state = 0 , l1_ratio = 0.0001 ) ).fit( z, d )
return result
def yreg(z,y):
result = standard_skl_model( ElasticNetCV( cv = 10 , random_state = 0 , l1_ratio = 0.0001 ) ).fit( z, y )
return result
DML2_ridge = DML2_for_PLM(z, d, y, dreg, yreg, 2, clu)
0
1
Coefficient is 0.19639693752730308, SE is equal to 0.05303210277187753
Here we also compute DML with OLS used as the ML method.
# DML with OLS:
def dreg(z,d):
result = LinearRegression().fit( z, d )
return result
def yreg(z,y):
result = LinearRegression().fit( z, y )
return result
DML2_ols = DML2_for_PLM(z, d, y, dreg, yreg, 2, clu)
0
1
Coefficient is 0.1867889216454423, SE is equal to 0.05529603580186457
Next, we also apply Random Forest for comparison purposes.
18.3.6. Random Forest#
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# DML with Random Forest:
def dreg(z,d):
result = RandomForestRegressor( random_state = 0 , n_estimators = 500 , max_features = 65 , n_jobs = 4 , min_samples_leaf = 5 ).fit( z, d )
return result
def yreg(z,y):
result = RandomForestRegressor( random_state = 0 , n_estimators = 500 , max_features = 65 , n_jobs = 4 , min_samples_leaf = 5 ).fit( z, y )
return result
DML2_RF = DML2_for_PLM(z, d, y, dreg, yreg, 2, clu) # set to 2 due to computation time
0
1
Coefficient is 0.1714420140198645, SE is equal to 0.05677223302029048
We conclude that gun ownership rates are related to gun homicide rates: if gun ownership increases by 1% relative to a trend, then the predicted gun homicide rate goes up by about 0.20%, controlling for counties’ characteristics.
Finally, let us see which method predicts best: we compute the RMSE for predicting \(D\) and \(Y\) with each method.
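That is, for each method we compute the root mean squared cross-fitted residuals,
$$
\mathrm{RMSE}_Y = \sqrt{\tfrac{1}{n}\sum_{i=1}^n \tilde Y_i^{\,2}}, \qquad
\mathrm{RMSE}_D = \sqrt{\tfrac{1}{n}\sum_{i=1}^n \tilde D_i^{\,2}}.
$$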
mods = [DML2_ols, DML2_lasso, DML2_lasso_post , DML2_lasso_cv, DML2_ridge, DML2_elnet, DML2_RF]
mods_name = ["DML2_ols", "DML2_lasso", "DML2_lasso_post" , "DML2_lasso_cv", 'DML2_ridge', 'DML2_elnet', 'DML2_RF']
def mdl( model , model_name ):
    RMSEY = np.sqrt( np.mean( model[ 'ytil' ] ** 2 ) )  # root mean squared cross-fitted residual for Y
    RMSED = np.sqrt( np.mean( model[ 'dtil' ] ** 2 ) )  # root mean squared cross-fitted residual for D
result = pd.DataFrame( { model_name : [ RMSEY , RMSED ]} , index = [ 'RMSEY' , 'RMSED' ])
return result
RES = [ mdl( model , name ) for model, name in zip( mods , mods_name ) ]
pr_Res = pd.concat( RES, axis = 1)
pr_Res
|   | DML2_ols | DML2_lasso | DML2_lasso_post | DML2_lasso_cv | DML2_ridge | DML2_elnet | DML2_RF |
|---|---|---|---|---|---|---|---|
| RMSEY | 0.005623 | 0.000244 | 0.000479 | 1.764970e-18 | 7.401487e-19 | 2.391250e-18 | 0.002519 |
| RMSED | 0.000370 | 0.000527 | 0.000086 | 9.251859e-20 | 9.963540e-20 | 1.138690e-19 | 0.000603 |
18.3.6.1. This verifies that the function DML2_for_PLM has no errors (no residual entries are left at zero)#
np.where(DML2_lasso_post[ 'ytil' ] == 0)[0].size
0
It looks like the best method for predicting \(D\) is CV Lasso, and the best method for predicting \(Y\) is CV Ridge.
# DML with the best ML methods: CV Lasso for D and CV Ridge for Y:
def dreg(z,d):
result = standard_skl_model( LassoCV(cv = 10 , random_state = 0 , alphas = [0]) ).fit( z, d )
return result
def yreg(z,y):
result = standard_skl_model( ElasticNetCV( cv = 10 , random_state = 0 , l1_ratio = 0.0001 ) ).fit( z, y )
return result
DML2_best = DML2_for_PLM(z, d, y , dreg, yreg, 10, clu)
0
1
2
3
4
5
6
7
8
9
Coefficient is 0.20611237089029483, SE is equal to 0.052394355011270544
table = np.zeros( ( 9 , 2 ))
table[ 0 , 0] = baseline_ols_table.iloc[ 1 , 0 ]
table[ 1 , 0] = control_ols_table.iloc[ 1 , 0 ]
table[ 2 , 0] = DML2_lasso['coef_est']
table[ 3 , 0] = DML2_lasso_post['coef_est']
table[ 4 , 0] = DML2_lasso_cv['coef_est']
table[ 5 , 0] = DML2_ridge['coef_est']
table[ 6 , 0] = DML2_elnet['coef_est']
table[ 7 , 0] = DML2_RF['coef_est']
table[ 8 , 0] = DML2_best['coef_est']
table[ 0 , 1] = baseline_ols_table.iloc[ 1 , 1 ]
table[ 1 , 1] = control_ols_table.iloc[ 1 , 1 ]
table[ 2 , 1] = DML2_lasso['se']
table[ 3 , 1] = DML2_lasso_post['se']
table[ 4 , 1] = DML2_lasso_cv['se']
table[ 5 , 1] = DML2_ridge['se']
table[ 6 , 1] = DML2_elnet['se']
table[ 7 , 1] = DML2_RF['se']
table[ 8 , 1] = DML2_best['se']
table = pd.DataFrame( table , index = [ "Baseline OLS", "Least Squares with controls", "Lasso", \
                                        "Post-Lasso", "CV Lasso", "CV Ridge", "CV Elnet", "Random Forest", "Best" ] , \
                      columns = ["Estimate","Standard Error"] )
table.round( 3 )
|   | Estimate | Standard Error |
|---|---|---|
| Baseline OLS | 0.282 | 0.065 |
| Least Squares with controls | 0.192 | 0.053 |
| Lasso | 0.217 | 0.056 |
| Post-Lasso | 0.225 | 0.056 |
| CV Lasso | 0.207 | 0.055 |
| CV Ridge | 0.196 | 0.053 |
| CV Elnet | 0.223 | 0.054 |
| Random Forest | 0.171 | 0.057 |
| Best | 0.206 | 0.052 |