22. Debiased ML for Partially Linear Model in R

This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model.

References:

https://arxiv.org/abs/1608.00060

https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778

The code is based on the book (the second reference above).

22.1. DML algorithm

Here we perform estimation and inference on the predictive coefficient \(\alpha\) in the partially linear statistical model,

\[ Y = D\alpha + g(X) + U, \quad E (U | D, X) = 0. \]

For \(\tilde Y = Y- E(Y|X)\) and \(\tilde D= D- E(D|X)\), subtracting the conditional expectation \(E(Y|X) = \alpha E(D|X) + g(X)\) from the model equation, we can write

\[ \tilde Y = \alpha \tilde D + U, \quad E (U |\tilde D) =0. \]

The parameter \(\alpha\) is then estimated by regressing \(\tilde Y\) on \(\tilde D\), using a cross-fitting approach to obtain the residuals \(\tilde D\) and \(\tilde Y\).

The algorithm consumes \(Y, D, X\), together with machine learning methods for learning the residuals \(\tilde Y\) and \(\tilde D\), where the residuals are obtained by cross-fitting (sample splitting).
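Concretely, the final-stage estimate is the least-squares coefficient from regressing the cross-fitted \(\tilde Y\) on \(\tilde D\),

\[ \hat \alpha = \left( \sum_{i=1}^n \tilde D_i^2 \right)^{-1} \sum_{i=1}^n \tilde D_i \tilde Y_i, \]

paired with a heteroskedasticity-robust standard error for inference. (The code below also includes an intercept in this final regression; it is asymptotically negligible because the residuals are approximately mean zero.)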

The statistical parameter \(\alpha\) has a causal interpretation as the effect of \(D\) on \(Y\) in the causal DAG

\[ D\to Y, \quad X\to (D,Y) \]

or the counterfactual outcome model with conditionally exogenous (conditionally random) assignment of treatment \(D\) given \(X\):

\[ Y(d) = d\alpha + g(X) + U(d),\quad U(d) \text{ indep } D |X, \quad Y = Y(D), \quad U = U(D). \]
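Before turning to real data, here is a minimal sanity-check simulation (illustrative only; the variable names below are introduced here and are not part of the original analysis). With a linear \(g\), plain OLS can act as the learner, and the partialling-out regression recovers \(\alpha\):

set.seed(123)
n <- 1000; p <- 5
X <- matrix(rnorm(n * p), n, p)
alpha <- 0.5
D <- drop(X %*% rep(1, p)) + rnorm(n)                 # treatment depends on X
Y <- alpha * D + drop(X %*% rep(0.5, p)) + rnorm(n)   # g(X) is linear in this toy example
Dtil <- resid(lm(D ~ X))  # residualize D on X
Ytil <- resid(lm(Y ~ X))  # residualize Y on X
coef(lm(Ytil ~ Dtil))[2]  # should be close to alpha = 0.5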
install.packages("librarian")
# librarian::shelf installs any missing packages and attaches them
librarian::shelf(
  AER, randomForest, hdm, glmnet
)
DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2) {
  nobs <- nrow(x) # number of observations
  foldid <- rep.int(1:nfold, times = ceiling(nobs/nfold))[sample.int(nobs)] # define fold indices
  I <- split(1:nobs, foldid) # split observation indices into folds
  ytil <- dtil <- rep(NA, nobs)
  cat("fold: ")
  for(b in 1:length(I)){
    dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) # take a fold out
    yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out
    dhat <- predict(dfit, x[I[[b]],], type="response") # predict the left-out fold
    yhat <- predict(yfit, x[I[[b]],], type="response") # predict the left-out fold
    dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold
    ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residual for the left-out fold
    cat(b," ")
  }
  rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other
  coef.est <- coef(rfit)[2] # extract coefficient
  se <- sqrt(vcovHC(rfit)[2,2]) # robust standard error (vcovHC from sandwich, loaded via AER)
  cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est, se)) # print output
  return( list(coef.est=coef.est, se=se, dtil=dtil, ytil=ytil) ) # save output and residuals
}
# library(AER)  #applied econometrics library
# library(randomForest)  #random Forest library
# library(hdm) #high-dimensional econometrics library
# library(glmnet) #glm net

data(GrowthData)                     # Barro-Lee growth data
y= as.matrix(GrowthData[,1])         # outcome: growth rate
d= as.matrix(GrowthData[,3])         # treatment: initial wealth
x= as.matrix(GrowthData[,-c(1,2,3)]) # controls: country characteristics

cat(sprintf("\n length of y is %g \n", length(y) ))
cat(sprintf("\n num features x is %g \n", dim(x)[2] ))


#summary(y)
#summary(d)
#summary(x)

cat(sprintf("\n Naive OLS that uses all features w/o cross-fitting \n"))
lres = summary(lm(y ~ d + x))$coef[2, 1:2]

cat(sprintf("\ncoef (se) = %g (%g)\n", lres[1] , lres[2]))



#DML with OLS


cat(sprintf("\n DML with OLS w/o feature selection \n"))

set.seed(1)
dreg <- function(x,d){ glmnet(x, d, lambda = 0) } #ML method = OLS, via glmnet with lambda=0 (lm breaks the generic predict call used above)
yreg <- function(x,y){ glmnet(x, y, lambda = 0) } #ML method = OLS

DML2.OLS = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)


#DML with Lasso:

cat(sprintf("\n DML with Lasso \n"))

set.seed(1)
dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method= lasso from hdm 
yreg <- function(x,y){ rlasso(x,y, post=FALSE) } #ML method = lasso from hdm
DML2.lasso = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)

cat(sprintf("\n DML with Random Forest \n"))

#DML with Random Forest:
dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest 
yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest
set.seed(1)
DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)


cat(sprintf("\n DML with Lasso/Random Forest \n"))

#DML MIX:
dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method = lasso from hdm
yreg <- function(x,y){ randomForest(x, y) } #ML method = random forest
set.seed(1)
DML2.mix = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)
 length of y is 90 

 num features x is 60 

 Naive OLS that uses all features w/o cross-fitting 

coef (se) = -0.00937799 (0.0298877)

 DML with OLS w/o feature selection 
fold: 1  2  3  4  5  6  7  8  9  10  
coef (se) = 0.01013 (0.0167061)

 DML with Lasso 
fold: 1  2  3  4  5  6  7  8  9  10  
coef (se) = -0.0352021 (0.0161357)

 DML with Random Forest 
fold: 1  2  3  4  5  6  7  8  9  10  
coef (se) = -0.0365831 (0.0151539)

 DML with Lasso/Random Forest 
fold: 1  2  3  4  5  6  7  8  9  10  
coef (se) = -0.0375019 (0.0135088)
prRes.D <- c(mean((DML2.OLS$dtil)^2), mean((DML2.lasso$dtil)^2), mean((DML2.mix$dtil)^2))
prRes.Y <- c(mean((DML2.OLS$ytil)^2), mean((DML2.lasso$ytil)^2), mean((DML2.mix$ytil)^2))
prRes <- rbind(sqrt(prRes.D), sqrt(prRes.Y))
rownames(prRes) <- c("RMSE D", "RMSE Y")
colnames(prRes) <- c("OLS", "Lasso", "Mix")
print(prRes, digits=2)
         OLS Lasso   Mix
RMSE D 0.467 0.372 0.372
RMSE Y 0.054 0.052 0.046
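As a quick follow-up (an illustrative snippet; `best.D` and `best.Y` are names introduced here), we can read off which learner achieves the smallest cross-fitted residual RMSE for each equation, which is exactly the logic motivating the lasso/forest mix:

best.D <- colnames(prRes)[which.min(prRes["RMSE D", ])] # best learner for the D equation
best.Y <- colnames(prRes)[which.min(prRes["RMSE Y", ])] # best learner for the Y equation
cat(sprintf("Smallest residual RMSE: %s for D, %s for Y\n", best.D, best.Y))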