Linear Model Overfitting

# !wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# !apt update -q
# !apt install cuda gcc-6 g++-6 -y -q
# !ln -s /usr/bin/gcc-6 /usr/local/cuda/bin/gcc
# !ln -s /usr/bin/g++-6 /usr/local/cuda/bin/g++

# !curl -sSL "https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.3-linux-x86_64.tar.gz" -o julia.tar.gz
# !tar -xzf julia.tar.gz -C /usr --strip-components 1
# !rm -rf julia.tar.gz*
# !julia -e 'using Pkg; pkg"add IJulia; precompile"'

1. Linear Model Overfitting

# If necessary, install the required packages
# import Pkg; Pkg.add("GLM")
# import Pkg; Pkg.add("DataFrames")

# Import packages
using LinearAlgebra, GLM, DataFrames, Statistics, Random

First, set p = n.

# Fix the RNG seed so the simulated data are reproducible
Random.seed!(1234)

# Use as many regressors as observations
n = p = 1000

# n×p design matrix whose entries are standard Gaussian draws
X = randn(n, p)

# Response: a length-n vector of standard Gaussian draws
Y = randn(n)

# We cannot run the regression below with `lm`: it raises an error unless n > p
# (probably because of the matrix decomposition procedure it uses), so the call
# is left commented out.
# Fitted linear regression
# fitted = lm(X,Y)

# This function returns the OLS coefficients, R2, and adjusted R2

"""
    OLSestimator(Y, X)

Fit `Y ≈ X * β` by ordinary least squares and report goodness-of-fit measures.

# Arguments
- `Y`: response values, one per row of `X`.
- `X`: design matrix; it is used as given (no intercept column is added).

# Returns
A tuple `(β, R_squared, R_squared_adj)`:
- `β`: the least-squares coefficients.
- `R_squared`: `1 - SSR / SST`, where `SST` is measured around `mean(Y)`.
- `R_squared_adj`: `1 - (1 - R²) * (n - 1) / (n - p - 1)` with `n = size(Y, 1)` and
  `p = size(X, 2)`; it is not finite when `n == p + 1`.

# Throws
- `DimensionMismatch` if `X` and `Y` do not have the same number of rows.
"""
function OLSestimator(Y, X)
    n = size(Y, 1)
    p = size(X, 2)
    size(X, 1) == n || throw(
        DimensionMismatch("X has $(size(X, 1)) rows but Y has $n observations"),
    )

    # Solve the least-squares problem directly. Forming inv(X'X) squares the
    # condition number of X and loses accuracy, which matters most when p is
    # close to n.
    β = X \ Y
    errors = Y - X * β

    R_squared = 1.0 - sum(abs2, errors) / sum(abs2, Y .- mean(Y))
    R_squared_adj = 1.0 - (1.0 - R_squared) * (n - 1.0) / (n - p - 1.0)

    return β, R_squared, R_squared_adj
end

# Run the hand-written OLS estimator on the simulated data; the returned tuple
# holds (coefficients, R2, adjusted R2).
results_ols = OLSestimator(Y, X)

# Print each label followed by its value on the next line
for (label, value) in (
    ("p/n is", p / n),
    ("R2 is", results_ols[2]),
    ("Adjusted R2 is", results_ols[3]),
)
    println(label)
    println(value)
end

p/n is
1.0
R2 is
1.0
Adjusted R2 is
1.0

Second, set p = n/2.

# `n / 2` would produce a Float64, but `randn` needs integer dimensions, so use
# integer division to keep p an Int without a float round-trip.
n = 1000;
p = n ÷ 2;

# Draw an n×p design matrix of standard Gaussians
X = randn(n, p);

# Draw a length-n response vector of standard Gaussians
Y = randn(n);

# Fit the linear regression of Y on X with GLM
fitted = lm(X, Y);

# Print each label followed by its value on the next line
println("p/n is\n", p / n)
println("R2 is\n", r2(fitted))
println("Adjusted R2 is\n", adjr2(fitted))

p/n is
0.5
R2 is
0.4790545125058999
Adjusted R2 is
-0.040849084013211856

Third, set p/n = 0.05.

n = 1000
# Round to the nearest integer: `Int(0.05 * n)` throws an InexactError whenever
# the floating-point product is not exactly integral (e.g. n = 1100).
p = round(Int, 0.05 * n)

# Draw an n×p design matrix of standard Gaussians
X = randn(n, p)

# Draw a length-n response vector of standard Gaussians
Y = randn(n)

# Fit the linear regression of Y on X with GLM
fitted = lm(X, Y)

# Print each label followed by its value on the next line
println("p/n is\n", p / n)
println("R2 is\n", r2(fitted))
println("Adjusted R2 is\n", adjr2(fitted))

p/n is
0.05
R2 is
0.0462927046544056
Adjusted R2 is
-0.002898513737104036