1. Linear Model Overfitting#

1.1. First, set p = n#

# Package setup.
# librarian is only a bootstrap helper, so install it just when it is missing:
# an unconditional install.packages() call hits the network on every run and
# emits repository warnings when the machine is offline, even if the package
# is already available locally.
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian", quiet = TRUE)
}

# shelf() attaches the listed packages (installing any that are missing).
librarian::shelf(
  tidyverse, broom, glue,
  quiet = TRUE
)
# Case 1: as many predictors as observations (p = n).
# X and Y are independent Gaussian noise, so any fit is pure overfitting.
set.seed(123)
n <- 1000
p <- n

X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)

# Regress the noise response on all noise predictors.
mdl_0 <- lm(Y ~ X)

# Report the predictor/observation ratio next to both R-squared measures.
# with() evaluates the summary once and exposes its components by name;
# p and n are still resolved from the calling environment.
with(
  summary(mdl_0),
  cat(
    "- p/n is: ", p / n,
    "\n- R2 is: ", r.squared,
    "\n- Adjusted r2 is: ", adj.r.squared
  )
)
Warning message:
"unable to access index for repository https://cran.r-project.org/src/contrib:
  cannot open URL 'https://cran.r-project.org/src/contrib/PACKAGES'"
Warning message:
"package 'librarian' is not available for this version of R

A version of this package for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages"
Warning message:
"unable to access index for repository https://cran.r-project.org/bin/windows/contrib/4.2:
  cannot open URL 'https://cran.r-project.org/bin/windows/contrib/4.2/PACKAGES'"
- p/n is:  1 
- R2 is:  1 
- Adjusted r2 is:  NaN
# Peek at both ends of the coefficient table for the p = n fit:
# rows 1-5 and rows 995-1000 of the tidy() output.
dplyr::slice(broom::tidy(mdl_0), 1:5, 995:1000)
A tibble: 11 × 5
term         estimate      std.error  statistic  p.value
<chr>        <dbl>         <dbl>      <dbl>      <dbl>
(Intercept)   0.008655314  NaN        NaN        NaN
X1            0.209114583  NaN        NaN        NaN
X2            1.132843246  NaN        NaN        NaN
X3           -0.165922653  NaN        NaN        NaN
X4            0.342487380  NaN        NaN        NaN
X994         -1.015520198  NaN        NaN        NaN
X995         -1.709652513  NaN        NaN        NaN
X996          2.107076237  NaN        NaN        NaN
X997          1.250854062  NaN        NaN        NaN
X998          1.513957764  NaN        NaN        NaN
X999          0.800422182  NaN        NaN        NaN

1.2. Second, set p=n/2.#

# Case 2: half as many predictors as observations (p = n / 2).
# Same pure-noise design as before, only the predictor count changes.
set.seed(123)
n <- 1000
p <- n / 2

X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)

# Regress the noise response on all noise predictors.
mdl_1 <- lm(Y ~ X)

# Report the predictor/observation ratio next to both R-squared measures.
# with() evaluates the summary once and exposes its components by name;
# p and n are still resolved from the calling environment.
with(
  summary(mdl_1),
  cat(
    "- p/n is: ", p / n,
    "\n- R2 is: ", r.squared,
    "\n- Adjusted r2 is: ", adj.r.squared
  )
)
- p/n is:  0.5 
- R2 is:  0.4922339 
- Adjusted r2 is:  -0.01654975

1.3. Third, set p/n = 0.05#

# Case 3: few predictors relative to observations (p / n = 0.05).
# Same pure-noise design as the earlier cases, only the predictor count changes.
set.seed(123)
n <- 1000
p <- 0.05 * n

X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)

# Regress the noise response on all noise predictors.
# The model is named mdl_2 to follow the mdl_0 / mdl_1 naming of the earlier
# cases; the original code stored it under the misspelt name `mld_2`.
mdl_2 <- lm(Y ~ X)
# Backward-compatible alias in case other code still refers to the old name.
mld_2 <- mdl_2

# Report the statistics of THIS fit. The original code summarised mdl_1 here,
# which silently re-printed the p = n / 2 results under the p / n = 0.05 label.
# with() evaluates the summary once and exposes its components by name;
# p and n are still resolved from the calling environment.
with(
  summary(mdl_2),
  cat(
    "- p/n is: ", p / n,
    "\n- R2 is: ", r.squared,
    "\n- Adjusted r2 is: ", adj.r.squared
  )
)
- p/n is:  0.05 
- R2 is:  0.4922339 
- Adjusted r2 is:  -0.01654975