n <- 100
p <- 200
X <- matrix(rnorm(n*p), nrow = n)
Y <- rnorm(n)
Note: this is an alternative method to cross-validation
# Compute lasso path
# while storing information required for
# conditional (truncated) inference
lasso_path <- lar(X, Y)
# Use AIC/BIC to choose model size
# and compute adjusted p-values
# in higher dimensions change mult
# to log(n) for BIC, log(p) for RIC
lasso_inference <- larInf(lasso_path, mult = 2, type = "aic", ntimes = 1)
## Warning in larInf(lasso_path, mult = 2, type = "aic", ntimes = 1): p > n/
## 2, and sd(y) = 0.999 used as an estimate of sigma; you may want to use the
## estimateSigma function
Selected predictors
active_set <- lasso_inference$vars
active_set
## [1] 186 65 142 97 57 104 4 64
(Use summary()
or tidy()
from the broom
package)
OLS_fit <- lm(Y ~ X[, active_set] - 1)
tidy(OLS_fit)
## # A tibble: 8 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 X[, active_set]1 0.130 0.110 1.18 0.240
## 2 X[, active_set]2 0.158 0.0836 1.89 0.0622
## 3 X[, active_set]3 -0.238 0.0903 -2.64 0.00974
## 4 X[, active_set]4 -0.129 0.0865 -1.50 0.138
## 5 X[, active_set]5 -0.175 0.0883 -1.98 0.0507
## 6 X[, active_set]6 -0.213 0.0982 -2.17 0.0329
## 7 X[, active_set]7 -0.178 0.0965 -1.85 0.0682
## 8 X[, active_set]8 0.161 0.0894 1.81 0.0742
These p-values (from the larInf
function in the selectiveInference
package above) are calculated from a conditional distribution, conditional on the model selection
lasso_inference
##
## Call:
## larInf(obj = lasso_path, type = "aic", mult = 2, ntimes = 1)
##
## Standard deviation of noise (specified or estimated) sigma = 0.999
##
## Testing results at step = 8, with alpha = 0.100
##
## Var Coef Z-score P-value LowConfPt UpConfPt LowTailArea UpTailArea
## 186 0.140 1.114 0.670 -Inf 5.946 0.00 0.05
## 65 0.151 1.586 0.712 -Inf 4.277 0.00 0.05
## 142 -0.230 -2.233 0.454 -Inf Inf 0.00 0.00
## 97 -0.135 -1.370 0.406 -8.278 5.559 0.05 0.05
## 57 -0.180 -1.793 0.548 -Inf Inf 0.00 0.00
## 104 -0.213 -1.912 0.193 -Inf 7.948 0.00 0.05
## 4 -0.174 -1.594 0.750 -5.464 Inf 0.05 0.00
## 64 0.155 1.517 0.377 -Inf Inf 0.00 0.00
##
## Estimated stopping point from AIC rule = 8
Y
to be generated from a coefficient vector where only the first 5 entries are nonzero and then run the code again a few more times