## Inference for the lasso

#### Generate data for a high-dimensional regression

n <- 100
p <- 200
X <- matrix(rnorm(n*p), nrow = n)
Y <- rnorm(n)

#### Fit the lasso solution path and choose a sparsity level automatically

Note: this is an alternative method to cross-validation

# Compute lasso path
# while storing information required for
# conditional (truncated) inference
lasso_path <- lar(X, Y)
# Use AIC/BIC to choose model size
# in higher dimensions change mult
# to log(n) for BIC, log(p) for RIC
lasso_inference <- larInf(lasso_path, mult = 2, type = "aic", ntimes = 1)
## Warning in larInf(lasso_path, mult = 2, type = "aic", ntimes = 1): p > n/
## 2, and sd(y) = 0.999 used as an estimate of sigma; you may want to use the
## estimateSigma function

Selected predictors

active_set <- lasso_inference\$vars
active_set
##  186  65 142  97  57 104   4  64

#### Fit an OLS model using only selected variables, compute p-values the classical way

(Use summary() or tidy() from the broom package)

OLS_fit <- lm(Y ~ X[, active_set] - 1)
tidy(OLS_fit)
## # A tibble: 8 x 5
##   term             estimate std.error statistic p.value
##   <chr>               <dbl>     <dbl>     <dbl>   <dbl>
## 1 X[, active_set]1    0.130    0.110       1.18 0.240
## 2 X[, active_set]2    0.158    0.0836      1.89 0.0622
## 3 X[, active_set]3   -0.238    0.0903     -2.64 0.00974
## 4 X[, active_set]4   -0.129    0.0865     -1.50 0.138
## 5 X[, active_set]5   -0.175    0.0883     -1.98 0.0507
## 6 X[, active_set]6   -0.213    0.0982     -2.17 0.0329
## 7 X[, active_set]7   -0.178    0.0965     -1.85 0.0682
## 8 X[, active_set]8    0.161    0.0894      1.81 0.0742

These p-values (from the larInf function in the selectiveInference package above) are calculated from a conditional distribution, conditional on the model selection

lasso_inference
##
## Call:
## larInf(obj = lasso_path, type = "aic", mult = 2, ntimes = 1)
##
## Standard deviation of noise (specified or estimated) sigma = 0.999
##
## Testing results at step = 8, with alpha = 0.100
##
##  Var   Coef Z-score P-value LowConfPt UpConfPt LowTailArea UpTailArea
##  186  0.140   1.114   0.670      -Inf    5.946        0.00       0.05
##   65  0.151   1.586   0.712      -Inf    4.277        0.00       0.05
##  142 -0.230  -2.233   0.454      -Inf      Inf        0.00       0.00
##   97 -0.135  -1.370   0.406    -8.278    5.559        0.05       0.05
##   57 -0.180  -1.793   0.548      -Inf      Inf        0.00       0.00
##  104 -0.213  -1.912   0.193      -Inf    7.948        0.00       0.05
##    4 -0.174  -1.594   0.750    -5.464      Inf        0.05       0.00
##   64  0.155   1.517   0.377      -Inf      Inf        0.00       0.00
##
## Estimated stopping point from AIC rule = 8