Statistical Feature Selection Methods
Feature selection removes irrelevant and redundant features to reduce overfitting, speed up training, improve interpretability, and sometimes improve accuracy. Filter methods score features with statistical tests that are independent of any model (fast, with little risk of overfitting to a particular model). Wrapper methods evaluate feature subsets by model performance (slow but model-aware). Embedded methods perform selection as part of training itself (e.g. Lasso coefficients, tree feature importances).
Filter, Wrapper, and Embedded Feature Selection
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import (SelectKBest, f_classif, mutual_info_classif,
                                        chi2, RFE, RFECV, VarianceThreshold,
                                        SelectFromModel)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
feature_names = np.array(cancer.feature_names)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# 1. FILTER METHODS (model-independent)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Variance threshold -- remove near-constant features
# (scale to [0, 1] first; after StandardScaler every feature has variance 1, so thresholding would remove nothing)
vt = VarianceThreshold(threshold=0.01)
X_vt = vt.fit_transform(MinMaxScaler().fit_transform(X))
print(f"Variance threshold: {X.shape[1]} -> {X_vt.shape[1]} features")
# ANOVA F-test (for numeric features, classification)
selector_f = SelectKBest(score_func=f_classif, k=10)
selector_f.fit(X, y)
f_scores = pd.DataFrame({"Feature": feature_names, "F_Score": selector_f.scores_, "p_value": selector_f.pvalues_})
print("\nTop 10 by ANOVA F-score:")
print(f_scores.nlargest(10, "F_Score")[["Feature", "F_Score", "p_value"]].round(2).to_string(index=False))
# Mutual Information (handles non-linear relationships, no distributional assumptions)
selector_mi = SelectKBest(score_func=mutual_info_classif, k=10)  # SelectKBest itself takes no n_jobs argument
selector_mi.fit(X, y)
mi_scores = pd.DataFrame({"Feature": feature_names, "MI_Score": selector_mi.scores_})
print("\nTop 10 by Mutual Information:")
print(mi_scores.nlargest(10, "MI_Score").round(4).to_string(index=False))
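# Chi-squared test (chi2 is imported above but never used; it requires non-negative features).
# A possible usage, assuming the features are first rescaled to [0, 1] with MinMaxScaler:
X_minmax = MinMaxScaler().fit_transform(X)
selector_chi2 = SelectKBest(score_func=chi2, k=10)
selector_chi2.fit(X_minmax, y)
print("\nTop 10 by chi-squared:")
print(list(feature_names[selector_chi2.get_support()]))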
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# 2. WRAPPER METHODS: RFE (Recursive Feature Elimination)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
rf_base = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
rfe = RFE(estimator=rf_base, n_features_to_select=15, step=2)
rfe.fit(X, y)
selected_rfe = feature_names[rfe.support_]
print(f"\nRFE selected {len(selected_rfe)} features:")
print(list(selected_rfe))
# RFECV: automatically finds optimal number of features via cross-validation
rfecv = RFECV(estimator=rf_base, step=2, cv=5, scoring="roc_auc", n_jobs=-1)
rfecv.fit(X, y)
print(f"\nRFECV optimal n_features: {rfecv.n_features_} (automatically tuned)")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# 3. EMBEDDED METHODS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Lasso: sets unimportant feature weights to exactly 0
lasso_pipe = Pipeline([("scaler", StandardScaler()), ("lasso", Lasso(alpha=0.01, max_iter=5000))])
sfm_lasso = SelectFromModel(lasso_pipe, threshold=1e-4)
sfm_lasso.fit(X, y.astype(float))
print(f"\nLasso SelectFromModel: {X.shape[1]} -> {sfm_lasso.transform(X).shape[1]} features")
# Tree importance: select above-mean importance
sfm_rf = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold="mean")
sfm_rf.fit(X, y)
print(f"RF SelectFromModel: {X.shape[1]} -> {sfm_rf.transform(X).shape[1]} features")
# COMPARE ALL SELECTION STRATEGIES
print("\nModel AUC-ROC with different feature sets:")
for name, X_sel in [("All features", X),
(f"Top 10 (F-test)", selector_f.transform(X)),
(f"Top 10 (MI)", selector_mi.transform(X)),
(f"RFE ({len(selected_rfe)})", rfe.transform(X)),
(f"Lasso select", sfm_lasso.transform(X))]:
pipe = Pipeline([("sc", StandardScaler()), ("m", GradientBoostingClassifier(n_estimators=100, random_state=42))])
auc = cross_val_score(pipe, X_sel, y, cv=5, scoring="roc_auc").mean()
n_f = X_sel.shape[1]
print(f" {name:22s} ({n_f:2d} feats): AUC={auc:.4f}")Tip
Tip
Practice Statistical Feature Selection Methods in small, isolated examples before integrating into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
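For instance, a small isolated experiment might look like the following sketch (make_classification and the toy feature counts are illustrative choices, not part of the example above):
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

# Toy data: 8 features, only 3 of which are informative
X_toy, y_toy = make_classification(n_samples=200, n_features=8, n_informative=3,
                                   n_redundant=2, random_state=0)
sel = SelectKBest(score_func=f_classif, k=3).fit(X_toy, y_toy)
print("F-scores per feature:", sel.scores_.round(1))
print("Kept feature indices:", sel.get_support(indices=True))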
In practice, careful feature engineering and selection often contribute more to model quality than the choice of algorithm.
Practice Task
Note
Practice Task — (1) Write a working example of Statistical Feature Selection Methods from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
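One possible starting point for parts (1) and (2) is sketched below: an ANOVA F-score filter written by hand with scipy, plus a guard for empty input. The function name anova_f_scores and its structure are illustrative, not a reference solution.
import numpy as np
from scipy.stats import f_oneway

def anova_f_scores(X, y):
    """Per-feature ANOVA F-statistic for a classification target."""
    X = np.asarray(X, dtype=float)
    if X.size == 0:
        raise ValueError("X is empty -- nothing to score")  # edge case from part (2)
    classes = np.unique(y)
    # For each feature, compare its values across the target classes
    return np.array([f_oneway(*(X[y == c, j] for c in classes)).statistic
                     for j in range(X.shape[1])])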
Common Mistake
Warning
A common mistake with Statistical Feature Selection Methods is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.
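As a minimal sketch, a boundary check like the one below could run before any selector is fit (the helper name and error messages are illustrative):
import numpy as np

def validate_for_selection(X, y):
    X, y = np.asarray(X, dtype=float), np.asarray(y)
    if X.ndim != 2 or X.shape[0] == 0:
        raise ValueError("X must be a non-empty 2-D array")
    if np.isnan(X).any():
        raise ValueError("X contains NaN values -- impute or drop them before selection")
    if X.shape[0] != y.shape[0]:
        raise ValueError("X and y have mismatched numbers of samples")
    return X, y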