Custom Transformers — Extending Sklearn
Sklearn's built-in transformers cover most cases, but every real project needs domain-specific transformations: clip outliers at specific business-defined thresholds, compute ratio features from two others, apply log only when a column is sufficiently skewed. By subclassing BaseEstimator and TransformerMixin you get full pipeline compatibility — the custom transformer sits in any Pipeline or ColumnTransformer.
Building Custom Sklearn-Compatible Transformers
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
# Reproducibility: fix the global NumPy RNG seed. NOTE: the statement order
# below matters — every np.random call consumes draws from this one stream,
# so reordering columns would change the generated data.
np.random.seed(42)
N = 1500
# Synthetic credit-risk dataset: right-skewed monetary columns (exponential),
# roughly normal credit score and age, and an imbalanced (82/18) binary target.
# .clip(...) bounds each column to a plausible business range.
df = pd.DataFrame({
"income": np.random.exponential(55000, N).clip(15000, 300000),
"loan_amt": np.random.exponential(18000, N).clip(1000, 80000),
"credit": np.random.normal(680, 80, N).clip(300, 850),
"age": np.random.normal(38, 12, N).clip(18, 75),
"default": np.random.choice([0,1], N, p=[0.82,0.18]),
})
# Features / target split; hold out 20% for a final test set.
X = df.drop("default", axis=1)
y = df["default"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# CUSTOM TRANSFORMER 1: WinsorizeTransformer
class WinsorizeTransformer(BaseEstimator, TransformerMixin):
    """Clip each feature to learned [lower_pct, upper_pct] percentile bounds.

    Winsorizing caps extreme values instead of dropping rows, limiting
    outlier influence while keeping the sample size intact.

    Parameters
    ----------
    lower_pct : float, default=0.01
        Lower percentile (fraction in [0, 1]) used as the clip floor.
    upper_pct : float, default=0.99
        Upper percentile (fraction in [0, 1]) used as the clip ceiling.
    """

    def __init__(self, lower_pct: float = 0.01, upper_pct: float = 0.99):
        # Store constructor args verbatim so get_params/set_params/clone work.
        self.lower_pct = lower_pct
        self.upper_pct = upper_pct

    def fit(self, X, y=None):
        """Learn per-column clip bounds from the TRAINING DATA ONLY."""
        X = np.asarray(X, dtype=float)
        self.lower_: np.ndarray = np.percentile(X, self.lower_pct * 100, axis=0)
        self.upper_: np.ndarray = np.percentile(X, self.upper_pct * 100, axis=0)
        return self  # sklearn contract: fit always returns self

    def transform(self, X, y=None) -> np.ndarray:
        """Return a float copy of X with every column clipped to its bounds."""
        # Cast to float: clipping an integer array against float percentile
        # bounds would otherwise silently truncate. np.clip broadcasts the
        # per-column bound vectors, replacing the original Python loop.
        X = np.asarray(X, dtype=float)
        return np.clip(X, self.lower_, self.upper_)
# CUSTOM TRANSFORMER 2: RatioFeatureTransformer
class RatioFeatureTransformer(BaseEstimator, TransformerMixin):
    """Append ratio columns built from (numerator, denominator) index pairs.

    Stateless: fit learns nothing. transform appends one new column per
    pair, computed as X[:, num] / (X[:, den] + eps), after the originals.
    """

    def __init__(self, pairs: list, eps: float = 1e-6):
        # pairs: list of (numerator_idx, denominator_idx) column-index tuples.
        self.pairs = pairs
        # eps guards the division against zero-valued denominators.
        self.eps = eps

    def fit(self, X, y=None):
        # Nothing to learn; return self per the sklearn contract.
        return self

    def transform(self, X, y=None) -> np.ndarray:
        X = np.array(X)
        new_cols = [
            (X[:, num] / (X[:, den] + self.eps)).reshape(-1, 1)
            for num, den in self.pairs
        ]
        return np.hstack([X] + new_cols)
# CUSTOM TRANSFORMER 3: LogSkewedTransformer
class LogSkewedTransformer(BaseEstimator, TransformerMixin):
    """Apply log1p only to columns whose training-set |skew| exceeds a threshold.

    Parameters
    ----------
    skew_threshold : float, default=1.0
        Absolute skewness above which a column gets log1p-transformed.
    verbose : bool, default=True
        If True, print which columns were selected during fit. Defaults to
        True for backward compatibility; pass False to silence the output
        during cross-validation, where fit runs once per fold.
    """

    def __init__(self, skew_threshold: float = 1.0, verbose: bool = True):
        self.skew_threshold = skew_threshold
        self.verbose = verbose

    def fit(self, X, y=None):
        """Record indices of columns whose |skew| exceeds the threshold."""
        # asarray makes X[:, i] valid for DataFrame input as well as ndarray.
        X = np.asarray(X, dtype=float)
        self.log_cols_ = [
            i for i in range(X.shape[1])
            if abs(pd.Series(X[:, i]).skew()) > self.skew_threshold
        ]
        if self.verbose:
            print(f" LogSkewedTransformer: applying log1p to columns {self.log_cols_}")
        return self

    def transform(self, X, y=None) -> np.ndarray:
        X = np.array(X, copy=True, dtype=float)
        for i in self.log_cols_:
            # Clip at 0 first: log1p is undefined for values below -1.
            X[:, i] = np.log1p(np.clip(X[:, i], 0, None))
        return X
# BUILD PIPELINE WITH CUSTOM TRANSFORMERS
# Feature columns entering the pipeline: income(0), loan_amt(1), credit(2), age(3)
custom_pipeline = Pipeline([
    ("winsorize", WinsorizeTransformer(lower_pct=0.01, upper_pct=0.99)),
    ("log_skewed", LogSkewedTransformer(skew_threshold=0.8)),
    # NOTE: this step runs AFTER log_skewed, so the ratio is computed on the
    # (possibly log1p-transformed) loan_amt and income columns.
    ("ratios", RatioFeatureTransformer(pairs=[(1, 0)])),
    ("scaler", StandardScaler()),
    ("model", GradientBoostingClassifier(n_estimators=100, random_state=42)),
])

custom_pipeline.fit(X_train, y_train)
cv_auc = cross_val_score(
    custom_pipeline, X_train, y_train, cv=5, scoring="roc_auc"
).mean()
print(f"\nCustom pipeline CV AUC: {cv_auc:.4f}")

# SET_PARAMS WORKS ON CUSTOM TRANSFORMERS (sklearn compatibility!)
# Nested "<step>__<param>" access works because the custom transformers
# inherit get_params/set_params from BaseEstimator.
custom_pipeline.set_params(winsorize__lower_pct=0.02, winsorize__upper_pct=0.98)
print("set_params on custom transformer works!")
# FunctionTransformer: simplest custom step (no class needed).
# It wraps a plain function as a stateless transformer; validate=True
# coerces input to a 2-D numeric array before calling the function.
log_transformer = FunctionTransformer(np.log1p, validate=True)
simple_pipe = Pipeline([
    ("log", log_transformer),
    ("sc", StandardScaler()),
    ("m", GradientBoostingClassifier(n_estimators=100, random_state=42)),
])
# Only the two right-skewed monetary columns are fed through this pipeline.
# (Fixed: the original line had stray page text "Tip" fused onto the end of
# the print statement, which made the script a SyntaxError.)
simple_auc = cross_val_score(
    simple_pipe, X_train[["income", "loan_amt"]], y_train, cv=5, scoring="roc_auc"
).mean()
print(f"FunctionTransformer pipeline AUC: {simple_auc:.4f}")
Tip
Practice Custom Transformers Extending Sklearn in small, isolated examples before integrating into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
The same fit/transform contract — learn state in fit, apply it in transform — carries over to any sklearn-compatible feature engineering you build.
Practice Task
Note
Practice Task — (1) Write a working example of Custom Transformers Extending Sklearn from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution with peers or a study community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with Custom Transformers Extending Sklearn is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.