Custom Transformers — Extending Sklearn
Sklearn's built-in transformers cover most cases, but every real project needs domain-specific transformations: clip outliers at specific business-defined thresholds, compute ratio features from two others, apply log only when a column is sufficiently skewed. By subclassing BaseEstimator and TransformerMixin you get full pipeline compatibility — the custom transformer sits in any Pipeline or ColumnTransformer.
Building Custom Sklearn-Compatible Transformers
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
# Reproducibility: fix the global NumPy RNG seed. NOTE: the statement order
# below matters — every np.random call consumes draws from this one stream,
# so reordering columns would change the generated data.
np.random.seed(42)
N = 1500
# Synthetic credit-risk dataset: right-skewed monetary columns (exponential),
# roughly normal credit score and age, and an imbalanced (82/18) binary target.
# .clip(...) bounds each column to a plausible business range.
df = pd.DataFrame({
"income": np.random.exponential(55000, N).clip(15000, 300000),
"loan_amt": np.random.exponential(18000, N).clip(1000, 80000),
"credit": np.random.normal(680, 80, N).clip(300, 850),
"age": np.random.normal(38, 12, N).clip(18, 75),
"default": np.random.choice([0,1], N, p=[0.82,0.18]),
})
# Features / target split; hold out 20% for a final test set.
X = df.drop("default", axis=1)
y = df["default"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# CUSTOM TRANSFORMER 1: WinsorizeTransformer
class WinsorizeTransformer(BaseEstimator, TransformerMixin):
    """Clip each feature to learned [lower_pct, upper_pct] percentile bounds.

    Winsorizing caps extreme values instead of dropping rows, limiting
    outlier influence while keeping the sample size intact.

    Parameters
    ----------
    lower_pct : float, default=0.01
        Lower percentile (fraction in [0, 1]) used as the clip floor.
    upper_pct : float, default=0.99
        Upper percentile (fraction in [0, 1]) used as the clip ceiling.
    """

    def __init__(self, lower_pct: float = 0.01, upper_pct: float = 0.99):
        # Store constructor args verbatim so get_params/set_params/clone work.
        self.lower_pct = lower_pct
        self.upper_pct = upper_pct

    def fit(self, X, y=None):
        """Learn per-column clip bounds from the TRAINING DATA ONLY."""
        X = np.asarray(X, dtype=float)
        self.lower_: np.ndarray = np.percentile(X, self.lower_pct * 100, axis=0)
        self.upper_: np.ndarray = np.percentile(X, self.upper_pct * 100, axis=0)
        return self  # sklearn contract: fit always returns self

    def transform(self, X, y=None) -> np.ndarray:
        """Return a float copy of X with every column clipped to its bounds."""
        # Cast to float: clipping an integer array against float percentile
        # bounds would otherwise silently truncate. np.clip broadcasts the
        # per-column bound vectors, replacing the original Python loop.
        X = np.asarray(X, dtype=float)
        return np.clip(X, self.lower_, self.upper_)
# CUSTOM TRANSFORMER 2: RatioFeatureTransformer
class RatioFeatureTransformer(BaseEstimator, TransformerMixin):
    """Append ratio columns built from (numerator, denominator) index pairs.

    Stateless: fit learns nothing. transform appends one new column per
    pair, computed as X[:, num] / (X[:, den] + eps), after the originals.
    """

    def __init__(self, pairs: list, eps: float = 1e-6):
        # pairs: list of (numerator_idx, denominator_idx) column-index tuples.
        self.pairs = pairs
        # eps guards the division against zero-valued denominators.
        self.eps = eps

    def fit(self, X, y=None):
        # Nothing to learn; return self per the sklearn contract.
        return self

    def transform(self, X, y=None) -> np.ndarray:
        X = np.array(X)
        new_cols = [
            (X[:, num] / (X[:, den] + self.eps)).reshape(-1, 1)
            for num, den in self.pairs
        ]
        return np.hstack([X] + new_cols)
# CUSTOM TRANSFORMER 3: LogSkewedTransformer
class LogSkewedTransformer(BaseEstimator, TransformerMixin):
    """Apply log1p only to columns whose training-set |skew| exceeds a threshold.

    Parameters
    ----------
    skew_threshold : float, default=1.0
        Absolute skewness above which a column gets log1p-transformed.
    verbose : bool, default=True
        If True, print which columns were selected during fit. Defaults to
        True for backward compatibility; pass False to silence the output
        during cross-validation, where fit runs once per fold.
    """

    def __init__(self, skew_threshold: float = 1.0, verbose: bool = True):
        self.skew_threshold = skew_threshold
        self.verbose = verbose

    def fit(self, X, y=None):
        """Record indices of columns whose |skew| exceeds the threshold."""
        # asarray makes X[:, i] valid for DataFrame input as well as ndarray.
        X = np.asarray(X, dtype=float)
        self.log_cols_ = [
            i for i in range(X.shape[1])
            if abs(pd.Series(X[:, i]).skew()) > self.skew_threshold
        ]
        if self.verbose:
            print(f" LogSkewedTransformer: applying log1p to columns {self.log_cols_}")
        return self

    def transform(self, X, y=None) -> np.ndarray:
        X = np.array(X, copy=True, dtype=float)
        for i in self.log_cols_:
            # Clip at 0 first: log1p is undefined for values below -1.
            X[:, i] = np.log1p(np.clip(X[:, i], 0, None))
        return X
# BUILD PIPELINE WITH CUSTOM TRANSFORMERS
# Feature columns entering the pipeline: income(0), loan_amt(1), credit(2), age(3)
custom_pipeline = Pipeline([
    ("winsorize", WinsorizeTransformer(lower_pct=0.01, upper_pct=0.99)),
    ("log_skewed", LogSkewedTransformer(skew_threshold=0.8)),
    # NOTE: this step runs AFTER log_skewed, so the ratio is computed on the
    # (possibly log1p-transformed) loan_amt and income columns.
    ("ratios", RatioFeatureTransformer(pairs=[(1, 0)])),
    ("scaler", StandardScaler()),
    ("model", GradientBoostingClassifier(n_estimators=100, random_state=42)),
])

custom_pipeline.fit(X_train, y_train)
cv_auc = cross_val_score(
    custom_pipeline, X_train, y_train, cv=5, scoring="roc_auc"
).mean()
print(f"\nCustom pipeline CV AUC: {cv_auc:.4f}")

# SET_PARAMS WORKS ON CUSTOM TRANSFORMERS (sklearn compatibility!)
# Nested "<step>__<param>" access works because the custom transformers
# inherit get_params/set_params from BaseEstimator.
custom_pipeline.set_params(winsorize__lower_pct=0.02, winsorize__upper_pct=0.98)
print("set_params on custom transformer works!")
# FunctionTransformer: simplest custom step (no class needed).
# It wraps a plain function as a stateless transformer; validate=True
# coerces input to a 2-D numeric array before calling the function.
log_transformer = FunctionTransformer(np.log1p, validate=True)
simple_pipe = Pipeline([
    ("log", log_transformer),
    ("sc", StandardScaler()),
    ("m", GradientBoostingClassifier(n_estimators=100, random_state=42)),
])
# Only the two right-skewed monetary columns are fed through this pipeline.
# (Fixed: the original line had stray page text "Tip" fused onto the end of
# the print statement, which made the script a SyntaxError.)
simple_auc = cross_val_score(
    simple_pipe, X_train[["income", "loan_amt"]], y_train, cv=5, scoring="roc_auc"
).mean()
print(f"FunctionTransformer pipeline AUC: {simple_auc:.4f}")
Tip
Practice Custom Transformers Extending Sklearn in small, isolated examples before integrating into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
The same fit/transform contract — learn state in fit, apply it in transform — carries over to any sklearn-compatible feature engineering you build.
Practice Task
Note
Practice Task — (1) Write a working example of Custom Transformers Extending Sklearn from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution with peers or a study community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with Custom Transformers Extending Sklearn is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.