Best Practices

Model Evaluation

Evaluate ML models properly: use cross-validation and hyperparameter tuning, and avoid data leakage.

Why Evaluation Matters

A model that performs well on training data but poorly on new data is overfitting. Proper evaluation helps you:

  • Honestly assess model performance
  • Compare different models fairly
  • Tune hyperparameters without overfitting

Cross-Validation

Instead of relying on a single train/test split, k-fold cross-validation splits the data into k folds and trains k models, each time holding out a different fold as the test set. Averaging the k scores gives a more reliable performance estimate than any one split.
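The k-fold loop described above can be sketched by hand (model and dataset choices here are illustrative):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
fold_scores = []
for train_idx, test_idx in kf.split(X):
    model = LogisticRegression(max_iter=1000)
    model.fit(X[train_idx], y[train_idx])        # train on the other k-1 folds
    fold_scores.append(model.score(X[test_idx], y[test_idx]))  # score on the held-out fold

print(f"{np.mean(fold_scores):.3f} +/- {np.std(fold_scores):.3f}")
```

This is exactly what `cross_val_score` does for you, as shown in the full example later in this section.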

Hyperparameter Tuning

  • Grid Search: exhaustively try every combination (thorough but slow; cost grows multiplicatively with each added parameter)
  • Random Search: sample a fixed number of random combinations (often nearly as good, much cheaper)
  • Bayesian Optimization: use results from earlier trials to pick promising candidates next (most sample-efficient, at the cost of extra bookkeeping per trial)
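To see why grid search gets expensive, you can count its candidates with `ParameterGrid`; the grid below is illustrative, and each candidate is additionally refit once per CV fold:

```python
from sklearn.model_selection import ParameterGrid

param_grid = {
    'n_estimators': [50, 100, 200],       # 3 values
    'max_depth': [None, 10, 20],          # x 3 values
    'min_samples_split': [2, 5],          # x 2 values
}

n_grid = len(ParameterGrid(param_grid))   # 3 * 3 * 2 = 18 candidates
print(n_grid)                             # with cv=5 that means 18 * 5 = 90 model fits
```

Random search, by contrast, caps the number of candidates at whatever `n_iter` you choose, no matter how many parameters you add.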

Data Leakage

A common mistake: using information from the test set during training. Always fit preprocessors (scalers, encoders) on training data only, then apply to test data.
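A minimal sketch of the mistake and the fix, using a scaler (dataset and variable names are illustrative):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# WRONG: fitting on ALL data lets test-set statistics leak into preprocessing
leaky_scaler = StandardScaler().fit(X)
X_test_leaky = leaky_scaler.transform(X_test)

# RIGHT: fit on training data only, then apply the fitted transform to test data
scaler = StandardScaler().fit(X_train)
X_test_clean = scaler.transform(X_test)

# The fitted means/scales differ, so the transformed test sets differ too
print(np.allclose(X_test_leaky, X_test_clean))
```

Wrapping the scaler and model in a `Pipeline`, as the example below does, applies this fit-on-train-only discipline automatically inside every CV fold.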

Example

python
from sklearn.model_selection import (
    cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import numpy as np

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    RandomForestClassifier(n_estimators=100),
    X, y,
    cv=kf,
    scoring='accuracy'
)
print(f"CV Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")

# Pipeline - prevents data leakage!
pipe = Pipeline([
    ('scaler', StandardScaler()),      # fitted on train fold only
    ('classifier', RandomForestClassifier()),
])

# Grid Search with cross-validation
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
}

grid_search = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,  # use all CPU cores
    verbose=1
)
grid_search.fit(X, y)

print(f"Best params: {grid_search.best_params_}")
print(f"Best F1: {grid_search.best_score_:.3f}")

# Randomized Search (faster)
param_dist = {
    'classifier__n_estimators': np.arange(50, 500, 50),
    'classifier__max_depth': [None, 5, 10, 20, 30],
    'classifier__min_samples_split': np.arange(2, 20),
}

random_search = RandomizedSearchCV(pipe, param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X, y)
print(f"Best random params: {random_search.best_params_}")