Model Evaluation Metrics in Machine Learning: Comprehensive Guide with Python
AI-Generated Content Notice
Some code examples and technical explanations in this article were generated with AI assistance. The content has been reviewed for accuracy, but please test any code snippets in your development environment before using them.
Introduction
Choosing the right evaluation metric is crucial for assessing machine learning model performance. Different metrics reveal different aspects of model behavior, and using inappropriate metrics can lead to misleading conclusions about model quality.
This guide covers essential evaluation metrics for classification and regression tasks, explaining when to use each metric and how to interpret results correctly.
Classification Metrics
Basic Classification Metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_curve, auc,
precision_recall_curve, roc_auc_score, average_precision_score
)
import pandas as pd
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
class ClassificationMetricsAnalyzer:
"""Comprehensive classification metrics analyzer"""
def __init__(self):
self.results = {}
def calculate_basic_metrics(self, y_true: np.ndarray, y_pred: np.ndarray,
y_proba: np.ndarray = None) -> Dict:
"""Calculate all basic classification metrics"""
metrics = {
'accuracy': accuracy_score(y_true, y_pred),
'precision': precision_score(y_true, y_pred, average='weighted'),
'recall': recall_score(y_true, y_pred, average='weighted'),
'f1_score': f1_score(y_true, y_pred, average='weighted')
}
# Add probability-based metrics if available
if y_proba is not None:
if y_proba.shape[1] == 2: # Binary classification
metrics['roc_auc'] = roc_auc_score(y_true, y_proba[:, 1])
metrics['avg_precision'] = average_precision_score(y_true, y_proba[:, 1])
else: # Multi-class
metrics['roc_auc'] = roc_auc_score(y_true, y_proba, multi_class='ovr')
return metrics
def plot_confusion_matrix(self, y_true: np.ndarray, y_pred: np.ndarray,
class_names: List[str] = None) -> None:
"""Plot confusion matrix with detailed analysis"""
cm = confusion_matrix(y_true, y_pred)
if class_names is None:
class_names = [f'Class {i}' for i in range(len(np.unique(y_true)))]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Raw confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=class_names, yticklabels=class_names, ax=ax1)
ax1.set_title('Confusion Matrix (Counts)', fontweight='bold')
ax1.set_xlabel('Predicted Label')
ax1.set_ylabel('True Label')
# Normalized confusion matrix
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(cm_normalized, annot=True, fmt='.3f', cmap='Blues',
xticklabels=class_names, yticklabels=class_names, ax=ax2)
ax2.set_title('Confusion Matrix (Normalized)', fontweight='bold')
ax2.set_xlabel('Predicted Label')
ax2.set_ylabel('True Label')
plt.tight_layout()
plt.show()
# Print per-class metrics
print("\nPer-Class Analysis:")
print("-" * 50)
for i, class_name in enumerate(class_names):
if i < len(cm):
tp = cm[i, i]
fp = cm[:, i].sum() - tp
fn = cm[i, :].sum() - tp
tn = cm.sum() - tp - fp - fn
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
print(f"{class_name}:")
print(f" Precision: {precision:.3f}")
print(f" Recall (Sensitivity): {recall:.3f}")
print(f" Specificity: {specificity:.3f}")
print()
# Generate imbalanced dataset for demonstration
X, y = make_classification(n_samples=1000, n_features=20, n_classes=3,
n_informative=10, weights=[0.7, 0.2, 0.1],
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
random_state=42, stratify=y)
# Train models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)
# Get predictions
rf_pred = rf_model.predict(X_test)
rf_proba = rf_model.predict_proba(X_test)
lr_pred = lr_model.predict(X_test)
lr_proba = lr_model.predict_proba(X_test)
# Analyze metrics
analyzer = ClassificationMetricsAnalyzer()
print("=== RANDOM FOREST METRICS ===")
rf_metrics = analyzer.calculate_basic_metrics(y_test, rf_pred, rf_proba)
for metric, value in rf_metrics.items():
print(f"{metric.capitalize()}: {value:.4f}")
print("\n=== LOGISTIC REGRESSION METRICS ===")
lr_metrics = analyzer.calculate_basic_metrics(y_test, lr_pred, lr_proba)
for metric, value in lr_metrics.items():
print(f"{metric.capitalize()}: {value:.4f}")
# Plot confusion matrices
analyzer.plot_confusion_matrix(y_test, rf_pred, ['Majority', 'Minority 1', 'Minority 2'])
ROC and Precision-Recall Curves
class ROCPRAnalyzer:
"""ROC and Precision-Recall curve analyzer"""
def __init__(self):
pass
def plot_roc_curves(self, models_data: Dict) -> None:
"""Plot ROC curves for multiple models"""
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
# Binary classification ROC
colors = ['blue', 'red', 'green', 'orange', 'purple']
for idx, (model_name, data) in enumerate(models_data.items()):
y_true = data['y_true']
y_proba = data['y_proba']
if len(np.unique(y_true)) == 2: # Binary classification
fpr, tpr, _ = roc_curve(y_true, y_proba[:, 1])
roc_auc = auc(fpr, tpr)
axes[0].plot(fpr, tpr, color=colors[idx % len(colors)],
lw=2, label=f'{model_name} (AUC = {roc_auc:.3f})')
# Plot diagonal line
axes[0].plot([0, 1], [0, 1], 'k--', lw=2, alpha=0.5)
axes[0].set_xlabel('False Positive Rate', fontsize=12)
axes[0].set_ylabel('True Positive Rate', fontsize=12)
axes[0].set_title('ROC Curves', fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Precision-Recall curves
for idx, (model_name, data) in enumerate(models_data.items()):
y_true = data['y_true']
y_proba = data['y_proba']
if len(np.unique(y_true)) == 2: # Binary classification
precision, recall, _ = precision_recall_curve(y_true, y_proba[:, 1])
avg_precision = average_precision_score(y_true, y_proba[:, 1])
axes[1].plot(recall, precision, color=colors[idx % len(colors)],
lw=2, label=f'{model_name} (AP = {avg_precision:.3f})')
# Baseline (random classifier)
baseline = np.sum(y_true) / len(y_true)
axes[1].axhline(y=baseline, color='k', linestyle='--', alpha=0.5,
label=f'Baseline ({baseline:.3f})')
axes[1].set_xlabel('Recall', fontsize=12)
axes[1].set_ylabel('Precision', fontsize=12)
axes[1].set_title('Precision-Recall Curves', fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
def threshold_analysis(self, y_true: np.ndarray, y_proba: np.ndarray) -> None:
"""Analyze performance across different probability thresholds"""
# Calculate metrics for different thresholds
thresholds = np.linspace(0.1, 0.9, 17)
metrics_data = []
for threshold in thresholds:
y_pred_thresh = (y_proba[:, 1] >= threshold).astype(int)
precision = precision_score(y_true, y_pred_thresh, zero_division=0)
recall = recall_score(y_true, y_pred_thresh, zero_division=0)
f1 = f1_score(y_true, y_pred_thresh, zero_division=0)
metrics_data.append({
'threshold': threshold,
'precision': precision,
'recall': recall,
'f1_score': f1
})
# Convert to DataFrame for easy plotting
df = pd.DataFrame(metrics_data)
# Plot threshold analysis
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Metrics vs threshold
ax1.plot(df['threshold'], df['precision'], 'o-', label='Precision',
linewidth=2, markersize=6)
ax1.plot(df['threshold'], df['recall'], 's-', label='Recall',
linewidth=2, markersize=6)
ax1.plot(df['threshold'], df['f1_score'], '^-', label='F1-Score',
linewidth=2, markersize=6)
# Find optimal F1 threshold
optimal_idx = df['f1_score'].idxmax()
optimal_threshold = df.loc[optimal_idx, 'threshold']
optimal_f1 = df.loc[optimal_idx, 'f1_score']
ax1.axvline(x=optimal_threshold, color='red', linestyle='--', alpha=0.7,
label=f'Optimal F1 ({optimal_threshold:.2f})')
ax1.set_xlabel('Classification Threshold', fontsize=12)
ax1.set_ylabel('Metric Value', fontsize=12)
ax1.set_title('Metrics vs Classification Threshold', fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)
# Precision-Recall tradeoff
ax2.plot(df['recall'], df['precision'], 'o-', linewidth=2, markersize=6)
# Highlight optimal point
optimal_precision = df.loc[optimal_idx, 'precision']
optimal_recall = df.loc[optimal_idx, 'recall']
ax2.scatter(optimal_recall, optimal_precision, color='red', s=100,
zorder=5, label=f'Optimal F1 ({optimal_f1:.3f})')
ax2.set_xlabel('Recall', fontsize=12)
ax2.set_ylabel('Precision', fontsize=12)
ax2.set_title('Precision-Recall Tradeoff', fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Optimal threshold for F1-score: {optimal_threshold:.3f}")
print(f"At optimal threshold:")
print(f" Precision: {optimal_precision:.3f}")
print(f" Recall: {optimal_recall:.3f}")
print(f" F1-Score: {optimal_f1:.3f}")
# Create binary classification for ROC analysis
X_binary, y_binary = make_classification(n_samples=1000, n_features=20, n_classes=2,
weights=[0.8, 0.2], random_state=42)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
X_binary, y_binary, test_size=0.3, random_state=42, stratify=y_binary
)
# Train models for binary classification
rf_bin = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bin.fit(X_train_bin, y_train_bin)
lr_bin = LogisticRegression(random_state=42, max_iter=1000)
lr_bin.fit(X_train_bin, y_train_bin)
# Prepare data for ROC analysis
models_data = {
'Random Forest': {
'y_true': y_test_bin,
'y_proba': rf_bin.predict_proba(X_test_bin)
},
'Logistic Regression': {
'y_true': y_test_bin,
'y_proba': lr_bin.predict_proba(X_test_bin)
}
}
# Analyze ROC and PR curves
roc_analyzer = ROCPRAnalyzer()
roc_analyzer.plot_roc_curves(models_data)
# Threshold analysis
print("\n=== THRESHOLD ANALYSIS ===")
roc_analyzer.threshold_analysis(y_test_bin, rf_bin.predict_proba(X_test_bin))
Regression Metrics
Basic Regression Metrics
from sklearn.datasets import make_regression, load_diabetes
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
mean_absolute_percentage_error, explained_variance_score
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
class RegressionMetricsAnalyzer:
"""Comprehensive regression metrics analyzer"""
def __init__(self):
pass
def calculate_regression_metrics(self, y_true: np.ndarray, y_pred: np.ndarray) -> Dict:
"""Calculate all regression metrics"""
metrics = {
'mse': mean_squared_error(y_true, y_pred),
'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
'mae': mean_absolute_error(y_true, y_pred),
'r2_score': r2_score(y_true, y_pred),
'explained_variance': explained_variance_score(y_true, y_pred)
}
        # MAPE: scikit-learn returns a fraction, so convert to a percentage;
        # fall back to a manual calculation that skips zero targets
        try:
            metrics['mape'] = mean_absolute_percentage_error(y_true, y_pred) * 100
        except Exception:
            mask = y_true != 0
            if np.sum(mask) > 0:
                metrics['mape'] = np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
            else:
                metrics['mape'] = np.inf
return metrics
def plot_regression_analysis(self, y_true: np.ndarray, y_pred: np.ndarray,
model_name: str = "Model") -> None:
"""Plot comprehensive regression analysis"""
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# Plot 1: Predicted vs Actual
axes[0, 0].scatter(y_true, y_pred, alpha=0.6, s=30)
# Perfect prediction line
min_val = min(y_true.min(), y_pred.min())
max_val = max(y_true.max(), y_pred.max())
axes[0, 0].plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, alpha=0.8)
axes[0, 0].set_xlabel('Actual Values', fontsize=12)
axes[0, 0].set_ylabel('Predicted Values', fontsize=12)
axes[0, 0].set_title(f'{model_name}: Predicted vs Actual', fontweight='bold')
axes[0, 0].grid(True, alpha=0.3)
# Add R² annotation
r2 = r2_score(y_true, y_pred)
axes[0, 0].text(0.05, 0.95, f'R² = {r2:.3f}',
transform=axes[0, 0].transAxes, fontsize=12, fontweight='bold',
bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
# Plot 2: Residuals vs Predicted
residuals = y_true - y_pred
axes[0, 1].scatter(y_pred, residuals, alpha=0.6, s=30)
axes[0, 1].axhline(y=0, color='r', linestyle='--', alpha=0.8)
axes[0, 1].set_xlabel('Predicted Values', fontsize=12)
axes[0, 1].set_ylabel('Residuals', fontsize=12)
axes[0, 1].set_title(f'{model_name}: Residual Plot', fontweight='bold')
axes[0, 1].grid(True, alpha=0.3)
# Plot 3: Residuals histogram
axes[1, 0].hist(residuals, bins=30, alpha=0.7, density=True, color='skyblue')
axes[1, 0].axvline(x=0, color='r', linestyle='--', alpha=0.8)
# Overlay normal distribution
mu, sigma = np.mean(residuals), np.std(residuals)
x = np.linspace(residuals.min(), residuals.max(), 100)
axes[1, 0].plot(x, (1/np.sqrt(2*np.pi*sigma**2)) * np.exp(-0.5*((x-mu)/sigma)**2),
'r-', lw=2, alpha=0.8, label='Normal fit')
axes[1, 0].set_xlabel('Residuals', fontsize=12)
axes[1, 0].set_ylabel('Density', fontsize=12)
axes[1, 0].set_title('Residuals Distribution', fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
# Plot 4: Metrics comparison bar chart
metrics = self.calculate_regression_metrics(y_true, y_pred)
# Normalize metrics for visualization (exclude MAPE if infinite)
viz_metrics = {k: v for k, v in metrics.items() if k != 'mape' or not np.isinf(v)}
metric_names = list(viz_metrics.keys())
metric_values = list(viz_metrics.values())
colors = ['blue' if v >= 0 else 'red' for v in metric_values]
bars = axes[1, 1].bar(metric_names, metric_values, alpha=0.7, color=colors)
axes[1, 1].set_ylabel('Metric Value', fontsize=12)
axes[1, 1].set_title('Regression Metrics', fontweight='bold')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(True, alpha=0.3)
# Add value labels on bars
for bar, value in zip(bars, metric_values):
height = bar.get_height()
axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + 0.01 * max(metric_values),
f'{value:.3f}', ha='center', va='bottom', fontweight='bold')
plt.tight_layout()
plt.show()
# Print detailed metrics
print(f"\n{model_name} Regression Metrics:")
print("-" * 40)
for metric, value in metrics.items():
if not np.isinf(value):
print(f"{metric.upper()}: {value:.6f}")
else:
print(f"{metric.upper()}: Infinite (division by zero)")
# Generate regression dataset
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
X_reg, y_reg, test_size=0.3, random_state=42
)
# Train regression models
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_reg, y_train_reg)
lr_reg = LinearRegression()
lr_reg.fit(X_train_reg, y_train_reg)
# Get predictions
rf_pred_reg = rf_reg.predict(X_test_reg)
lr_pred_reg = lr_reg.predict(X_test_reg)
# Analyze regression metrics
reg_analyzer = RegressionMetricsAnalyzer()
print("=== REGRESSION ANALYSIS ===")
reg_analyzer.plot_regression_analysis(y_test_reg, rf_pred_reg, "Random Forest")
reg_analyzer.plot_regression_analysis(y_test_reg, lr_pred_reg, "Linear Regression")
Metrics Comparison and Selection Guide
class MetricsComparisonAnalyzer:
"""Compare and guide metric selection"""
def __init__(self):
self.classification_metrics_guide = {
'Accuracy': {
'when_to_use': 'Balanced datasets',
'pros': 'Simple, intuitive',
'cons': 'Misleading with imbalanced data',
                'formula': '(TP + TN) / (TP + TN + FP + FN)'
},
'Precision': {
'when_to_use': 'When false positives are costly',
'pros': 'Focus on positive prediction quality',
'cons': 'Ignores false negatives',
'formula': 'TP / (TP + FP)'
},
'Recall': {
'when_to_use': 'When false negatives are costly',
'pros': 'Focus on catching all positives',
'cons': 'Ignores false positives',
'formula': 'TP / (TP + FN)'
},
'F1-Score': {
'when_to_use': 'Balance between precision and recall',
'pros': 'Harmonic mean of precision and recall',
'cons': 'May not reflect business priorities',
'formula': '2 * (Precision * Recall) / (Precision + Recall)'
},
'ROC-AUC': {
'when_to_use': 'Binary classification, balanced datasets',
'pros': 'Threshold-independent, probabilistic',
'cons': 'Optimistic with imbalanced data',
'formula': 'Area under ROC curve'
}
}
self.regression_metrics_guide = {
'MSE': {
'when_to_use': 'When large errors are particularly bad',
'pros': 'Penalizes large errors heavily',
'cons': 'Sensitive to outliers, not interpretable',
'formula': 'mean((y_true - y_pred)²)'
},
'RMSE': {
'when_to_use': 'Same scale as target variable needed',
'pros': 'Same units as target, interpretable',
'cons': 'Sensitive to outliers',
'formula': 'sqrt(MSE)'
},
'MAE': {
'when_to_use': 'Robust metric needed, outliers present',
'pros': 'Robust to outliers, interpretable',
'cons': 'Less sensitive to large errors',
'formula': 'mean(|y_true - y_pred|)'
},
'R²': {
'when_to_use': 'Understanding proportion of variance explained',
                'pros': 'Unitless and comparable across targets (1.0 is a perfect fit)',
'cons': 'Can be negative, not always meaningful',
'formula': '1 - (SS_res / SS_tot)'
},
'MAPE': {
'when_to_use': 'Percentage error interpretation needed',
'pros': 'Scale-independent, interpretable',
'cons': 'Undefined when true values are zero',
'formula': 'mean(|y_true - y_pred| / |y_true|) * 100'
}
}
def print_metrics_guide(self, task_type: str = 'classification') -> None:
"""Print comprehensive metrics selection guide"""
if task_type == 'classification':
guide = self.classification_metrics_guide
title = "CLASSIFICATION METRICS SELECTION GUIDE"
else:
guide = self.regression_metrics_guide
title = "REGRESSION METRICS SELECTION GUIDE"
print("\n" + "="*60)
print(title)
print("="*60)
for metric_name, info in guide.items():
print(f"\n{metric_name.upper()}")
print("-" * len(metric_name))
print(f"When to use: {info['when_to_use']}")
print(f"Pros: {info['pros']}")
print(f"Cons: {info['cons']}")
print(f"Formula: {info['formula']}")
def create_metrics_comparison_table(self, models_performance: Dict) -> pd.DataFrame:
"""Create comparison table for multiple models"""
df = pd.DataFrame(models_performance).T
# Round values for better display
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].round(4)
return df
def plot_metrics_comparison(self, models_performance: Dict,
task_type: str = 'classification') -> None:
"""Plot metrics comparison across models"""
df = pd.DataFrame(models_performance).T
# Select metrics to plot based on task type
if task_type == 'classification':
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1_score']
else:
metrics_to_plot = ['r2_score', 'rmse', 'mae']
# Filter available metrics
available_metrics = [m for m in metrics_to_plot if m in df.columns]
if not available_metrics:
print("No metrics available for plotting")
return
fig, ax = plt.subplots(figsize=(12, 6))
df[available_metrics].plot(kind='bar', ax=ax, alpha=0.8)
ax.set_title(f'{task_type.capitalize()} Metrics Comparison', fontweight='bold')
ax.set_ylabel('Metric Value')
ax.set_xlabel('Models')
ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
# Rotate x-axis labels
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Demonstrate metrics comparison
comparison_analyzer = MetricsComparisonAnalyzer()
# Print guides
comparison_analyzer.print_metrics_guide('classification')
comparison_analyzer.print_metrics_guide('regression')
# Compare classification models
classification_performance = {
'Random Forest': rf_metrics,
'Logistic Regression': lr_metrics
}
print("\n=== CLASSIFICATION MODELS COMPARISON ===")
class_comparison_df = comparison_analyzer.create_metrics_comparison_table(classification_performance)
print(class_comparison_df)
comparison_analyzer.plot_metrics_comparison(classification_performance, 'classification')
# Compare regression models
regression_performance = {
'Random Forest': reg_analyzer.calculate_regression_metrics(y_test_reg, rf_pred_reg),
'Linear Regression': reg_analyzer.calculate_regression_metrics(y_test_reg, lr_pred_reg)
}
print("\n=== REGRESSION MODELS COMPARISON ===")
reg_comparison_df = comparison_analyzer.create_metrics_comparison_table(regression_performance)
print(reg_comparison_df)
comparison_analyzer.plot_metrics_comparison(regression_performance, 'regression')
Business-Specific Metrics
class BusinessMetricsAnalyzer:
"""Business-specific metrics and cost-sensitive evaluation"""
def __init__(self):
pass
def calculate_business_metrics(self, y_true: np.ndarray, y_pred: np.ndarray,
cost_matrix: np.ndarray = None) -> Dict:
"""Calculate business-specific metrics"""
cm = confusion_matrix(y_true, y_pred)
if cost_matrix is None:
# Default cost matrix (equal costs)
cost_matrix = np.ones_like(cm) - np.eye(cm.shape[0])
# Calculate total cost
total_cost = np.sum(cm * cost_matrix)
# Calculate cost per prediction
cost_per_prediction = total_cost / len(y_true)
# Calculate savings compared to worst-case scenario
worst_case_cost = len(y_true) * np.max(cost_matrix)
cost_savings = worst_case_cost - total_cost
savings_percentage = (cost_savings / worst_case_cost) * 100
return {
'total_cost': total_cost,
'cost_per_prediction': cost_per_prediction,
'cost_savings': cost_savings,
'savings_percentage': savings_percentage
}
def plot_cost_analysis(self, y_true: np.ndarray, models_predictions: Dict,
cost_matrix: np.ndarray) -> None:
"""Plot cost analysis for different models"""
business_metrics = {}
for model_name, y_pred in models_predictions.items():
metrics = self.calculate_business_metrics(y_true, y_pred, cost_matrix)
business_metrics[model_name] = metrics
# Create comparison plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
models = list(business_metrics.keys())
# Plot 1: Total cost comparison
total_costs = [business_metrics[model]['total_cost'] for model in models]
bars1 = axes[0, 0].bar(models, total_costs, alpha=0.7, color='red')
axes[0, 0].set_ylabel('Total Cost', fontsize=12)
axes[0, 0].set_title('Total Business Cost', fontweight='bold')
axes[0, 0].grid(True, alpha=0.3)
# Add value labels
for bar, cost in zip(bars1, total_costs):
axes[0, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(total_costs)*0.01,
f'{cost:.0f}', ha='center', va='bottom', fontweight='bold')
# Plot 2: Cost per prediction
cost_per_pred = [business_metrics[model]['cost_per_prediction'] for model in models]
bars2 = axes[0, 1].bar(models, cost_per_pred, alpha=0.7, color='orange')
axes[0, 1].set_ylabel('Cost per Prediction', fontsize=12)
axes[0, 1].set_title('Average Cost per Prediction', fontweight='bold')
axes[0, 1].grid(True, alpha=0.3)
# Add value labels
for bar, cost in zip(bars2, cost_per_pred):
axes[0, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(cost_per_pred)*0.01,
f'{cost:.2f}', ha='center', va='bottom', fontweight='bold')
# Plot 3: Savings percentage
savings_pct = [business_metrics[model]['savings_percentage'] for model in models]
bars3 = axes[1, 0].bar(models, savings_pct, alpha=0.7, color='green')
axes[1, 0].set_ylabel('Savings Percentage (%)', fontsize=12)
axes[1, 0].set_title('Cost Savings vs Worst Case', fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)
# Add value labels
for bar, savings in zip(bars3, savings_pct):
axes[1, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(savings_pct)*0.01,
f'{savings:.1f}%', ha='center', va='bottom', fontweight='bold')
# Plot 4: Cost matrix heatmap
sns.heatmap(cost_matrix, annot=True, fmt='.0f', cmap='Reds', ax=axes[1, 1])
axes[1, 1].set_title('Cost Matrix', fontweight='bold')
axes[1, 1].set_xlabel('Predicted Class')
axes[1, 1].set_ylabel('True Class')
plt.tight_layout()
plt.show()
# Print business metrics summary
print("\n" + "="*60)
print("BUSINESS METRICS SUMMARY")
print("="*60)
print(f"{'Model':<20} {'Total Cost':<12} {'Cost/Pred':<12} {'Savings %':<12}")
print("-"*60)
for model in models:
metrics = business_metrics[model]
print(f"{model:<20} {metrics['total_cost']:<12.0f} "
f"{metrics['cost_per_prediction']:<12.2f} {metrics['savings_percentage']:<12.1f}")
# Example: Medical diagnosis cost matrix
# Rows: True class, Columns: Predicted class
# [Healthy, Sick]
medical_cost_matrix = np.array([
[0, 100], # Healthy misclassified as Sick (unnecessary treatment)
[1000, 0] # Sick misclassified as Healthy (missed diagnosis)
])
print("\n=== BUSINESS METRICS ANALYSIS ===")
print("Medical Diagnosis Example:")
print("Cost Matrix:")
print("- False Positive (Healthy→Sick): $100 (unnecessary treatment)")
print("- False Negative (Sick→Healthy): $1000 (missed diagnosis)")
business_analyzer = BusinessMetricsAnalyzer()
# Use binary classification predictions
models_predictions = {
'Random Forest': rf_bin.predict(X_test_bin),
'Logistic Regression': lr_bin.predict(X_test_bin)
}
business_analyzer.plot_cost_analysis(y_test_bin, models_predictions, medical_cost_matrix)
Best Practices and Guidelines
Metric Selection Framework
| Task | Primary Metric | Secondary Metrics | When to Use |
|---|---|---|---|
| Balanced Classification | Accuracy | Precision, Recall, F1 | Equal class importance |
| Imbalanced Classification | F1-Score, ROC-AUC | Precision-Recall AUC | Minority class important |
| Medical Diagnosis | Recall | Precision, F1 | False negatives costly |
| Spam Detection | Precision | Recall, F1 | False positives costly |
| Regression (General) | RMSE | MAE, R² | Roughly normal errors, few outliers |
| Regression (Outliers) | MAE | RMSE, R² | Robust metric needed |
| Business Applications | Custom Cost | Precision, Recall | Domain-specific costs |
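As a quick sanity check on the table above, the short sketch below uses made-up numbers (independent of the experiments in this article) to show why accuracy should not be the primary metric on imbalanced data: a classifier that always predicts the majority class reaches 95% accuracy while completely missing the minority class.

import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

# Hypothetical 95/5 imbalanced labels and a baseline that always predicts the majority class
y_true = np.array([0] * 95 + [1] * 5)
y_pred = np.zeros_like(y_true)

print(f"Accuracy:          {accuracy_score(y_true, y_pred):.2f}")             # 0.95 - looks excellent
print(f"F1 (minority cls): {f1_score(y_true, y_pred, zero_division=0):.2f}")  # 0.00 - exposes the failure
print(f"Balanced accuracy: {balanced_accuracy_score(y_true, y_pred):.2f}")    # 0.50 - chance level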
Key Recommendations
- Always consider the problem context - metrics should align with business objectives
- Use multiple metrics - single metrics can be misleading
- Validate on hold-out set - avoid overfitting to validation metrics
- Consider class imbalance - accuracy can be misleading
- Plot curves when possible - ROC and PR curves provide rich information
- Define custom metrics for business-specific requirements (a make_scorer sketch follows this list)
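To make that last recommendation concrete, here is a minimal sketch of wrapping a domain-specific cost function as a scikit-learn scorer with make_scorer so it can drive cross-validation or hyperparameter search. The 10:1 false-negative-to-false-positive cost ratio and the synthetic dataset are assumptions chosen purely for illustration, not values from the examples above.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score

def misclassification_cost(y_true, y_pred):
    """Total cost, assuming a false negative is 10x as expensive as a false positive."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return 1.0 * fp + 10.0 * fn  # hypothetical per-error costs

# Costs should be minimized, so scikit-learn negates the score internally
cost_scorer = make_scorer(misclassification_cost, greater_is_better=False)

X, y = make_classification(n_samples=500, weights=[0.8, 0.2], random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, X, y, scoring=cost_scorer, cv=5)
print(f"Mean cross-validated cost: {-scores.mean():.1f}")

Because greater_is_better=False, cross_val_score reports negated costs, hence the sign flip when printing.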
Conclusion
Choosing appropriate evaluation metrics is critical for model assessment. Key takeaways:
- Context matters - select metrics aligned with business objectives
- Multiple metrics provide comprehensive model understanding
- Imbalanced data requires careful metric selection (avoid accuracy)
- Threshold analysis helps optimize business outcomes
- Cost-sensitive evaluation incorporates real-world consequences
- Visual analysis (confusion matrices, curves) provides insights beyond numbers
Always validate your metric choice against the specific requirements and constraints of your problem domain.
Connect with me on LinkedIn or X to discuss model evaluation strategies!