Machine Learning with scikit-learn: A Practical Guide

Scikit-learn is a powerful library for machine learning in Python. In this guide, we'll explore how to build, evaluate, and persist machine learning models effectively.

Getting Started with scikit-learn

First, let's set up our environment and import the necessary libraries:

import numpy as np
import pandas as pd
from sklearn import datasets, metrics, model_selection
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

Data Preprocessing

Loading and Preparing Data

# Load sample dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
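
For classification problems, it is often worth passing stratify=y so that class proportions are preserved in both splits; a minimal variant of the call above:

# Stratified split keeps the class balance identical in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)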

Feature Scaling

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
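
Note that the scaler is fit only on the training data and merely applied to the test data, which keeps test-set statistics from leaking into training. As a quick sanity check, the scaled training features should have roughly zero mean and unit variance:

# Sanity check: each scaled training feature has mean ~0 and std ~1
print(X_train_scaled.mean(axis=0).round(2))
print(X_train_scaled.std(axis=0).round(2))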

Classification Models

Logistic Regression

from sklearn.linear_model import LogisticRegression

# Create and train model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = lr_model.predict(X_test_scaled)

# Evaluate model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(metrics.classification_report(y_test, y_pred))
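
Accuracy alone can be misleading when classes are imbalanced, so it is common to report ROC AUC as well; a short sketch using the fitted model above:

# ROC AUC from the predicted probability of the positive class
from sklearn.metrics import roc_auc_score
y_prob = lr_model.predict_proba(X_test_scaled)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, y_prob))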

Random Forest

from sklearn.ensemble import RandomForestClassifier

# Create and train model
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
rf_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_scaled)

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 Important Features:")
print(feature_importance.head(10))

Support Vector Machine

from sklearn.svm import SVC

# Create and train model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = svm_model.predict(X_test_scaled)

# Evaluate model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Regression Models

Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing

# Load dataset (load_boston was removed in scikit-learn 1.2;
# the California housing dataset is the usual replacement)
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and train model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_pred = lr_model.predict(X_test)

# Evaluate model
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))

Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

# Create and train model
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model
print("R2 Score:", metrics.r2_score(y_test, y_pred))

Model Selection and Evaluation

Cross-Validation

from sklearn.model_selection import cross_val_score

# Note: this reuses the scaled breast cancer training split from the
# classification sections; rf_model currently holds a regressor, so we
# cross-validate a fresh classifier with accuracy scoring
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
cv_scores = cross_val_score(
    clf, X_train_scaled, y_train,
    cv=5, scoring='accuracy'
)

print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Standard deviation:", cv_scores.std())

Grid Search

from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Project: Customer Churn Prediction

Let's build a complete machine learning pipeline for predicting customer churn:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

class ChurnPredictor:
    def __init__(self):
        """Initialize the ChurnPredictor."""
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.model = None
        self.feature_importance = None
    
    def preprocess_data(self, df):
        """Preprocess the data for training or prediction."""
        # Create copy of dataframe
        df_processed = df.copy()
        
        # Handle missing values
        numeric_columns = df_processed.select_dtypes(include=['int64', 'float64']).columns
        categorical_columns = df_processed.select_dtypes(include=['object']).columns
        
        df_processed[numeric_columns] = df_processed[numeric_columns].fillna(
            df_processed[numeric_columns].mean()
        )
        df_processed[categorical_columns] = df_processed[categorical_columns].fillna(
            df_processed[categorical_columns].mode().iloc[0]
        )
        
        # Encode categorical variables
        for column in categorical_columns:
            if column not in self.label_encoders:
                self.label_encoders[column] = LabelEncoder()
                df_processed[column] = self.label_encoders[column].fit_transform(
                    df_processed[column]
                )
            else:
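                # Reuse the encoder fit during training; unseen
                # categories will raise a ValueError here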
                df_processed[column] = self.label_encoders[column].transform(
                    df_processed[column]
                )
        
        return df_processed
    
    def train(self, X_train, y_train):
        """Train the model."""
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        
        # Train model
        self.model = RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            random_state=42
        )
        self.model.fit(X_train_scaled, y_train)
        
        # Calculate feature importance
        self.feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
    
    def predict(self, X):
        """Make predictions on new data."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)
    
    def predict_proba(self, X):
        """Get probability predictions."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)
    
    def evaluate(self, X_test, y_test):
        """Evaluate the model."""
        X_test_scaled = self.scaler.transform(X_test)
        y_pred = self.model.predict(X_test_scaled)
        
        # Calculate metrics
        report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        
        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            conf_matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=['Not Churned', 'Churned'],
            yticklabels=['Not Churned', 'Churned']
        )
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()
        
        # Plot feature importance
        plt.figure(figsize=(10, 6))
        sns.barplot(
            x='importance',
            y='feature',
            data=self.feature_importance.head(10)
        )
        plt.title('Top 10 Important Features')
        plt.show()
        
        return report
    
    def save_model(self, filename):
        """Save the model to a file."""
        import joblib
        model_data = {
            'model': self.model,
            'scaler': self.scaler,
            'label_encoders': self.label_encoders,
            'feature_importance': self.feature_importance
        }
        joblib.dump(model_data, filename)
    
    @classmethod
    def load_model(cls, filename):
        """Load a saved model."""
        import joblib
        predictor = cls()
        model_data = joblib.load(filename)
        predictor.model = model_data['model']
        predictor.scaler = model_data['scaler']
        predictor.label_encoders = model_data['label_encoders']
        predictor.feature_importance = model_data['feature_importance']
        return predictor

# Example usage
if __name__ == "__main__":
    # Create sample data
    np.random.seed(42)
    n_samples = 1000
    
    data = {
        'age': np.random.normal(45, 15, n_samples),
        'tenure': np.random.randint(0, 10, n_samples),
        'monthly_charges': np.random.normal(70, 30, n_samples),
        'total_charges': np.random.normal(1000, 500, n_samples),
        'gender': np.random.choice(['Male', 'Female'], n_samples),
        'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples),
        'contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
        'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer'], n_samples),
        'churn': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
    }
    
    df = pd.DataFrame(data)
    
    # Split features and target
    X = df.drop('churn', axis=1)
    y = df['churn']
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Create and train predictor
    predictor = ChurnPredictor()
    
    # Preprocess data
    X_train_processed = predictor.preprocess_data(X_train)
    X_test_processed = predictor.preprocess_data(X_test)
    
    # Train model
    predictor.train(X_train_processed, y_train)
    
    # Evaluate model
    print("\nModel Evaluation:")
    print("================")
    report = predictor.evaluate(X_test_processed, y_test)
    print("\nClassification Report:")
    print(report)
    
    # Save model
    predictor.save_model('churn_predictor.joblib')
    
    # Load model and make predictions
    loaded_predictor = ChurnPredictor.load_model('churn_predictor.joblib')
    
    # Make predictions for new customers
    new_customers = pd.DataFrame({
        'age': [35, 45, 55],
        'tenure': [2, 5, 8],
        'monthly_charges': [50, 70, 90],
        'total_charges': [800, 1200, 1600],
        'gender': ['Male', 'Female', 'Male'],
        'internet_service': ['DSL', 'Fiber optic', 'DSL'],
        'contract': ['Month-to-month', 'One year', 'Two year'],
        'payment_method': ['Electronic check', 'Bank transfer', 'Mailed check']
    })
    
    new_customers_processed = loaded_predictor.preprocess_data(new_customers)
    predictions = loaded_predictor.predict(new_customers_processed)
    probabilities = loaded_predictor.predict_proba(new_customers_processed)
    
    print("\nPredictions for New Customers:")
    print("=============================")
    for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
        print(f"Customer {i+1}:")
        print(f"Prediction: {'Churn' if pred == 1 else 'Not Churn'}")
        print(f"Probability of Churn: {prob[1]:.2f}")
        print()

Best Practices

  1. Data Preprocessing
def preprocess_data(df):
    # Fill missing numeric values with column means
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    
    # Standardize numeric features
    df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])
    
    # One-hot encode categorical variables
    df = pd.get_dummies(df)
    
    return df
  2. Model Evaluation
def evaluate_model(model, X_test, y_test):
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
  3. Model Persistence
import joblib

def save_model(model, filename):
    joblib.dump(model, filename)

def load_model(filename):
    return joblib.load(filename)
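
Quick usage, with an illustrative filename:

save_model(rf_model, 'rf_model.joblib')
loaded_model = load_model('rf_model.joblib')

Note that a model which depends on a fitted scaler or encoders should be saved together with them, as the ChurnPredictor above does.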

Common Patterns

  1. Pipeline Creation
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

pipeline.fit(X_train, y_train)
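
A key benefit is that the scaler is re-fit inside each fold during cross-validation, so no test-fold statistics leak into training:

# Cross-validate the whole pipeline; scaling happens inside each fold
scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print("Mean CV accuracy:", scores.mean())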
  2. Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
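
To see which columns survived, get_support returns a boolean mask over the input features:

# Map the selector's mask back to column names
selected_features = X.columns[selector.get_support()]
print("Selected features:", list(selected_features))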

Conclusion

Scikit-learn provides powerful tools for machine learning:

  • Easy-to-use API
  • Comprehensive algorithms
  • Efficient implementation
  • Excellent documentation

Keep exploring scikit-learn's capabilities to build better machine learning models.
