Machine Learning with scikit-learn: A Practical Guide

AI-Generated Content Notice

Some code examples and technical explanations in this article were generated with AI assistance. The content has been reviewed for accuracy, but please test any code snippets in your development environment before using them.


Machine Learning with scikit-learn: A Practical Guide

Scikit-learn is a powerful library for machine learning in Python. In this guide, we'll explore how to build and deploy machine learning models effectively.

Getting Started with scikit-learn

First, let's set up our environment and import the necessary libraries:

import numpy as np
import pandas as pd
from sklearn import datasets, metrics, model_selection
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

Data Preprocessing

Loading and Preparing Data

# Bring in the dataset loader and the splitter used by this section
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Wrap the bundled breast-cancer arrays in pandas containers so the feature
# names travel with the data
data = load_breast_cancer()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data=data["target"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Feature Scaling

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Classification Models

Logistic Regression

from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the standardized training data
lr_model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predict labels for the held-out rows
y_pred = lr_model.predict(X_test_scaled)

# Report overall accuracy, then the per-class classification report
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(metrics.classification_report(y_test, y_pred))

Random Forest

from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree forest capped at depth 10 and fit it in one step
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train_scaled, y_train)

# Predict labels for the held-out rows
y_pred = rf_model.predict(X_test_scaled)

# Pair every feature name with the forest's importance score and rank the
# table from most to least important
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf_model.feature_importances_}
)
feature_importance = feature_importance.sort_values(
    by='importance', ascending=False
)

print("Top 10 Important Features:")
print(feature_importance.head(10))

Support Vector Machine

from sklearn.svm import SVC

# Fit an RBF-kernel support vector classifier on the standardized features
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train_scaled, y_train)

# Predict labels for the held-out rows
y_pred = svm_model.predict(X_test_scaled)

# Report the share of test rows classified correctly
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Regression Models

Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes

# Load dataset
# NOTE: load_boston was removed in scikit-learn 1.2, so importing it fails on
# current releases. The diabetes dataset ships with scikit-learn and is a
# drop-in regression example (numeric features, continuous target).
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.Series(diabetes.target)

# Split data
# From here on X_train / X_test / y_train / y_test refer to the regression
# problem, not the breast-cancer classification data used earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and train model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_pred = lr_model.predict(X_test)

# Evaluate model
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))

Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree regression forest capped at depth 10 and fit it on the
# (unscaled) regression training split
rf_model = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train, y_train)

# Predict targets for the held-out rows
y_pred = rf_model.predict(X_test)

# Report the coefficient of determination on the test split
print("R2 Score:", metrics.r2_score(y_test, y_pred))

Model Selection and Evaluation

Cross-Validation

from sklearn.model_selection import cross_val_score

# The regression examples above rebound X_train / y_train to regression data
# and rf_model to a regressor, while X_train_scaled still held the
# breast-cancer features. Scoring that combination with 'accuracy' fails
# (mismatched sample counts, continuous targets), so rebuild a consistent
# classification setup before cross-validating.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Accuracy is a classification metric, so cross-validate a classifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

# Perform cross-validation
cv_scores = cross_val_score(
    rf_model, X_train_scaled, y_train,
    cv=5, scoring='accuracy'
)

print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Standard deviation:", cv_scores.std())

Grid Search

from sklearn.model_selection import GridSearchCV

# The regression examples above rebound X_train / y_train to regression data
# while X_train_scaled still held the breast-cancer features. Fitting a
# classifier on that pair fails, so rebuild the classification split and its
# scaled features before searching.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define parameter grid (3 x 3 x 3 = 27 candidate settings)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search: every candidate is scored with 5-fold cross-validated
# accuracy, using all available CPU cores (n_jobs=-1)
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Project: Customer Churn Prediction

Let's build a complete machine learning pipeline for predicting customer churn:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

class ChurnPredictor:
    """Churn classifier that carries its own preprocessing state.

    Bundles a ``StandardScaler``, one ``LabelEncoder`` per categorical
    column, the per-column imputation values and a
    ``RandomForestClassifier`` so that training, evaluation, persistence
    and later prediction all apply the same transformations.
    """

    def __init__(self):
        """Initialize the ChurnPredictor with empty, unfitted state."""
        self.scaler = StandardScaler()
        self.label_encoders = {}
        # Column name -> value used to fill missing entries in that column
        self.fill_values = {}
        self.model = None
        self.feature_importance = None

    def preprocess_data(self, df):
        """Impute missing values and label-encode categorical columns.

        The first time a column is seen, its fill value (mean for numeric
        columns, most frequent value for ``object`` columns) and its label
        encoder are learned and stored on the instance. Every later call
        reuses that stored state, so test data and new customers are
        transformed with the statistics of the data seen first (normally
        the training split) instead of their own. Call this on the training
        frame before any other frame.

        Args:
            df: DataFrame of raw features. It is not modified.

        Returns:
            A processed copy of ``df``.

        Raises:
            ValueError: Raised by ``LabelEncoder.transform`` when a
                categorical column contains a value that was not present
                when its encoder was fitted.
        """
        df_processed = df.copy()

        # 'number' covers every numeric dtype (int32, float32, ...), not
        # just int64/float64
        numeric_columns = df_processed.select_dtypes(include='number').columns
        categorical_columns = df_processed.select_dtypes(include=['object']).columns

        for column in numeric_columns:
            if column not in self.fill_values:
                self.fill_values[column] = df_processed[column].mean()
            # Leave complete columns untouched so their dtype is preserved
            if df_processed[column].isna().any():
                df_processed[column] = df_processed[column].fillna(
                    self.fill_values[column]
                )

        for column in categorical_columns:
            if column not in self.fill_values:
                modes = df_processed[column].mode()
                # mode() is empty when every value in the column is missing;
                # in that case there is nothing sensible to learn yet
                if not modes.empty:
                    self.fill_values[column] = modes.iloc[0]
            if column in self.fill_values:
                df_processed[column] = df_processed[column].fillna(
                    self.fill_values[column]
                )

            # Fit the encoder on first sight, reuse it afterwards
            if column not in self.label_encoders:
                self.label_encoders[column] = LabelEncoder()
                df_processed[column] = self.label_encoders[column].fit_transform(
                    df_processed[column]
                )
            else:
                df_processed[column] = self.label_encoders[column].transform(
                    df_processed[column]
                )

        return df_processed

    def train(self, X_train, y_train):
        """Fit the scaler and the random forest on the training data.

        Args:
            X_train: Preprocessed feature DataFrame (its ``columns`` are
                used to label the feature-importance table).
            y_train: Target labels aligned with ``X_train``.
        """
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)

        # Train model
        self.model = RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            random_state=42
        )
        self.model.fit(X_train_scaled, y_train)

        # Rank features by importance, most important first
        self.feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

    def predict(self, X):
        """Return predicted class labels for preprocessed features ``X``."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """Return per-class probabilities for preprocessed features ``X``."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def evaluate(self, X_test, y_test):
        """Evaluate the model and plot diagnostics.

        Shows a confusion-matrix heatmap and a bar chart of the ten most
        important features.

        Args:
            X_test: Preprocessed feature DataFrame.
            y_test: True labels aligned with ``X_test``.

        Returns:
            The text classification report for the test data.
        """
        X_test_scaled = self.scaler.transform(X_test)
        y_pred = self.model.predict(X_test_scaled)

        # Calculate metrics
        report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            conf_matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=['Not Churned', 'Churned'],
            yticklabels=['Not Churned', 'Churned']
        )
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        # Plot feature importance
        plt.figure(figsize=(10, 6))
        sns.barplot(
            x='importance',
            y='feature',
            data=self.feature_importance.head(10)
        )
        plt.title('Top 10 Important Features')
        plt.show()

        return report

    def save_model(self, filename):
        """Save the model and all preprocessing state to ``filename``."""
        import joblib
        model_data = {
            'model': self.model,
            'scaler': self.scaler,
            'label_encoders': self.label_encoders,
            'fill_values': self.fill_values,
            'feature_importance': self.feature_importance
        }
        joblib.dump(model_data, filename)

    @classmethod
    def load_model(cls, filename):
        """Load a predictor previously written by ``save_model``.

        SECURITY: joblib files are pickle-based, and loading one can execute
        arbitrary code. Only load files from a trusted source.

        Args:
            filename: Path of the saved predictor.

        Returns:
            A ``ChurnPredictor`` with the saved model and preprocessing state.
        """
        import joblib
        predictor = cls()
        model_data = joblib.load(filename)
        predictor.model = model_data['model']
        predictor.scaler = model_data['scaler']
        predictor.label_encoders = model_data['label_encoders']
        # Files written before fill values were persisted lack this key
        predictor.fill_values = model_data.get('fill_values', {})
        predictor.feature_importance = model_data['feature_importance']
        return predictor

# Example usage: build a synthetic churn dataset, train and evaluate a
# ChurnPredictor, save it, reload it, and score three new customers.
if __name__ == "__main__":
    # Create sample data
    # Seeding makes the synthetic data reproducible; the values depend on the
    # order of the np.random calls below, so that order must not change.
    np.random.seed(42)
    n_samples = 1000
    
    # Four numeric features, four categorical features, and a binary target
    # drawn with a 70% / 30% split between 0 and 1
    data = {
        'age': np.random.normal(45, 15, n_samples),
        'tenure': np.random.randint(0, 10, n_samples),
        'monthly_charges': np.random.normal(70, 30, n_samples),
        'total_charges': np.random.normal(1000, 500, n_samples),
        'gender': np.random.choice(['Male', 'Female'], n_samples),
        'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples),
        'contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
        'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer'], n_samples),
        'churn': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
    }
    
    df = pd.DataFrame(data)
    
    # Split features and target
    X = df.drop('churn', axis=1)
    y = df['churn']
    
    # Split into training and testing sets (80% / 20%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Create and train predictor
    predictor = ChurnPredictor()
    
    # Preprocess data
    # The training frame goes first: preprocess_data fits a label encoder the
    # first time it sees a column and reuses it on later calls.
    X_train_processed = predictor.preprocess_data(X_train)
    X_test_processed = predictor.preprocess_data(X_test)
    
    # Train model
    predictor.train(X_train_processed, y_train)
    
    # Evaluate model (also displays the confusion-matrix and importance plots)
    print("\nModel Evaluation:")
    print("================")
    report = predictor.evaluate(X_test_processed, y_test)
    print("\nClassification Report:")
    print(report)
    
    # Save model
    predictor.save_model('churn_predictor.joblib')
    
    # Load model and make predictions
    loaded_predictor = ChurnPredictor.load_model('churn_predictor.joblib')
    
    # Make predictions for new customers
    # Same feature columns as the training data, without the 'churn' target
    new_customers = pd.DataFrame({
        'age': [35, 45, 55],
        'tenure': [2, 5, 8],
        'monthly_charges': [50, 70, 90],
        'total_charges': [800, 1200, 1600],
        'gender': ['Male', 'Female', 'Male'],
        'internet_service': ['DSL', 'Fiber optic', 'DSL'],
        'contract': ['Month-to-month', 'One year', 'Two year'],
        'payment_method': ['Electronic check', 'Bank transfer', 'Mailed check']
    })
    
    # The reloaded predictor carries the fitted encoders and scaler
    new_customers_processed = loaded_predictor.preprocess_data(new_customers)
    predictions = loaded_predictor.predict(new_customers_processed)
    probabilities = loaded_predictor.predict_proba(new_customers_processed)
    
    print("\nPredictions for New Customers:")
    print("=============================")
    for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
        print(f"Customer {i+1}:")
        print(f"Prediction: {'Churn' if pred == 1 else 'Not Churn'}")
        # prob[1] is the second column of predict_proba, i.e. class 1 (churn)
        print(f"Probability of Churn: {prob[1]:.2f}")
        print()

Best Practices

  1. Data Preprocessing
def preprocess_data(df):
    """Run ``df`` through the three preparation stages and return the result.

    The stages run in a fixed order: missing-value handling, feature
    scaling, then categorical encoding. The stage helpers are not defined
    in this snippet and must be supplied by the surrounding project.
    """
    stages = (handle_missing_values, scale_features, encode_categories)
    for stage in stages:
        # Each stage receives the output of the previous one
        df = stage(df)
    return df
  1. Model Evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True labels aligned with ``X_test``.

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``, each holding the corresponding score.
    """
    predictions = model.predict(X_test)

    # Result key -> metric function; every metric takes (y_true, y_pred)
    scorers = {
        'accuracy': metrics.accuracy_score,
        'precision': metrics.precision_score,
        'recall': metrics.recall_score,
        'f1': metrics.f1_score,
    }
    return {name: scorer(y_test, predictions) for name, scorer in scorers.items()}
  1. Model Persistence
import joblib

def save_model(model, filename):
    """Serialize ``model`` to ``filename`` with joblib."""
    joblib.dump(value=model, filename=filename)

def load_model(filename):
    """Return the object stored in the joblib file ``filename``.

    joblib files are pickle-based, so only load files you trust.
    """
    restored = joblib.load(filename)
    return restored

Common Patterns

  1. Pipeline Creation
from sklearn.pipeline import Pipeline

# Chain scaling and classification into one estimator and fit both steps
# together on the training data
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier()),
]).fit(X_train, y_train)
  1. Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature with the ANOVA F-test and keep the ten best-scoring
# columns
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit(X, y).transform(X)

Conclusion

Scikit-learn provides powerful tools for machine learning:

  • Easy-to-use API
  • Comprehensive algorithms
  • Efficient implementation
  • Excellent documentation

Keep exploring scikit-learn's capabilities to build better machine learning models.

Further Reading