Machine Learning with scikit-learn: A Practical Guide
Scikit-learn is a powerful library for machine learning in Python. In this guide, we'll work through building, evaluating, tuning, and persisting machine learning models with it.
Getting Started with scikit-learn
First, let's set up our environment. If the libraries aren't already installed, a typical setup (assuming pip) looks like this:
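pip install scikit-learn numpy pandas matplotlib seaborn

Then import the necessary libraries: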
import numpy as np
import pandas as pd
from sklearn import datasets, metrics, model_selection
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
Data Preprocessing
Loading and Preparing Data
# Load sample dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
Feature Scaling
# Standardize features: fit the scaler on the training data only,
# then apply the same transformation to the test set to avoid data leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Classification Models
Logistic Regression
from sklearn.linear_model import LogisticRegression
# Create and train model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = lr_model.predict(X_test_scaled)
# Evaluate model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(metrics.classification_report(y_test, y_pred))
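Because the inputs were standardized, the magnitudes of the learned coefficients give a rough (though not definitive) ranking of feature influence. A quick sketch:

# Rank features by absolute coefficient size (rough influence proxy)
coefs = pd.Series(lr_model.coef_[0], index=X.columns)
print(coefs.abs().sort_values(ascending=False).head(10))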
Random Forest
from sklearn.ensemble import RandomForestClassifier
# Create and train model
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
rf_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = rf_model.predict(X_test_scaled)
# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("Top 10 Important Features:")
print(feature_importance.head(10))
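Impurity-based importances can overstate continuous or high-cardinality features. As a sanity check, permutation importance, computed here on the held-out test set, is one alternative; a minimal sketch:

from sklearn.inspection import permutation_importance
# Measure how much shuffling each feature degrades test performance
result = permutation_importance(
    rf_model, X_test_scaled, y_test, n_repeats=10, random_state=42
)
perm_importance = pd.Series(result.importances_mean, index=X.columns)
print(perm_importance.sort_values(ascending=False).head(10))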
Support Vector Machine
from sklearn.svm import SVC
# Create and train model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = svm_model.predict(X_test_scaled)
# Evaluate model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Regression Models
Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
# Load dataset (load_boston was removed in scikit-learn 1.2;
# the California housing dataset serves the same purpose here)
housing = fetch_california_housing()
X_reg = pd.DataFrame(housing.data, columns=housing.feature_names)
y_reg = pd.Series(housing.target)
# Split data (new variable names keep the classification split above intact)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
# Create and train model
lin_model = LinearRegression()
lin_model.fit(X_reg_train, y_reg_train)
# Make predictions
y_reg_pred = lin_model.predict(X_reg_test)
# Evaluate model
print("R2 Score:", metrics.r2_score(y_reg_test, y_reg_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_reg_test, y_reg_pred))
Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# Create and train model
rf_reg = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
rf_reg.fit(X_reg_train, y_reg_train)
# Make predictions
y_reg_pred = rf_reg.predict(X_reg_test)
# Evaluate model
print("R2 Score:", metrics.r2_score(y_reg_test, y_reg_pred))
Model Selection and Evaluation
Cross-Validation
from sklearn.model_selection import cross_val_score
# Cross-validate the random forest classifier on the breast cancer training split
cv_scores = cross_val_score(
    rf_model, X_train_scaled, y_train,
    cv=5, scoring='accuracy'
)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Standard deviation:", cv_scores.std())
Grid Search
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# Perform grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
Project: Customer Churn Prediction
Let's build a complete machine learning pipeline for predicting customer churn:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
class ChurnPredictor:
    def __init__(self):
        """Initialize the ChurnPredictor."""
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.model = None
        self.feature_importance = None

    def preprocess_data(self, df):
        """Preprocess the data for training or prediction."""
        # Create copy of dataframe
        df_processed = df.copy()

        # Handle missing values
        numeric_columns = df_processed.select_dtypes(include=['int64', 'float64']).columns
        categorical_columns = df_processed.select_dtypes(include=['object']).columns
        df_processed[numeric_columns] = df_processed[numeric_columns].fillna(
            df_processed[numeric_columns].mean()
        )
        df_processed[categorical_columns] = df_processed[categorical_columns].fillna(
            df_processed[categorical_columns].mode().iloc[0]
        )

        # Encode categorical variables: fit encoders on the first call (training data),
        # reuse them afterwards; note that transform raises on unseen categories
        for column in categorical_columns:
            if column not in self.label_encoders:
                self.label_encoders[column] = LabelEncoder()
                df_processed[column] = self.label_encoders[column].fit_transform(
                    df_processed[column]
                )
            else:
                df_processed[column] = self.label_encoders[column].transform(
                    df_processed[column]
                )
        return df_processed

    def train(self, X_train, y_train):
        """Train the model."""
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)

        # Train model
        self.model = RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            random_state=42
        )
        self.model.fit(X_train_scaled, y_train)

        # Calculate feature importance
        self.feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

    def predict(self, X):
        """Make predictions on new data."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """Get probability predictions."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def evaluate(self, X_test, y_test):
        """Evaluate the model."""
        X_test_scaled = self.scaler.transform(X_test)
        y_pred = self.model.predict(X_test_scaled)

        # Calculate metrics
        report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            conf_matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=['Not Churned', 'Churned'],
            yticklabels=['Not Churned', 'Churned']
        )
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        # Plot feature importance
        plt.figure(figsize=(10, 6))
        sns.barplot(
            x='importance',
            y='feature',
            data=self.feature_importance.head(10)
        )
        plt.title('Top 10 Important Features')
        plt.show()

        return report

    def save_model(self, filename):
        """Save the model to a file."""
        import joblib
        model_data = {
            'model': self.model,
            'scaler': self.scaler,
            'label_encoders': self.label_encoders,
            'feature_importance': self.feature_importance
        }
        joblib.dump(model_data, filename)

    @classmethod
    def load_model(cls, filename):
        """Load a saved model."""
        import joblib
        predictor = cls()
        model_data = joblib.load(filename)
        predictor.model = model_data['model']
        predictor.scaler = model_data['scaler']
        predictor.label_encoders = model_data['label_encoders']
        predictor.feature_importance = model_data['feature_importance']
        return predictor
# Example usage
if __name__ == "__main__":
    # Create sample data
    np.random.seed(42)
    n_samples = 1000
    data = {
        'age': np.random.normal(45, 15, n_samples),
        'tenure': np.random.randint(0, 10, n_samples),
        'monthly_charges': np.random.normal(70, 30, n_samples),
        'total_charges': np.random.normal(1000, 500, n_samples),
        'gender': np.random.choice(['Male', 'Female'], n_samples),
        'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples),
        'contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
        'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer'], n_samples),
        'churn': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
    }
    df = pd.DataFrame(data)

    # Split features and target
    X = df.drop('churn', axis=1)
    y = df['churn']

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Create and train predictor
    predictor = ChurnPredictor()

    # Preprocess data
    X_train_processed = predictor.preprocess_data(X_train)
    X_test_processed = predictor.preprocess_data(X_test)

    # Train model
    predictor.train(X_train_processed, y_train)

    # Evaluate model
    print("\nModel Evaluation:")
    print("================")
    report = predictor.evaluate(X_test_processed, y_test)
    print("\nClassification Report:")
    print(report)

    # Save model
    predictor.save_model('churn_predictor.joblib')

    # Load model and make predictions
    loaded_predictor = ChurnPredictor.load_model('churn_predictor.joblib')

    # Make predictions for new customers
    new_customers = pd.DataFrame({
        'age': [35, 45, 55],
        'tenure': [2, 5, 8],
        'monthly_charges': [50, 70, 90],
        'total_charges': [800, 1200, 1600],
        'gender': ['Male', 'Female', 'Male'],
        'internet_service': ['DSL', 'Fiber optic', 'DSL'],
        'contract': ['Month-to-month', 'One year', 'Two year'],
        'payment_method': ['Electronic check', 'Bank transfer', 'Mailed check']
    })
    new_customers_processed = loaded_predictor.preprocess_data(new_customers)
    predictions = loaded_predictor.predict(new_customers_processed)
    probabilities = loaded_predictor.predict_proba(new_customers_processed)

    print("\nPredictions for New Customers:")
    print("=============================")
    for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
        print(f"Customer {i+1}:")
        print(f"Prediction: {'Churn' if pred == 1 else 'Not Churn'}")
        print(f"Probability of Churn: {prob[1]:.2f}")
        print()
Best Practices
- Data Preprocessing
def preprocess_data(df):
    # The helpers below are placeholders; implement them for your dataset
    # Handle missing values (e.g. mean/mode imputation)
    df = handle_missing_values(df)
    # Scale features (e.g. StandardScaler on numeric columns)
    df = scale_features(df)
    # Encode categorical variables (e.g. OneHotEncoder)
    df = encode_categories(df)
    return df
- Model Evaluation
def evaluate_model(model, X_test, y_test):
    # Make predictions
    y_pred = model.predict(X_test)
    # Calculate metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
- Model Persistence
import joblib
def save_model(model, filename):
    joblib.dump(model, filename)

def load_model(filename):
    return joblib.load(filename)
Common Patterns
- Pipeline Creation
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])
pipeline.fit(X_train, y_train)
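Pipelines can also be tuned end to end: hyperparameters are addressed as step name, double underscore, parameter name. A minimal sketch (the 'classifier' name matches the step defined above):

# Tune a pipeline step's hyperparameters through the pipeline itself
pipe_params = {'classifier__n_estimators': [100, 200]}
search = GridSearchCV(pipeline, pipe_params, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)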
- Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
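fit_transform returns a bare array, so the selected column names are lost; they can be recovered from the selector's mask (assuming X is a DataFrame):

# Recover the names of the k selected features
selected_features = X.columns[selector.get_support()]
print(selected_features.tolist())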
Conclusion
Scikit-learn provides powerful tools for machine learning:
- Easy-to-use API
- Comprehensive algorithms
- Efficient implementation
- Excellent documentation
Keep exploring scikit-learn's capabilities to build better machine learning models.