Machine Learning with scikit-learn: A Practical Guide
Scikit-learn is a powerful library for machine learning in Python. In this guide, we'll explore how to preprocess data, build and evaluate classification and regression models, tune hyperparameters, and save trained models for reuse.
Getting Started with scikit-learn
First, let's set up our environment and import the necessary libraries:
import numpy as np
import pandas as pd
from sklearn import datasets, metrics, model_selection
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
Data Preprocessing
Loading and Preparing Data
# Load sample dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
Feature Scaling
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Classification Models
Logistic Regression
from sklearn.linear_model import LogisticRegression
# Create and train model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = lr_model.predict(X_test_scaled)
# Evaluate model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(metrics.classification_report(y_test, y_pred))
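Accuracy alone can hide class-imbalance effects. As a quick, optional sketch (reusing lr_model and the scaled test split from above), you can also inspect the predicted probabilities and the ROC AUC:
# Predicted probability of the positive class for each test sample
y_prob = lr_model.predict_proba(X_test_scaled)[:, 1]
# ROC AUC summarizes ranking quality across all decision thresholds
print("ROC AUC:", metrics.roc_auc_score(y_test, y_prob))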
Random Forest
from sklearn.ensemble import RandomForestClassifier
# Create and train model
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
rf_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = rf_model.predict(X_test_scaled)
# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("Top 10 Important Features:")
print(feature_importance.head(10))
Support Vector Machine
from sklearn.svm import SVC
# Create and train model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = svm_model.predict(X_test_scaled)
# Evaluate model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Regression Models
Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
# Load dataset (load_boston was removed from scikit-learn,
# so we use the California housing dataset instead)
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target)
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Create and train model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
# Make predictions
y_pred = lr_model.predict(X_test)
# Evaluate model
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))
Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# Create and train model
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
rf_model.fit(X_train, y_train)
# Make predictions
y_pred = rf_model.predict(X_test)
# Evaluate model
print("R2 Score:", metrics.r2_score(y_test, y_pred))
Model Selection and Evaluation
Cross-Validation
from sklearn.model_selection import cross_val_score
# Perform 5-fold cross-validation on the breast cancer classification split
# (X_train_scaled and y_train from the classification section above)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(
    rf_clf, X_train_scaled, y_train,
    cv=5, scoring='accuracy'
)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Standard deviation:", cv_scores.std())
Grid Search
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# Perform grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
Project: Customer Churn Prediction
Let's build a complete machine learning pipeline for predicting customer churn:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
class ChurnPredictor:
    def __init__(self):
        """Initialize the ChurnPredictor."""
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.model = None
        self.feature_importance = None

    def preprocess_data(self, df):
        """Preprocess the data for training or prediction."""
        # Create copy of dataframe
        df_processed = df.copy()
        # Handle missing values
        numeric_columns = df_processed.select_dtypes(include=['int64', 'float64']).columns
        categorical_columns = df_processed.select_dtypes(include=['object']).columns
        df_processed[numeric_columns] = df_processed[numeric_columns].fillna(
            df_processed[numeric_columns].mean()
        )
        df_processed[categorical_columns] = df_processed[categorical_columns].fillna(
            df_processed[categorical_columns].mode().iloc[0]
        )
        # Encode categorical variables
        for column in categorical_columns:
            if column not in self.label_encoders:
                self.label_encoders[column] = LabelEncoder()
                df_processed[column] = self.label_encoders[column].fit_transform(
                    df_processed[column]
                )
            else:
                df_processed[column] = self.label_encoders[column].transform(
                    df_processed[column]
                )
        return df_processed
    def train(self, X_train, y_train):
        """Train the model."""
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        # Train model
        self.model = RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            random_state=42
        )
        self.model.fit(X_train_scaled, y_train)
        # Calculate feature importance
        self.feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

    def predict(self, X):
        """Make predictions on new data."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """Get probability predictions."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)
    def evaluate(self, X_test, y_test):
        """Evaluate the model."""
        X_test_scaled = self.scaler.transform(X_test)
        y_pred = self.model.predict(X_test_scaled)
        # Calculate metrics
        report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            conf_matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=['Not Churned', 'Churned'],
            yticklabels=['Not Churned', 'Churned']
        )
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()
        # Plot feature importance
        plt.figure(figsize=(10, 6))
        sns.barplot(
            x='importance',
            y='feature',
            data=self.feature_importance.head(10)
        )
        plt.title('Top 10 Important Features')
        plt.show()
        return report
    def save_model(self, filename):
        """Save the model to a file."""
        import joblib
        model_data = {
            'model': self.model,
            'scaler': self.scaler,
            'label_encoders': self.label_encoders,
            'feature_importance': self.feature_importance
        }
        joblib.dump(model_data, filename)

    @classmethod
    def load_model(cls, filename):
        """Load a saved model."""
        import joblib
        predictor = cls()
        model_data = joblib.load(filename)
        predictor.model = model_data['model']
        predictor.scaler = model_data['scaler']
        predictor.label_encoders = model_data['label_encoders']
        predictor.feature_importance = model_data['feature_importance']
        return predictor
# Example usage
if __name__ == "__main__":
    # Create sample data
    np.random.seed(42)
    n_samples = 1000
    data = {
        'age': np.random.normal(45, 15, n_samples),
        'tenure': np.random.randint(0, 10, n_samples),
        'monthly_charges': np.random.normal(70, 30, n_samples),
        'total_charges': np.random.normal(1000, 500, n_samples),
        'gender': np.random.choice(['Male', 'Female'], n_samples),
        'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples),
        'contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
        'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer'], n_samples),
        'churn': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
    }
    df = pd.DataFrame(data)
    # Split features and target
    X = df.drop('churn', axis=1)
    y = df['churn']
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # Create and train predictor
    predictor = ChurnPredictor()
    # Preprocess data
    X_train_processed = predictor.preprocess_data(X_train)
    X_test_processed = predictor.preprocess_data(X_test)
    # Train model
    predictor.train(X_train_processed, y_train)
    # Evaluate model
    print("\nModel Evaluation:")
    print("================")
    report = predictor.evaluate(X_test_processed, y_test)
    print("\nClassification Report:")
    print(report)
    # Save model
    predictor.save_model('churn_predictor.joblib')
    # Load model and make predictions
    loaded_predictor = ChurnPredictor.load_model('churn_predictor.joblib')
    # Make predictions for new customers
    new_customers = pd.DataFrame({
        'age': [35, 45, 55],
        'tenure': [2, 5, 8],
        'monthly_charges': [50, 70, 90],
        'total_charges': [800, 1200, 1600],
        'gender': ['Male', 'Female', 'Male'],
        'internet_service': ['DSL', 'Fiber optic', 'DSL'],
        'contract': ['Month-to-month', 'One year', 'Two year'],
        'payment_method': ['Electronic check', 'Bank transfer', 'Mailed check']
    })
    new_customers_processed = loaded_predictor.preprocess_data(new_customers)
    predictions = loaded_predictor.predict(new_customers_processed)
    probabilities = loaded_predictor.predict_proba(new_customers_processed)
    print("\nPredictions for New Customers:")
    print("=============================")
    for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
        print(f"Customer {i+1}:")
        print(f"Prediction: {'Churn' if pred == 1 else 'Not Churn'}")
        print(f"Probability of Churn: {prob[1]:.2f}")
        print()
Best Practices
- Data Preprocessing
def preprocess_data(df):
    df = df.copy()
    # Handle missing values: numeric columns with the mean, categorical with the mode
    numeric_cols = df.select_dtypes(include='number').columns
    categorical_cols = df.select_dtypes(include='object').columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])
    # Scale numeric features
    df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])
    # Encode categorical variables as integer codes
    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col])
    return df
- Model Evaluation
def evaluate_model(model, X_test, y_test):
    # Make predictions
    y_pred = model.predict(X_test)
    # Calculate metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
- Model Persistence
import joblib
def save_model(model, filename):
    joblib.dump(model, filename)

def load_model(filename):
    return joblib.load(filename)
Common Patterns
- Pipeline Creation
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])
pipeline.fit(X_train, y_train)
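A pipeline can also be passed straight to GridSearchCV; parameters of a step are addressed with the '<step name>__<parameter>' naming convention (a brief sketch with an illustrative grid):
from sklearn.model_selection import GridSearchCV
# Tune the classifier step through the pipeline
param_grid = {'classifier__n_estimators': [100, 200], 'classifier__max_depth': [10, 20]}
search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
search.fit(X_train, y_train)
print("Best parameters:", search.best_params_)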
- Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
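To see which columns survived the selection, the selector exposes a boolean support mask (a small sketch, assuming X is a pandas DataFrame):
# get_support() returns True for each retained feature
selected_features = X.columns[selector.get_support()]
print("Selected features:", list(selected_features))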
Conclusion
Scikit-learn provides powerful tools for machine learning:
- Easy-to-use API
- Comprehensive algorithms
- Efficient implementation
- Excellent documentation
Keep exploring scikit-learn's capabilities to build better machine learning models.