Data Analysis with Pandas: From Basics to Advanced Techniques
Data Analysis with Pandas: From Basics to Advanced Techniques
Pandas is a powerful library for data manipulation and analysis in Python. In this guide, we'll explore how to use Pandas effectively for data analysis tasks.
Getting Started with Pandas
First, let's set up our environment and import the necessary libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 1000)
Basic Data Structures
Series
A one-dimensional labeled array:
# Create a Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
# Create a Series with custom index
s = pd.Series([1, 3, 5, 6, 8], index=['a', 'b', 'c', 'd', 'e'])
print(s)
# Access elements
print(s['a']) # Access by label
print(s[0]) # Access by position
DataFrame
A two-dimensional labeled data structure:
# Create a DataFrame from a dictionary
data = {
'name': ['John', 'Emma', 'Alex', 'Sarah'],
'age': [28, 24, 32, 27],
'city': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)
print(df)
# Create a DataFrame from a CSV file
df = pd.read_csv('data.csv')
# Basic information about the DataFrame
print(df.info())
print(df.describe())
Data Loading and Export
# Read from various file formats
df_csv = pd.read_csv('data.csv')
df_excel = pd.read_excel('data.xlsx')
df_json = pd.read_json('data.json')
df_sql = pd.read_sql('SELECT * FROM table', connection)
# Export to different formats
df.to_csv('output.csv', index=False)
df.to_excel('output.xlsx', index=False)
df.to_json('output.json', orient='records')
df.to_sql('table_name', connection)
Data Cleaning and Preprocessing
Handling Missing Values
# Check for missing values
print(df.isnull().sum())
# Fill missing values
df['column'].fillna(0) # Fill with a specific value
df['column'].fillna(df['column'].mean()) # Fill with mean
df['column'].fillna(method='ffill') # Forward fill
df['column'].fillna(method='bfill') # Backward fill
# Drop missing values
df.dropna() # Drop rows with any missing values
df.dropna(subset=['column']) # Drop rows with missing values in specific columns
Data Type Conversion
# Convert data types
df['date'] = pd.to_datetime(df['date'])
df['number'] = pd.to_numeric(df['number'], errors='coerce')
df['category'] = df['category'].astype('category')
# Memory optimization
def optimize_dtypes(df):
for col in df.columns:
if df[col].dtype == 'object':
if len(df[col].unique()) / len(df) < 0.5:
df[col] = df[col].astype('category')
elif df[col].dtype == 'float64':
df[col] = pd.to_numeric(df[col], downcast='float')
elif df[col].dtype == 'int64':
df[col] = pd.to_numeric(df[col], downcast='integer')
return df
Data Analysis Techniques
Basic Statistics
# Summary statistics
print(df.describe())
print(df.corr()) # Correlation matrix
# Group by operations
df.groupby('category')['value'].mean()
df.groupby(['category1', 'category2'])['value'].agg(['mean', 'sum', 'count'])
# Custom aggregation
def custom_metric(x):
return x.sum() / x.count()
df.groupby('category').agg({
'value1': 'mean',
'value2': 'sum',
'value3': custom_metric
})
Time Series Analysis
# Set datetime index
df.set_index('date', inplace=True)
# Resample data
daily = df.resample('D').mean()
monthly = df.resample('M').sum()
# Rolling statistics
df['rolling_mean'] = df['value'].rolling(window=7).mean()
df['rolling_std'] = df['value'].rolling(window=7).std()
# Time-based indexing
df['2024-01':'2024-03'] # Select date range
df.loc['2024-01-01'] # Select specific date
Advanced Analysis
# Pivot tables
pivot = pd.pivot_table(
df,
values='value',
index='category1',
columns='category2',
aggfunc='mean',
fill_value=0
)
# Cross-tabulation
pd.crosstab(df['category1'], df['category2'])
# Apply custom functions
def categorize(x):
if x < 0:
return 'negative'
elif x == 0:
return 'zero'
else:
return 'positive'
df['category'] = df['value'].apply(categorize)
Project: Sales Data Analysis
Let's build a complete sales analysis system:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
class SalesAnalyzer:
def __init__(self, data_file):
"""Initialize the SalesAnalyzer with a data file."""
self.df = pd.read_csv(data_file)
self.preprocess_data()
def preprocess_data(self):
"""Clean and preprocess the data."""
# Convert date columns
self.df['date'] = pd.to_datetime(self.df['date'])
# Set date as index
self.df.set_index('date', inplace=True)
# Convert numeric columns
numeric_columns = ['quantity', 'unit_price', 'total_amount']
for col in numeric_columns:
self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
# Calculate derived metrics
self.df['profit'] = self.df['total_amount'] - (self.df['quantity'] * self.df['cost_price'])
self.df['profit_margin'] = self.df['profit'] / self.df['total_amount'] * 100
def get_daily_sales(self):
"""Calculate daily sales metrics."""
return self.df.resample('D').agg({
'quantity': 'sum',
'total_amount': 'sum',
'profit': 'sum'
})
def get_monthly_sales(self):
"""Calculate monthly sales metrics."""
return self.df.resample('M').agg({
'quantity': 'sum',
'total_amount': 'sum',
'profit': 'sum',
'profit_margin': 'mean'
})
def get_product_performance(self):
"""Analyze product performance."""
return self.df.groupby('product_id').agg({
'quantity': 'sum',
'total_amount': 'sum',
'profit': 'sum',
'profit_margin': 'mean'
}).sort_values('profit', ascending=False)
def get_customer_segments(self):
"""Segment customers based on purchase behavior."""
customer_metrics = self.df.groupby('customer_id').agg({
'total_amount': 'sum',
'quantity': 'sum',
'profit': 'sum'
})
# Calculate RFM scores
current_date = self.df.index.max()
# Recency
customer_metrics['last_purchase'] = self.df.groupby('customer_id')['date'].max()
customer_metrics['recency'] = (current_date - customer_metrics['last_purchase']).dt.days
# Frequency
customer_metrics['frequency'] = self.df.groupby('customer_id').size()
# Monetary
customer_metrics['monetary'] = customer_metrics['total_amount']
# Calculate RFM scores
r_labels = range(4, 0, -1)
r_quartiles = pd.qcut(customer_metrics['recency'], q=4, labels=r_labels)
f_labels = range(1, 5)
f_quartiles = pd.qcut(customer_metrics['frequency'], q=4, labels=f_labels)
m_labels = range(1, 5)
m_quartiles = pd.qcut(customer_metrics['monetary'], q=4, labels=m_labels)
customer_metrics['R'] = r_quartiles
customer_metrics['F'] = f_quartiles
customer_metrics['M'] = m_quartiles
customer_metrics['RFM_Score'] = customer_metrics['R'].astype(str) + \
customer_metrics['F'].astype(str) + \
customer_metrics['M'].astype(str)
# Segment customers
def segment_customers(row):
score = row['RFM_Score']
if score in ['444', '434', '443', '433']:
return 'Best Customers'
elif row['R'] == 4:
return 'Lost Customers'
elif row['F'] == 4 and row['M'] == 4:
return 'Loyal Customers'
elif row['M'] == 4:
return 'Big Spenders'
elif row['F'] == 4:
return 'Frequent Customers'
else:
return 'Average Customers'
customer_metrics['Customer_Segment'] = customer_metrics.apply(segment_customers, axis=1)
return customer_metrics
def get_sales_forecast(self, periods=30):
"""Generate sales forecast using simple moving average."""
daily_sales = self.get_daily_sales()
# Calculate moving averages
ma7 = daily_sales['total_amount'].rolling(window=7).mean()
ma30 = daily_sales['total_amount'].rolling(window=30).mean()
# Create forecast DataFrame
forecast = pd.DataFrame({
'MA7': ma7,
'MA30': ma30
})
# Generate future dates
last_date = daily_sales.index[-1]
future_dates = pd.date_range(start=last_date + timedelta(days=1),
periods=periods,
freq='D')
# Create future forecast
future_forecast = pd.DataFrame(index=future_dates)
future_forecast['Forecast'] = ma30.iloc[-1] # Use last 30-day MA as forecast
return forecast, future_forecast
def plot_sales_trend(self):
"""Plot sales trends."""
daily_sales = self.get_daily_sales()
plt.figure(figsize=(15, 6))
plt.plot(daily_sales.index, daily_sales['total_amount'], label='Daily Sales')
plt.plot(daily_sales.index, daily_sales['total_amount'].rolling(7).mean(),
label='7-day Moving Average')
plt.title('Daily Sales Trend')
plt.xlabel('Date')
plt.ylabel('Sales Amount')
plt.legend()
plt.grid(True)
plt.show()
def plot_product_performance(self):
"""Plot product performance."""
product_perf = self.get_product_performance().head(10)
plt.figure(figsize=(12, 6))
sns.barplot(x=product_perf.index, y='profit', data=product_perf)
plt.title('Top 10 Products by Profit')
plt.xlabel('Product ID')
plt.ylabel('Profit')
plt.xticks(rotation=45)
plt.show()
def generate_report(self):
"""Generate a comprehensive sales report."""
report = {
'total_sales': self.df['total_amount'].sum(),
'total_profit': self.df['profit'].sum(),
'average_profit_margin': self.df['profit_margin'].mean(),
'total_customers': self.df['customer_id'].nunique(),
'total_products': self.df['product_id'].nunique(),
'daily_sales': self.get_daily_sales(),
'monthly_sales': self.get_monthly_sales(),
'product_performance': self.get_product_performance(),
'customer_segments': self.get_customer_segments()
}
return report
# Example usage
if __name__ == "__main__":
# Create sample data
np.random.seed(42)
dates = pd.date_range(start='2024-01-01', end='2024-03-31', freq='D')
n_records = 1000
data = {
'date': np.random.choice(dates, n_records),
'customer_id': np.random.randint(1, 101, n_records),
'product_id': np.random.randint(1, 51, n_records),
'quantity': np.random.randint(1, 11, n_records),
'unit_price': np.random.uniform(10, 100, n_records),
'cost_price': np.random.uniform(5, 50, n_records)
}
df = pd.DataFrame(data)
df['total_amount'] = df['quantity'] * df['unit_price']
# Save to CSV
df.to_csv('sales_data.csv', index=False)
# Create analyzer
analyzer = SalesAnalyzer('sales_data.csv')
# Generate and print report
report = analyzer.generate_report()
print("\nSales Analysis Report")
print("===================")
print(f"Total Sales: ${report['total_sales']:,.2f}")
print(f"Total Profit: ${report['total_profit']:,.2f}")
print(f"Average Profit Margin: {report['average_profit_margin']:.2f}%")
print(f"Total Customers: {report['total_customers']}")
print(f"Total Products: {report['total_products']}")
# Plot visualizations
analyzer.plot_sales_trend()
analyzer.plot_product_performance()
Data Visualization
Basic Plotting
# Line plot
df['value'].plot(kind='line')
# Bar plot
df['category'].value_counts().plot(kind='bar')
# Histogram
df['value'].hist(bins=50)
# Box plot
df.boxplot(column='value', by='category')
# Scatter plot
df.plot.scatter(x='value1', y='value2')
Advanced Visualization with Seaborn
# Distribution plot
sns.distplot(df['value'])
# Box plot with swarm
sns.boxplot(x='category', y='value', data=df)
sns.swarmplot(x='category', y='value', data=df, color='0.25')
# Heatmap
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
# Pair plot
sns.pairplot(df, hue='category')
# Joint plot
sns.jointplot(x='value1', y='value2', data=df, kind='reg')
Best Practices
- Memory Management
# Read large files in chunks
chunks = pd.read_csv('large_file.csv', chunksize=10000)
for chunk in chunks:
process_chunk(chunk)
# Use appropriate data types
df = optimize_dtypes(df)
- Performance Optimization
# Use vectorized operations
df['new_column'] = df['column'] * 2 # Good
df['new_column'] = df.apply(lambda x: x['column'] * 2, axis=1) # Bad
# Use query for filtering
df.query('column > 5') # More efficient
df[df['column'] > 5] # Less efficient
- Code Organization
class DataProcessor:
def __init__(self, df):
self.df = df
def clean_data(self):
# Cleaning operations
pass
def transform_data(self):
# Transformation operations
pass
def analyze_data(self):
# Analysis operations
pass
Common Patterns
- Data Pipeline
def process_data(file_path):
# Load data
df = pd.read_csv(file_path)
# Clean data
df = clean_data(df)
# Transform data
df = transform_data(df)
# Analyze data
results = analyze_data(df)
return results
- Custom Aggregations
def weighted_average(group):
weights = group['weight']
values = group['value']
return (weights * values).sum() / weights.sum()
df.groupby('category').apply(weighted_average)
Conclusion
Pandas provides powerful tools for data analysis:
- Efficient data manipulation
- Flexible data structures
- Rich analysis capabilities
- Seamless integration with other libraries
Keep exploring Pandas' features to enhance your data analysis skills.