Python Data Analysis with Pandas: From Basics to Advanced Techniques
Data analysis is a crucial skill in today's data-driven world, and Python's Pandas library has become the go-to tool for data scientists and analysts. This guide walks through the core Pandas workflow: loading, cleaning, analyzing, and visualizing data.
Getting Started with Pandas
First, let's set up our environment:
pip install pandas numpy matplotlib seaborn jupyter
Basic Data Structures
import pandas as pd
import numpy as np
# Creating a Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# Creating a DataFrame
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
    'B': [1, 2, 3, 4, 5, 6],
    'C': [2.0, 5.0, 8.0, 1.0, 2.0, 9.0]
})
Data Loading and Export
# Reading from CSV
df = pd.read_csv('data.csv')
# Reading from Excel
df = pd.read_excel('data.xlsx')
# Reading from JSON
df = pd.read_json('data.json')
# Reading from SQL database
from sqlalchemy import create_engine
engine = create_engine('sqlite:///database.db')
df = pd.read_sql('SELECT * FROM my_table', engine)
# Exporting data
df.to_csv('output.csv', index=False)
df.to_excel('output.xlsx')
df.to_json('output.json')
Data Cleaning and Preprocessing
Handling Missing Values
# Check for missing values
df.isnull().sum()
# Fill missing values
df.fillna(0)  # Fill with zero
df.ffill()    # Forward fill (fillna(method='ffill') is deprecated)
df.bfill()    # Backward fill
# Drop missing values
df.dropna() # Drop rows with any missing values
df.dropna(how='all') # Drop rows with all missing values
Data Type Conversion
# Convert data types
df['column'] = df['column'].astype('int64')
df['date'] = pd.to_datetime(df['date'])
# Handle categorical data
df['category'] = df['category'].astype('category')
Removing Duplicates
# Check for duplicates
df.duplicated().sum()
# Remove duplicates
df.drop_duplicates()
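By default, rows are compared across every column. When duplicates should be judged on specific columns instead, drop_duplicates accepts subset and keep arguments; a quick sketch (the column name is illustrative):
# Keep only the most recent row per customer_id
df.drop_duplicates(subset=['customer_id'], keep='last')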
Data Analysis Techniques
Basic Statistics
# Summary statistics
df.describe()
# Correlation analysis (numeric columns only; required for mixed dtypes in pandas 2.x)
df.corr(numeric_only=True)
# Group by operations
df.groupby('category').agg({
    'value': ['mean', 'sum', 'count'],
    'other_value': ['min', 'max']
})
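The dictionary form above returns a DataFrame with a MultiIndex on the columns. When flat column names are preferable, pandas also supports named aggregation; a short sketch using the same illustrative columns:
# Named aggregation: output_column=(input_column, function)
df.groupby('category').agg(
    value_mean=('value', 'mean'),
    value_total=('value', 'sum'),
    other_min=('other_value', 'min')
)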
Time Series Analysis
# Set datetime index
df.set_index('date', inplace=True)
# Resample data
daily = df.resample('D').mean()
monthly = df.resample('ME').sum()  # 'ME' = month-end; the bare 'M' alias is deprecated in pandas 2.2+
# Rolling statistics
df['rolling_mean'] = df['value'].rolling(window=7).mean()
Advanced Analysis
# Pivot tables
pivot = pd.pivot_table(
    df,
    values='value',
    index='category',
    columns='subcategory',
    aggfunc='sum'
)
# Cross-tabulation
pd.crosstab(df['category'], df['subcategory'])
# Merge and join operations
merged = pd.merge(df1, df2, on='key', how='left')
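The how argument decides which keys survive the merge: 'left' keeps every row of the left frame, 'inner' keeps only keys present in both, and 'outer' keeps everything. A tiny self-contained sketch:
left = pd.DataFrame({'key': ['a', 'b', 'c'], 'x': [1, 2, 3]})
right = pd.DataFrame({'key': ['b', 'c', 'd'], 'y': [10, 20, 30]})
pd.merge(left, right, on='key', how='left')   # keys a, b, c; y is NaN for 'a'
pd.merge(left, right, on='key', how='inner')  # keys b, c only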
Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Basic plotting
df['value'].plot(kind='line')
df['category'].value_counts().plot(kind='bar')
df.plot.scatter(x='x', y='y')
# Seaborn visualizations
sns.heatmap(df.corr(numeric_only=True), annot=True)
sns.boxplot(x='category', y='value', data=df)
sns.pairplot(df)
plt.show()
Project: Sales Data Analysis
Let's analyze a real-world sales dataset:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


class SalesAnalyzer:
    def __init__(self, data_file):
        self.df = pd.read_csv(data_file)
        self.preprocess_data()

    def preprocess_data(self):
        # Convert date column
        self.df['date'] = pd.to_datetime(self.df['date'])
        # Handle missing values (assign back rather than using inplace fillna
        # on a column, which triggers chained-assignment warnings in pandas 2.x)
        self.df['quantity'] = self.df['quantity'].fillna(0)
        self.df['price'] = self.df['price'].fillna(self.df['price'].mean())
        # Calculate total sales
        self.df['total_sales'] = self.df['quantity'] * self.df['price']

    def monthly_sales_trend(self):
        """Analyze monthly sales trends."""
        monthly = self.df.set_index('date').resample('ME')['total_sales'].sum()
        plt.figure(figsize=(12, 6))
        monthly.plot(kind='line', marker='o')
        plt.title('Monthly Sales Trend')
        plt.xlabel('Month')
        plt.ylabel('Total Sales')
        plt.grid(True)
        return monthly

    def top_products(self, n=10):
        """Get the n top-selling products."""
        return self.df.groupby('product')['total_sales'].sum().nlargest(n)

    def sales_by_category(self):
        """Analyze sales by category."""
        category_sales = self.df.groupby('category')['total_sales'].sum()
        plt.figure(figsize=(10, 6))
        category_sales.plot(kind='pie', autopct='%1.1f%%')
        plt.title('Sales Distribution by Category')
        return category_sales

    def customer_segmentation(self):
        """Segment customers based on purchase behavior."""
        # Named aggregation keeps the column index flat for the clustering step
        customer_stats = self.df.groupby('customer_id').agg(
            total_spend=('total_sales', 'sum'),
            avg_order=('total_sales', 'mean'),
            n_orders=('total_sales', 'count')
        )
        # K-means clustering on standardized features
        scaler = StandardScaler()
        scaled_stats = scaler.fit_transform(customer_stats)
        kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
        customer_stats['segment'] = kmeans.fit_predict(scaled_stats)
        return customer_stats

    def generate_report(self):
        """Generate a comprehensive sales report."""
        report = {
            'total_revenue': self.df['total_sales'].sum(),
            'average_order_value': self.df.groupby('order_id')['total_sales'].sum().mean(),
            'total_orders': self.df['order_id'].nunique(),
            'total_customers': self.df['customer_id'].nunique(),
            'top_products': self.top_products(),
            'monthly_trend': self.monthly_sales_trend(),
            'category_distribution': self.sales_by_category(),
            'customer_segments': self.customer_segmentation()
        }
        return report


# Example usage
if __name__ == '__main__':
    analyzer = SalesAnalyzer('sales_data.csv')
    report = analyzer.generate_report()
    print(f"Total Revenue: ${report['total_revenue']:,.2f}")
    print(f"Average Order Value: ${report['average_order_value']:,.2f}")
    print(f"Total Orders: {report['total_orders']}")
    print(f"Total Customers: {report['total_customers']}")
    # Display visualizations
    plt.show()
Best Practices
1. Data Quality
   - Always check for missing values
   - Validate data types
   - Handle outliers appropriately
   - Document data cleaning steps
2. Performance
   - Use appropriate data types
   - Avoid Python-level loops; prefer vectorized operations (see the sketch after this list)
   - Consider chunking for large datasets
3. Code Organization
   - Create reusable functions
   - Use classes for complex analysis
   - Maintain clear documentation
   - Follow the PEP 8 style guide
4. Memory Management
   - Read large files in chunks
   - Delete objects you no longer need
   - Use appropriate data types
   - Consider Dask for data that doesn't fit in memory
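A minimal sketch tying several of these practices together, assuming an illustrative file name, chunk size, and column names:
import pandas as pd

# Read a large CSV in chunks to keep peak memory low
parts = []
for chunk in pd.read_csv('big_data.csv', chunksize=100_000):
    # Use appropriate (smaller) dtypes where the values allow it
    chunk['quantity'] = pd.to_numeric(chunk['quantity'], downcast='integer')
    # Vectorized arithmetic instead of a Python loop over rows
    chunk['total'] = chunk['quantity'] * chunk['price']
    parts.append(chunk)
df = pd.concat(parts, ignore_index=True)

# Categorical dtype saves memory for low-cardinality string columns
df['category'] = df['category'].astype('category')

# One way to handle outliers: clip to the 1st and 99th percentiles
low, high = df['total'].quantile([0.01, 0.99])
df['total'] = df['total'].clip(low, high)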
Advanced Topics
- Custom Aggregations
def custom_agg(x):
    return pd.Series({
        'mean': x.mean(),
        'std': x.std(),
        'q75': x.quantile(0.75)
    })

# Use apply rather than agg here: agg expects each function to return a scalar
df.groupby('category')['value'].apply(custom_agg).unstack()
- Window Functions
df['moving_avg'] = df.groupby('category')['value'].transform(
    lambda x: x.rolling(window=3).mean()
)
- Complex Transformations
def complex_transform(group):
    return group.assign(
        pct_of_total=lambda x: x['value'] / x['value'].sum() * 100,
        cumulative=lambda x: x['value'].cumsum()
    )

# group_keys=False keeps the original row index instead of adding a group level
df.groupby('category', group_keys=False).apply(complex_transform)
Conclusion
Pandas is an incredibly powerful tool for data analysis that can help you:
- Clean and preprocess data efficiently
- Perform complex analyses with ease
- Create insightful visualizations
- Generate comprehensive reports
Keep practicing with different datasets and exploring new features to become proficient in data analysis with Python.