Data Analysis with Pandas: From Basics to Advanced Techniques

Pandas is a powerful library for data manipulation and analysis in Python. In this guide, we'll explore how to use Pandas effectively for data analysis tasks.

Getting Started with Pandas

First, let's set up our environment and import the necessary libraries:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pd.set_option('display.max_columns', None)  # show every column, never truncate
pd.set_option('display.max_rows', 50)       # cap printed rows at 50
pd.set_option('display.width', 1000)        # avoid wrapping wide frames

Basic Data Structures

Series

A one-dimensional labeled array:

# Create a Series (np.nan marks a missing value)
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)

# Create a Series with custom index
s = pd.Series([1, 3, 5, 6, 8], index=['a', 'b', 'c', 'd', 'e'])
print(s)

# Access elements
print(s['a'])     # Access by label
print(s.iloc[0])  # Access by position (s[0] on a labeled index is deprecated)

DataFrame

A two-dimensional labeled data structure:

# Create a DataFrame from a dictionary of equal-length lists
data = {
    'name': ['John', 'Emma', 'Alex', 'Sarah'],
    'age': [28, 24, 32, 27],
    'city': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)
print(df)

# Create a DataFrame from a CSV file (expects 'data.csv' in the working directory)
df = pd.read_csv('data.csv')

# Basic information about the DataFrame
print(df.info())      # dtypes, non-null counts, memory usage
print(df.describe())  # summary statistics for the numeric columns

Data Loading and Export

# Read from various file formats
df_csv = pd.read_csv('data.csv')
df_excel = pd.read_excel('data.xlsx')  # requires an Excel engine (e.g. openpyxl)
df_json = pd.read_json('data.json')
# 'connection' is assumed to be an open DB-API/SQLAlchemy connection created elsewhere
df_sql = pd.read_sql('SELECT * FROM table', connection)

# Export to different formats
df.to_csv('output.csv', index=False)   # index=False drops the row index column
df.to_excel('output.xlsx', index=False)
df.to_json('output.json', orient='records')  # one JSON object per row
df.to_sql('table_name', connection)

Data Cleaning and Preprocessing

Handling Missing Values

# Check for missing values (count of NaN per column)
print(df.isnull().sum())

# Fill missing values -- fillna returns a copy, so assign the result back
df['column'] = df['column'].fillna(0)                    # fill with a specific value
df['column'] = df['column'].fillna(df['column'].mean())  # fill with the column mean
df['column'] = df['column'].ffill()  # forward fill (fillna(method=...) is deprecated)
df['column'] = df['column'].bfill()  # backward fill

# Drop missing values (dropna also returns a copy)
df = df.dropna()                   # drop rows with any missing values
df = df.dropna(subset=['column'])  # drop rows with missing values in specific columns

Data Type Conversion

# Convert data types
df['date'] = pd.to_datetime(df['date'])
# errors='coerce' turns unparseable values into NaN instead of raising
df['number'] = pd.to_numeric(df['number'], errors='coerce')
df['category'] = df['category'].astype('category')  # compact storage for low-cardinality text

# Memory optimization
def optimize_dtypes(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            if len(df[col].unique()) / len(df) < 0.5:
                df[col] = df[col].astype('category')
        elif df[col].dtype == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='float')
        elif df[col].dtype == 'int64':
            df[col] = pd.to_numeric(df[col], downcast='integer')
    return df

Data Analysis Techniques

Basic Statistics

# Summary statistics
print(df.describe())
# Correlation matrix; numeric_only=True avoids a TypeError when the frame
# also contains non-numeric columns (the default changed in pandas 2.0)
print(df.corr(numeric_only=True))

# Group by operations
df.groupby('category')['value'].mean()
df.groupby(['category1', 'category2'])['value'].agg(['mean', 'sum', 'count'])

# Custom aggregation
def custom_metric(x):
    """Return the sum of the values divided by the count of non-null values."""
    total = x.sum()
    n = x.count()
    return total / n

# Per-column aggregations; values may be built-in names or custom callables
df.groupby('category').agg({
    'value1': 'mean',
    'value2': 'sum',
    'value3': custom_metric
})

Time Series Analysis

# Set datetime index (required for resample() and date-string slicing below)
df.set_index('date', inplace=True)

# Resample data to a fixed frequency
daily = df.resample('D').mean()
monthly = df.resample('M').sum()  # NOTE(review): newer pandas prefers 'ME' over 'M'

# Rolling statistics over a 7-observation window (first 6 rows are NaN)
df['rolling_mean'] = df['value'].rolling(window=7).mean()
df['rolling_std'] = df['value'].rolling(window=7).std()

# Time-based indexing
df['2024-01':'2024-03']  # Select date range (both endpoints inclusive)
df.loc['2024-01-01']     # Select specific date

Advanced Analysis

# Pivot table: category1 rows x category2 columns, mean of 'value' per cell
pivot = pd.pivot_table(
    df,
    values='value',
    index='category1',
    columns='category2',
    aggfunc='mean',
    fill_value=0  # replace missing row/column combinations with 0
)

# Cross-tabulation: frequency counts of category pairs
pd.crosstab(df['category1'], df['category2'])

# Apply custom functions
def categorize(x):
    """Classify a number by sign: 'negative', 'zero', or 'positive'."""
    if x > 0:
        return 'positive'
    if x < 0:
        return 'negative'
    return 'zero'

df['category'] = df['value'].apply(categorize)

Project: Sales Data Analysis

Let's build a complete sales analysis system:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

class SalesAnalyzer:
    """Analyze sales transactions loaded from a CSV file.

    Expected columns in the CSV: date, customer_id, product_id, quantity,
    unit_price, cost_price, total_amount.
    """

    def __init__(self, data_file):
        """Load the CSV at *data_file* and preprocess it immediately."""
        self.df = pd.read_csv(data_file)
        self.preprocess_data()

    def preprocess_data(self):
        """Clean the data in place: parse dates, coerce numerics, add profit columns."""
        # Parse dates and move them to the index so resample() works.
        self.df['date'] = pd.to_datetime(self.df['date'])
        self.df.set_index('date', inplace=True)

        # Coerce numeric columns; unparseable values become NaN instead of raising.
        numeric_columns = ['quantity', 'unit_price', 'total_amount']
        for col in numeric_columns:
            self.df[col] = pd.to_numeric(self.df[col], errors='coerce')

        # Derived metrics: per-row profit and profit margin (% of revenue).
        self.df['profit'] = self.df['total_amount'] - (self.df['quantity'] * self.df['cost_price'])
        self.df['profit_margin'] = self.df['profit'] / self.df['total_amount'] * 100

    def get_daily_sales(self):
        """Return daily totals of quantity, revenue and profit."""
        return self.df.resample('D').agg({
            'quantity': 'sum',
            'total_amount': 'sum',
            'profit': 'sum'
        })

    def get_monthly_sales(self):
        """Return monthly totals plus the mean profit margin."""
        # NOTE(review): newer pandas prefers 'ME'; 'M' kept for compatibility.
        return self.df.resample('M').agg({
            'quantity': 'sum',
            'total_amount': 'sum',
            'profit': 'sum',
            'profit_margin': 'mean'
        })

    def get_product_performance(self):
        """Aggregate per-product metrics, sorted by total profit (descending)."""
        return self.df.groupby('product_id').agg({
            'quantity': 'sum',
            'total_amount': 'sum',
            'profit': 'sum',
            'profit_margin': 'mean'
        }).sort_values('profit', ascending=False)

    def get_customer_segments(self):
        """Segment customers with an RFM (recency/frequency/monetary) analysis.

        Returns a DataFrame indexed by customer_id with R/F/M quartile scores
        and a 'Customer_Segment' label. Needs at least 4 customers (quartiles).
        """
        customer_metrics = self.df.groupby('customer_id').agg({
            'total_amount': 'sum',
            'quantity': 'sum',
            'profit': 'sum'
        })

        current_date = self.df.index.max()

        # Recency: days since each customer's last purchase.
        # BUG FIX: after preprocess_data() the dates live in the index, not in
        # a 'date' column, so the original groupby(...)['date'] raised KeyError.
        last_purchase = self.df.reset_index().groupby('customer_id')['date'].max()
        customer_metrics['last_purchase'] = last_purchase
        customer_metrics['recency'] = (current_date - customer_metrics['last_purchase']).dt.days

        # Frequency: number of transactions per customer.
        customer_metrics['frequency'] = self.df.groupby('customer_id').size()

        # Monetary: total revenue per customer.
        customer_metrics['monetary'] = customer_metrics['total_amount']

        # Quartile scores. rank(method='first') breaks ties so qcut always
        # sees 4 distinct bin edges (plain qcut raises on heavily tied data).
        # Recency labels are inverted: the most recent quartile scores R=4.
        customer_metrics['R'] = pd.qcut(
            customer_metrics['recency'].rank(method='first'),
            q=4, labels=range(4, 0, -1))
        customer_metrics['F'] = pd.qcut(
            customer_metrics['frequency'].rank(method='first'),
            q=4, labels=range(1, 5))
        customer_metrics['M'] = pd.qcut(
            customer_metrics['monetary'].rank(method='first'),
            q=4, labels=range(1, 5))

        customer_metrics['RFM_Score'] = customer_metrics['R'].astype(str) + \
                                      customer_metrics['F'].astype(str) + \
                                      customer_metrics['M'].astype(str)

        def segment_customers(row):
            """Map one customer's RFM scores to a human-readable segment."""
            score = row['RFM_Score']
            if score in ['444', '434', '443', '433']:
                return 'Best Customers'
            # BUG FIX: R is inverted (4 = most recent), so "lost" customers
            # are the *least* recent quartile R==1 -- the original tested
            # R==4, which contradicted the 'Best Customers' check above.
            elif row['R'] == 1:
                return 'Lost Customers'
            elif row['F'] == 4 and row['M'] == 4:
                return 'Loyal Customers'
            elif row['M'] == 4:
                return 'Big Spenders'
            elif row['F'] == 4:
                return 'Frequent Customers'
            else:
                return 'Average Customers'

        customer_metrics['Customer_Segment'] = customer_metrics.apply(segment_customers, axis=1)

        return customer_metrics

    def get_sales_forecast(self, periods=30):
        """Return (historical moving averages, naive future forecast).

        The future forecast is flat: the last 30-day moving average value,
        repeated for *periods* future days. It is NaN when fewer than 30
        days of history exist.
        """
        daily_sales = self.get_daily_sales()

        # 7- and 30-day moving averages of revenue.
        ma7 = daily_sales['total_amount'].rolling(window=7).mean()
        ma30 = daily_sales['total_amount'].rolling(window=30).mean()

        forecast = pd.DataFrame({
            'MA7': ma7,
            'MA30': ma30
        })

        # Generate future dates starting the day after the last observation.
        last_date = daily_sales.index[-1]
        future_dates = pd.date_range(start=last_date + timedelta(days=1),
                                     periods=periods,
                                     freq='D')

        future_forecast = pd.DataFrame(index=future_dates)
        future_forecast['Forecast'] = ma30.iloc[-1]  # last 30-day MA, repeated

        return forecast, future_forecast

    def plot_sales_trend(self):
        """Plot daily sales with a 7-day moving-average overlay (blocks on show)."""
        daily_sales = self.get_daily_sales()

        plt.figure(figsize=(15, 6))
        plt.plot(daily_sales.index, daily_sales['total_amount'], label='Daily Sales')
        plt.plot(daily_sales.index, daily_sales['total_amount'].rolling(7).mean(),
                label='7-day Moving Average')
        plt.title('Daily Sales Trend')
        plt.xlabel('Date')
        plt.ylabel('Sales Amount')
        plt.legend()
        plt.grid(True)
        plt.show()

    def plot_product_performance(self):
        """Bar-plot the top 10 products by total profit (blocks on show)."""
        product_perf = self.get_product_performance().head(10)

        plt.figure(figsize=(12, 6))
        sns.barplot(x=product_perf.index, y='profit', data=product_perf)
        plt.title('Top 10 Products by Profit')
        plt.xlabel('Product ID')
        plt.ylabel('Profit')
        plt.xticks(rotation=45)
        plt.show()

    def generate_report(self):
        """Return a dict of headline numbers plus the detailed sub-reports."""
        report = {
            'total_sales': self.df['total_amount'].sum(),
            'total_profit': self.df['profit'].sum(),
            'average_profit_margin': self.df['profit_margin'].mean(),
            'total_customers': self.df['customer_id'].nunique(),
            'total_products': self.df['product_id'].nunique(),
            'daily_sales': self.get_daily_sales(),
            'monthly_sales': self.get_monthly_sales(),
            'product_performance': self.get_product_performance(),
            'customer_segments': self.get_customer_segments()
        }

        return report

# Example usage: build a synthetic dataset, save it, and run the analyzer
if __name__ == "__main__":
    # Create sample data (seeded so the demo is reproducible)
    np.random.seed(42)
    dates = pd.date_range(start='2024-01-01', end='2024-03-31', freq='D')
    n_records = 1000

    # cost_price is drawn independently of unit_price, so individual rows
    # can show a loss -- acceptable for demo data
    data = {
        'date': np.random.choice(dates, n_records),
        'customer_id': np.random.randint(1, 101, n_records),  # 100 customers
        'product_id': np.random.randint(1, 51, n_records),    # 50 products
        'quantity': np.random.randint(1, 11, n_records),
        'unit_price': np.random.uniform(10, 100, n_records),
        'cost_price': np.random.uniform(5, 50, n_records)
    }

    df = pd.DataFrame(data)
    df['total_amount'] = df['quantity'] * df['unit_price']

    # Save to CSV (SalesAnalyzer loads its input from a file)
    df.to_csv('sales_data.csv', index=False)

    # Create analyzer
    analyzer = SalesAnalyzer('sales_data.csv')

    # Generate and print report
    report = analyzer.generate_report()
    print("\nSales Analysis Report")
    print("===================")
    print(f"Total Sales: ${report['total_sales']:,.2f}")
    print(f"Total Profit: ${report['total_profit']:,.2f}")
    print(f"Average Profit Margin: {report['average_profit_margin']:.2f}%")
    print(f"Total Customers: {report['total_customers']}")
    print(f"Total Products: {report['total_products']}")

    # Plot visualizations (each call opens a window / renders inline)
    analyzer.plot_sales_trend()
    analyzer.plot_product_performance()

Data Visualization

Basic Plotting

# Line plot of a single column
df['value'].plot(kind='line')

# Bar plot of category frequencies
df['category'].value_counts().plot(kind='bar')

# Histogram with 50 bins
df['value'].hist(bins=50)

# Box plot of 'value' grouped by 'category'
df.boxplot(column='value', by='category')

# Scatter plot of one column against another
df.plot.scatter(x='value1', y='value2')

Advanced Visualization with Seaborn

# Distribution plot -- distplot was removed from modern seaborn;
# histplot (optionally with kde=True) is its replacement
sns.histplot(df['value'], kde=True)

# Box plot with a swarm overlay
sns.boxplot(x='category', y='value', data=df)
sns.swarmplot(x='category', y='value', data=df, color='0.25')

# Heatmap of the correlation matrix (numeric columns only, to avoid TypeError)
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')

# Pair plot, colored by category
sns.pairplot(df, hue='category')

# Joint plot with a regression fit
sns.jointplot(x='value1', y='value2', data=df, kind='reg')

Best Practices

  1. Memory Management
# Read large files in chunks to bound peak memory use
chunks = pd.read_csv('large_file.csv', chunksize=10000)
for chunk in chunks:
    process_chunk(chunk)  # user-defined per-chunk handler

# Use appropriate data types (optimize_dtypes is defined earlier in this guide)
df = optimize_dtypes(df)
  2. Performance Optimization
# Use vectorized operations
df['new_column'] = df['column'] * 2  # Good: one vectorized operation
df['new_column'] = df.apply(lambda x: x['column'] * 2, axis=1)  # Bad: Python call per row

# Use query for filtering
df.query('column > 5')  # can be faster on large frames (numexpr engine)
df[df['column'] > 5]    # boolean indexing; often faster on small frames
  3. Code Organization
class DataProcessor:
    """Skeleton for a load -> clean -> transform -> analyze workflow."""

    def __init__(self, df):
        """Store the DataFrame to be processed."""
        self.df = df

    def clean_data(self):
        """Placeholder for cleaning operations."""
        pass

    def transform_data(self):
        """Placeholder for transformation operations."""
        pass

    def analyze_data(self):
        """Placeholder for analysis operations."""
        pass

Common Patterns

  1. Data Pipeline
def process_data(file_path):
    """Run the full pipeline on *file_path*: load, clean, transform, analyze."""
    raw = pd.read_csv(file_path)       # load
    cleaned = clean_data(raw)          # clean
    transformed = transform_data(cleaned)  # transform
    return analyze_data(transformed)   # analyze and return results
  2. Custom Aggregations
def weighted_average(group):
    """Return the weighted mean of group['value'], weighted by group['weight']."""
    w = group['weight']
    return (w * group['value']).sum() / w.sum()

df.groupby('category').apply(weighted_average)

Conclusion

Pandas provides powerful tools for data analysis:

  • Efficient data manipulation
  • Flexible data structures
  • Rich analysis capabilities
  • Seamless integration with other libraries

Keep exploring Pandas' features to enhance your data analysis skills.

Further Reading