Python Data Analysis with Pandas: From Basics to Advanced Techniques

Data analysis is a crucial skill in today's data-driven world. Python's Pandas library has become the go-to tool for data scientists and analysts. This guide will take you through everything you need to know about data analysis with Pandas.

Getting Started with Pandas

First, let's set up our environment:

pip install pandas numpy matplotlib seaborn jupyter

Basic Data Structures

import pandas as pd
import numpy as np

# Creating a Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])

# Creating a DataFrame
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
    'B': [1, 2, 3, 4, 5, 6],
    'C': [2.0, 5., 8., 1., 2., 9.]
})
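
A quick inspection confirms what was created. Note that the np.nan in the Series forces a float64 dtype:

# Inspect the structures defined above
print(s.dtype)    # float64 -- np.nan promotes the integers to float
print(df.head())  # first five rows
df.info()         # column dtypes, non-null counts, memory usage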

Data Loading and Export

# Reading from CSV
df = pd.read_csv('data.csv')

# Reading from Excel
df = pd.read_excel('data.xlsx')

# Reading from JSON
df = pd.read_json('data.json')

# Reading from SQL database
from sqlalchemy import create_engine
engine = create_engine('sqlite:///database.db')
df = pd.read_sql('SELECT * FROM my_table', engine)  # 'table' is a reserved SQL word

# Exporting data
df.to_csv('output.csv', index=False)
df.to_excel('output.xlsx')
df.to_json('output.json')

Data Cleaning and Preprocessing

Handling Missing Values

# Check for missing values
df.isnull().sum()

# Fill missing values
df.fillna(0)   # Fill with zero
df.ffill()     # Forward fill (fillna(method='ffill') is deprecated)
df.bfill()     # Backward fill

# Drop missing values
df.dropna()  # Drop rows with any missing values
df.dropna(how='all')  # Drop rows with all missing values

Data Type Conversion

# Convert data types
df['column'] = df['column'].astype('int64')
df['date'] = pd.to_datetime(df['date'])

# Handle categorical data
df['category'] = df['category'].astype('category')

Removing Duplicates

# Check for duplicates
df.duplicated().sum()

# Remove duplicates
df.drop_duplicates()
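
By default, drop_duplicates() compares entire rows. To deduplicate on specific columns only, use the subset and keep parameters (the column names here are illustrative):

# Keep the most recent record per (customer_id, date) pair
df.drop_duplicates(subset=['customer_id', 'date'], keep='last')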

Data Analysis Techniques

Basic Statistics

# Summary statistics
df.describe()

# Correlation analysis (numeric_only skips non-numeric columns)
df.corr(numeric_only=True)

# Group by operations
df.groupby('category').agg({
    'value': ['mean', 'sum', 'count'],
    'other_value': ['min', 'max']
})
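
The dict-of-lists form above produces a MultiIndex on the result's columns. Named aggregation does the same computation with flat, explicit column names (the output names below are just one choice):

df.groupby('category').agg(
    value_mean=('value', 'mean'),
    value_sum=('value', 'sum'),
    value_count=('value', 'count'),
    other_min=('other_value', 'min'),
    other_max=('other_value', 'max')
)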

Time Series Analysis

# Set datetime index
df.set_index('date', inplace=True)

# Resample data ('M' was renamed to 'ME' in pandas 2.2)
daily = df.resample('D').mean()
monthly = df.resample('ME').sum()

# Rolling statistics
df['rolling_mean'] = df['value'].rolling(window=7).mean()
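
A self-contained sketch of the same pattern, using synthetic data so it runs on its own (assumes pandas >= 2.2 for the 'ME' alias; on older versions use 'M'):

rng = pd.date_range('2024-01-01', periods=90, freq='D')
ts = pd.DataFrame({'value': np.random.default_rng(0).normal(100, 10, 90)},
                  index=rng)

monthly_totals = ts.resample('ME').sum()
ts['rolling_mean'] = ts['value'].rolling(window=7).mean()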

Advanced Analysis

# Pivot tables
pivot = pd.pivot_table(
    df,
    values='value',
    index='category',
    columns='subcategory',
    aggfunc='sum'
)

# Cross-tabulation
pd.crosstab(df['category'], df['subcategory'])

# Merge and join operations
merged = pd.merge(df1, df2, on='key', how='left')
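
df1 and df2 above are placeholders. A minimal, self-contained version of the left join:

df1 = pd.DataFrame({'key': ['a', 'b', 'c'], 'left_val': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'right_val': [10, 20, 40]})

# how='left' keeps every row of df1; the unmatched key 'c' gets NaN
merged = pd.merge(df1, df2, on='key', how='left')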

Data Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Basic plotting
df['value'].plot(kind='line')
df['category'].value_counts().plot(kind='bar')
df.plot.scatter(x='x', y='y')

# Seaborn visualizations
sns.heatmap(df.corr(numeric_only=True), annot=True)
sns.boxplot(x='category', y='value', data=df)
sns.pairplot(df)

plt.show()

Project: Sales Data Analysis

Let's analyze a real-world sales dataset:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

class SalesAnalyzer:
    def __init__(self, data_file):
        self.df = pd.read_csv(data_file)
        self.preprocess_data()
    
    def preprocess_data(self):
        # Convert date column
        self.df['date'] = pd.to_datetime(self.df['date'])
        
        # Handle missing values; assign back instead of inplace=True,
        # which triggers chained-assignment warnings on column slices
        self.df['quantity'] = self.df['quantity'].fillna(0)
        self.df['price'] = self.df['price'].fillna(self.df['price'].mean())
        
        # Calculate total sales
        self.df['total_sales'] = self.df['quantity'] * self.df['price']
    
    def monthly_sales_trend(self):
        """Analyze monthly sales trends"""
        # 'ME' = month-end frequency (pandas >= 2.2; use 'M' on older versions)
        monthly = self.df.set_index('date').resample('ME')['total_sales'].sum()
        
        plt.figure(figsize=(12, 6))
        monthly.plot(kind='line', marker='o')
        plt.title('Monthly Sales Trend')
        plt.xlabel('Month')
        plt.ylabel('Total Sales')
        plt.grid(True)
        return monthly
    
    def top_products(self, n=10):
        """Get top selling products"""
        return self.df.groupby('product')['total_sales'].sum().nlargest(n)
    
    def sales_by_category(self):
        """Analyze sales by category"""
        category_sales = self.df.groupby('category')['total_sales'].sum()
        
        plt.figure(figsize=(10, 6))
        category_sales.plot(kind='pie', autopct='%1.1f%%')
        plt.title('Sales Distribution by Category')
        return category_sales
    
    def customer_segmentation(self):
        """Segment customers based on purchase behavior"""
        # Flat column names (sum/mean/count) make the later
        # 'segment' assignment straightforward
        customer_stats = self.df.groupby('customer_id')['total_sales'].agg(
            ['sum', 'mean', 'count']
        )
        
        # K-means clustering
        from sklearn.cluster import KMeans
        from sklearn.preprocessing import StandardScaler
        
        scaler = StandardScaler()
        scaled_stats = scaler.fit_transform(customer_stats)
        
        kmeans = KMeans(n_clusters=3, random_state=42)
        customer_stats['segment'] = kmeans.fit_predict(scaled_stats)
        
        return customer_stats
    
    def generate_report(self):
        """Generate comprehensive sales report"""
        report = {
            'total_revenue': self.df['total_sales'].sum(),
            'average_order_value': self.df.groupby('order_id')['total_sales'].sum().mean(),
            'total_orders': self.df['order_id'].nunique(),
            'total_customers': self.df['customer_id'].nunique(),
            'top_products': self.top_products(),
            'monthly_trend': self.monthly_sales_trend(),
            'category_distribution': self.sales_by_category(),
            'customer_segments': self.customer_segmentation()
        }
        return report

# Example usage
if __name__ == '__main__':
    analyzer = SalesAnalyzer('sales_data.csv')
    report = analyzer.generate_report()
    
    print(f"Total Revenue: ${report['total_revenue']:,.2f}")
    print(f"Average Order Value: ${report['average_order_value']:,.2f}")
    print(f"Total Orders: {report['total_orders']}")
    print(f"Total Customers: {report['total_customers']}")
    
    # Display visualizations
    plt.show()

Best Practices

  1. Data Quality

    • Always check for missing values
    • Validate data types
    • Handle outliers appropriately
    • Document data cleaning steps
  2. Performance

    • Use appropriate data types
    • Avoid loops when possible
    • Use vectorized operations
    • Consider chunking for large datasets (see the sketch after this list)
  3. Code Organization

    • Create reusable functions
    • Use classes for complex analysis
    • Maintain clear documentation
    • Follow PEP 8 style guide
  4. Memory Management

    • Use chunks for large files
    • Delete unnecessary objects
    • Use appropriate data types
    • Consider using dask for big data
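
To make the performance and memory points concrete, here is a minimal sketch of chunked, dtype-aware CSV processing (the file name and column names are hypothetical):

import pandas as pd

# Hypothetical schema -- adjust to your data
dtypes = {'category': 'category', 'value': 'float32'}

total = 0.0
for chunk in pd.read_csv('large_file.csv', dtype=dtypes, chunksize=100_000):
    # Vectorized sum on each chunk -- no Python-level row loop
    total += chunk['value'].sum()

print(f"Grand total: {total:,.2f}")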

Advanced Topics

  1. Custom Aggregations

def custom_agg(x):
    return pd.Series({
        'mean': x.mean(),
        'std': x.std(),
        'q75': x.quantile(0.75)
    })

# apply + unstack spreads the returned Series into one column per statistic
df.groupby('category')['value'].apply(custom_agg).unstack()
  2. Window Functions

df['moving_avg'] = df.groupby('category')['value'].transform(
    lambda x: x.rolling(window=3).mean()
)
  3. Complex Transformations

def complex_transform(group):
    return group.assign(
        pct_of_total=lambda x: x['value'] / x['value'].sum() * 100,
        cumulative=lambda x: x['value'].cumsum()
    )

# group_keys=False keeps the result aligned with the original index
df.groupby('category', group_keys=False).apply(complex_transform)

Conclusion

Pandas is an incredibly powerful tool for data analysis that can help you:

  • Clean and preprocess data efficiently
  • Perform complex analyses with ease
  • Create insightful visualizations
  • Generate comprehensive reports

Keep practicing with different datasets and exploring new features to become proficient in data analysis with Python.

