Web Scraping with BeautifulSoup: A Complete Guide

Web scraping is a powerful technique for extracting data from websites. In this guide, we'll explore how to use BeautifulSoup to scrape web data effectively.

Getting Started with BeautifulSoup

First, install the required packages if you haven't already (pip install requests beautifulsoup4 pandas), then import the libraries we'll use throughout this guide:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
from typing import List, Dict, Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

Basic Web Scraping

Making HTTP Requests

def fetch_page(url: str) -> Optional[str]:
    """Fetch webpage content with error handling."""
    try:
        # Add headers to mimic browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

# Example usage
url = 'https://example.com'
html_content = fetch_page(url)
if html_content:
    soup = BeautifulSoup(html_content, 'html.parser')

Parsing HTML

# Find elements by tag
paragraphs = soup.find_all('p')
links = soup.find_all('a')

# Find elements by class
elements = soup.find_all(class_='my-class')

# Find elements by ID
element = soup.find(id='my-id')

# Find elements by CSS selector
elements = soup.select('.class-name')
elements = soup.select('#id-name')
elements = soup.select('div.class-name')
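
Putting these pieces together, here is a minimal sketch that reuses fetch_page from above to pull the text and target of every link on a page (https://example.com is just a placeholder URL):

# Reuses fetch_page() and the imports from the snippets above
page_url = 'https://example.com'  # placeholder URL
html = fetch_page(page_url)

if html:
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        text = link.get_text(strip=True)
        href = link.get('href')  # None if the attribute is missing
        if href:
            print(f"{text}: {href}")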

Advanced Scraping Techniques

Navigation and Searching

# Navigate through elements
parent = element.parent
children = element.children
siblings = element.next_siblings

# Find nested elements (each find() returns None when nothing matches,
# so a chain like this raises AttributeError if any step misses)
nested = soup.find('div').find('p').find('span')

# Search with multiple conditions
elements = soup.find_all(['p', 'div'], class_='content')

# Regular expression search (the old text= argument still works but is
# deprecated; string= is the current name)
import re
elements = soup.find_all(string=re.compile(r'pattern'))

Extracting Data

# Get text content
text = element.text.strip()

# Get attributes: .get() returns None if the attribute is missing,
# while bracket access raises a KeyError
href = element.get('href')
src = element['src']

# Get all text recursively
all_text = soup.get_text(separator=' ', strip=True)
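
Because find() and select_one() return None when nothing matches, it helps to wrap extraction in small helpers instead of chaining calls directly. A minimal sketch; the helper names and CSS selectors below are hypothetical placeholders:

def get_text_or_default(soup: BeautifulSoup, selector: str, default: str = '') -> str:
    """Return stripped text for the first match of selector, or a default."""
    element = soup.select_one(selector)
    return element.get_text(strip=True) if element else default

def get_attr_or_none(soup: BeautifulSoup, selector: str, attr: str) -> Optional[str]:
    """Return an attribute value for the first match, or None."""
    element = soup.select_one(selector)
    return element.get(attr) if element else None

# Hypothetical selectors -- adjust them to the page you are scraping
title = get_text_or_default(soup, 'h1.title')
image_url = get_attr_or_none(soup, 'img.hero', 'src')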

Project: E-commerce Product Scraper

Let's build a complete scraper for e-commerce products:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
from typing import List, Dict, Optional
from datetime import datetime
from urllib.parse import urljoin
import json
import os
import re

class ProductScraper:
    def __init__(self, base_url: str, output_dir: str = 'data'):
        """Initialize the ProductScraper."""
        self.base_url = base_url
        self.output_dir = output_dir
        self.session = requests.Session()
        self.products: List[Dict] = []
        
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        
        # Configure logging
        self._setup_logging()
    
    def _setup_logging(self):
        """Set up logging configuration."""
        log_file = os.path.join(self.output_dir, 'scraper.log')
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
    
    def _make_request(self, url: str) -> Optional[str]:
        """Make HTTP request with retry mechanism."""
        max_retries = 3
        retry_delay = 1
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }
        
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                
                # Add delay between requests
                time.sleep(random.uniform(1, 3))
                
                return response.text
            except requests.RequestException as e:
                self.logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay * (attempt + 1))
                continue
        
        return None
    
    def _parse_product_page(self, url: str, html: str) -> Optional[Dict]:
        """Parse product details from a product page."""
        try:
            soup = BeautifulSoup(html, 'html.parser')
            
            # Extract product details (customize based on website structure)
            product = {
                'url': url,
                'name': self._extract_text(soup, '.product-name'),
                'price': self._extract_price(soup, '.product-price'),
                'description': self._extract_text(soup, '.product-description'),
                'specifications': self._extract_specifications(soup),
                'images': self._extract_images(soup),
                'scraped_at': datetime.now().isoformat()
            }
            
            return product
        except Exception as e:
            self.logger.error(f"Error parsing product page {url}: {e}")
            return None
    
    def _extract_text(self, soup: BeautifulSoup, selector: str) -> str:
        """Extract text from an element."""
        element = soup.select_one(selector)
        return element.text.strip() if element else ''
    
    def _extract_price(self, soup: BeautifulSoup, selector: str) -> Optional[float]:
        """Extract and parse price."""
        price_text = self._extract_text(soup, selector)
        try:
            # Strip currency symbols and thousands separators but keep the
            # decimal point (e.g. "$1,299.99" -> 1299.99)
            cleaned = re.sub(r'[^\d.]', '', price_text)
            return float(cleaned) if cleaned else None
        except ValueError:
            return None
    
    def _extract_specifications(self, soup: BeautifulSoup) -> Dict:
        """Extract product specifications."""
        specs = {}
        # Customize based on website structure
        spec_table = soup.select_one('.specifications-table')
        if spec_table:
            for row in spec_table.select('tr'):
                cols = row.select('td')
                if len(cols) >= 2:
                    key = cols[0].text.strip()
                    value = cols[1].text.strip()
                    specs[key] = value
        return specs
    
    def _extract_images(self, soup: BeautifulSoup) -> List[str]:
        """Extract product images."""
        images = []
        for img in soup.select('.product-images img'):
            src = img.get('src') or img.get('data-src')
            if src:
                images.append(urljoin(self.base_url, src))
        return images
    
    def scrape_product_links(self, category_url: str) -> List[str]:
        """Scrape product links from category page."""
        links = []
        html = self._make_request(category_url)
        
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # Customize selector based on website structure
            for link in soup.select('.product-link'):
                href = link.get('href')
                if href:
                    full_url = urljoin(self.base_url, href)
                    links.append(full_url)
        
        return links
    
    def scrape_products(self, category_urls: List[str]):
        """Scrape products from multiple categories."""
        for category_url in category_urls:
            self.logger.info(f"Scraping category: {category_url}")
            
            # Get product links
            product_links = self.scrape_product_links(category_url)
            self.logger.info(f"Found {len(product_links)} products")
            
            # Scrape each product
            for link in product_links:
                self.logger.info(f"Scraping product: {link}")
                html = self._make_request(link)
                
                if html:
                    product = self._parse_product_page(link, html)
                    if product:
                        # Record which category page the product came from
                        product['category'] = category_url
                        self.products.append(product)
    
    def save_results(self):
        """Save scraped data to files."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Save to CSV
        csv_file = os.path.join(self.output_dir, f'products_{timestamp}.csv')
        df = pd.DataFrame(self.products)
        df.to_csv(csv_file, index=False)
        self.logger.info(f"Saved results to {csv_file}")
        
        # Save to JSON
        json_file = os.path.join(self.output_dir, f'products_{timestamp}.json')
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.products, f, indent=2, ensure_ascii=False)
        self.logger.info(f"Saved results to {json_file}")
    
    def generate_report(self) -> Dict:
        """Generate scraping report."""
        priced = [p['price'] for p in self.products if p.get('price')]
        return {
            'total_products': len(self.products),
            'categories_scraped': len(set(p.get('category') for p in self.products)),
            'average_price': sum(priced) / len(priced) if priced else None,
            'scraped_at': datetime.now().isoformat()
        }

# Example usage
if __name__ == "__main__":
    # Initialize scraper
    scraper = ProductScraper(
        base_url='https://example.com',
        output_dir='product_data'
    )
    
    # Define categories to scrape
    categories = [
        'https://example.com/category1',
        'https://example.com/category2'
    ]
    
    try:
        # Start scraping
        scraper.scrape_products(categories)
        
        # Save results
        scraper.save_results()
        
        # Generate and print report
        report = scraper.generate_report()
        print("\nScraping Report")
        print("===============")
        for key, value in report.items():
            print(f"{key}: {value}")
        
    except Exception as e:
        scraper.logger.error(f"Scraping failed: {e}")

Best Practices

  1. Respect robots.txt
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin

def can_fetch(url: str) -> bool:
    rp = RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return rp.can_fetch('*', url)
  2. Rate Limiting
def rate_limit(delay: float):
    """Decorator for rate limiting."""
    def decorator(func):
        last_called = [0.0]
        def wrapper(*args, **kwargs):
            elapsed = time.time() - last_called[0]
            if elapsed < delay:
                time.sleep(delay - elapsed)
            result = func(*args, **kwargs)
            last_called[0] = time.time()
            return result
        return wrapper
    return decorator
  3. Error Handling
def safe_request(url: str) -> Optional[str]:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.Timeout:
        logger.error("Request timed out")
    except requests.HTTPError as e:
        logger.error(f"HTTP error: {e}")
    except requests.RequestException as e:
        logger.error(f"Request failed: {e}")
    return None
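
The three practices above combine naturally: check robots.txt first, then issue rate-limited, error-handled requests. A minimal sketch (polite_fetch is a hypothetical helper and the 2-second delay is an arbitrary example value):

@rate_limit(delay=2.0)
def polite_fetch(url: str) -> Optional[str]:
    """Fetch a page only if robots.txt allows it, at most once every 2 seconds."""
    if not can_fetch(url):
        logger.warning(f"robots.txt disallows fetching {url}")
        return None
    return safe_request(url)

html = polite_fetch('https://example.com/some-page')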

Common Patterns

  1. Pagination Handling
def scrape_paginated_content(base_url: str, max_pages: int = 10):
    """Walk numbered pages until one fails to load or comes back empty.

    fetch_page() is defined earlier; parse_page() stands in for a
    site-specific parser (a sketch follows this list).
    """
    results = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        content = fetch_page(url)
        if not content:
            break
        page_results = parse_page(content)
        if not page_results:
            break
        results.extend(page_results)
    return results
  2. Data Cleaning
def clean_text(text: str) -> str:
    """Clean scraped text."""
    import re
    # Remove extra whitespace
    text = ' '.join(text.split())
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()
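
For completeness, here is a hypothetical parse_page for the pagination example above. The .item and .item-title selectors are placeholders; a real page will need its own selectors:

def parse_page(html: str) -> List[Dict]:
    """Parse one listing page into a list of records (placeholder selectors)."""
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    for card in soup.select('.item'):  # hypothetical selector
        title = card.select_one('.item-title')  # hypothetical selector
        link = card.select_one('a')
        items.append({
            'title': title.get_text(strip=True) if title else '',
            'url': link.get('href') if link else None,
        })
    return items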

Conclusion

BeautifulSoup provides powerful tools for web scraping:

  • Easy-to-use API
  • Robust parsing capabilities
  • Good documentation
  • Active community support

Keep exploring BeautifulSoup's features to build better web scrapers.
