Web Scraping with BeautifulSoup: A Complete Guide
Web scraping is a powerful technique for extracting data from websites. In this guide, we'll explore how to use BeautifulSoup to scrape web data effectively.
Getting Started with BeautifulSoup
First, let's set up our environment and import the necessary libraries:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
from typing import List, Dict, Optional
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
Basic Web Scraping
Making HTTP Requests
def fetch_page(url: str) -> Optional[str]:
    """Fetch webpage content with error handling."""
    try:
        # Add headers to mimic a browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # A timeout prevents the request from hanging indefinitely
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

# Example usage
url = 'https://example.com'
html_content = fetch_page(url)
if html_content:
    soup = BeautifulSoup(html_content, 'html.parser')
Parsing HTML
# Find elements by tag
paragraphs = soup.find_all('p')
links = soup.find_all('a')
# Find elements by class
elements = soup.find_all(class_='my-class')
# Find elements by ID
element = soup.find(id='my-id')
# Find elements by CSS selector
elements = soup.select('.class-name')
elements = soup.select('#id-name')
elements = soup.select('div.class-name')
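The select() method accepts standard CSS selector syntax, so attribute filters and combinators work as well. A short sketch (the class names and URL prefix below are illustrative, not tied to any particular site):
# Attribute and combinator selectors also work with select()
external_links = soup.select('a[href^="https://"]')  # links whose href starts with https://
direct_children = soup.select('div.content > p')     # <p> elements directly inside div.content
first_item = soup.select_one('ul.menu li')           # select_one returns the first match or None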
Advanced Scraping Techniques
Navigation and Searching
# Navigate through elements
parent = element.parent
children = element.children          # iterator over direct children
siblings = element.next_siblings     # iterator over following siblings
# Find nested elements (each find() returns None on a miss,
# so chaining like this raises AttributeError if any step fails)
nested = soup.find('div').find('p').find('span')
# Search with multiple conditions
elements = soup.find_all(['p', 'div'], class_='content')
# Regular expression search (string= replaces the older text= argument)
import re
matches = soup.find_all(string=re.compile(r'pattern'))
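Note that a string= search returns the matching text nodes (NavigableString objects) rather than their tags; to reach the enclosing element, step up with .parent:
# Each match is a NavigableString; .parent gives the tag that contains it
for match in matches:
    tag = match.parent
    print(tag.name, tag.get_text(strip=True))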
Extracting Data
# Get text content
text = element.text.strip()
# Get attributes (.get() returns None if the attribute is missing;
# square-bracket access raises KeyError instead)
href = element.get('href')
src = element['src']
# Get all text recursively
all_text = soup.get_text(separator=' ', strip=True)
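Putting these pieces together, here is a minimal sketch that collects the text and target of every link on the page into a list of dictionaries, assuming the soup object created in the earlier fetch_page example:
# Collect text and href for every anchor that has an href attribute
links_data = []
for a in soup.find_all('a', href=True):
    links_data.append({
        'text': a.get_text(strip=True),
        'href': a['href']
    })
print(f"Found {len(links_data)} links")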
Project: E-commerce Product Scraper
Let's build a complete scraper for e-commerce products:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
from typing import List, Dict, Optional
from datetime import datetime
from urllib.parse import urljoin
import json
import os
class ProductScraper:
    def __init__(self, base_url: str, output_dir: str = 'data'):
        """Initialize the ProductScraper."""
        self.base_url = base_url
        self.output_dir = output_dir
        self.session = requests.Session()
        self.products: List[Dict] = []
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        # Configure logging
        self._setup_logging()
    def _setup_logging(self):
        """Set up logging configuration."""
        log_file = os.path.join(self.output_dir, 'scraper.log')
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
    def _make_request(self, url: str) -> Optional[str]:
        """Make HTTP request with retry mechanism."""
        max_retries = 3
        retry_delay = 1
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                # Add delay between requests
                time.sleep(random.uniform(1, 3))
                return response.text
            except requests.RequestException as e:
                self.logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay * (attempt + 1))
                    continue
        return None
    def _parse_product_page(self, url: str, html: str) -> Optional[Dict]:
        """Parse product details from a product page."""
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Extract product details (customize based on website structure)
            product = {
                'url': url,
                'name': self._extract_text(soup, '.product-name'),
                'price': self._extract_price(soup, '.product-price'),
                'description': self._extract_text(soup, '.product-description'),
                'specifications': self._extract_specifications(soup),
                'images': self._extract_images(soup),
                'scraped_at': datetime.now().isoformat()
            }
            return product
        except Exception as e:
            self.logger.error(f"Error parsing product page {url}: {e}")
            return None
    def _extract_text(self, soup: BeautifulSoup, selector: str) -> str:
        """Extract text from an element."""
        element = soup.select_one(selector)
        return element.text.strip() if element else ''
    def _extract_price(self, soup: BeautifulSoup, selector: str) -> Optional[float]:
        """Extract and parse price."""
        price_text = self._extract_text(soup, selector)
        # Keep digits and the decimal point so "$1,299.99" parses as 1299.99
        # (filtering to digits alone would turn it into 129999)
        cleaned = ''.join(c for c in price_text if c.isdigit() or c == '.')
        try:
            return float(cleaned)
        except ValueError:
            return None
    def _extract_specifications(self, soup: BeautifulSoup) -> Dict:
        """Extract product specifications."""
        specs = {}
        # Customize based on website structure
        spec_table = soup.select_one('.specifications-table')
        if spec_table:
            for row in spec_table.select('tr'):
                cols = row.select('td')
                if len(cols) >= 2:
                    key = cols[0].text.strip()
                    value = cols[1].text.strip()
                    specs[key] = value
        return specs
    def _extract_images(self, soup: BeautifulSoup) -> List[str]:
        """Extract product images."""
        images = []
        for img in soup.select('.product-images img'):
            src = img.get('src') or img.get('data-src')
            if src:
                images.append(urljoin(self.base_url, src))
        return images
    def scrape_product_links(self, category_url: str) -> List[str]:
        """Scrape product links from category page."""
        links = []
        html = self._make_request(category_url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # Customize selector based on website structure
            for link in soup.select('.product-link'):
                href = link.get('href')
                if href:
                    full_url = urljoin(self.base_url, href)
                    links.append(full_url)
        return links
    def scrape_products(self, category_urls: List[str]):
        """Scrape products from multiple categories."""
        for category_url in category_urls:
            self.logger.info(f"Scraping category: {category_url}")
            # Get product links
            product_links = self.scrape_product_links(category_url)
            self.logger.info(f"Found {len(product_links)} products")
            # Scrape each product
            for link in product_links:
                self.logger.info(f"Scraping product: {link}")
                html = self._make_request(link)
                if html:
                    product = self._parse_product_page(link, html)
                    if product:
                        self.products.append(product)
    def save_results(self):
        """Save scraped data to files."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Save to CSV
        csv_file = os.path.join(self.output_dir, f'products_{timestamp}.csv')
        df = pd.DataFrame(self.products)
        df.to_csv(csv_file, index=False)
        self.logger.info(f"Saved results to {csv_file}")
        # Save to JSON
        json_file = os.path.join(self.output_dir, f'products_{timestamp}.json')
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.products, f, indent=2, ensure_ascii=False)
        self.logger.info(f"Saved results to {json_file}")
    def generate_report(self) -> Dict:
        """Generate scraping report."""
        # Average only over products that actually have a price,
        # and guard against an empty result set
        prices = [p['price'] for p in self.products if p.get('price')]
        return {
            'total_products': len(self.products),
            # 'category' is only counted if the product dicts include that field
            'categories_scraped': len(set(p['category'] for p in self.products if 'category' in p)),
            'average_price': sum(prices) / len(prices) if prices else None,
            'scraped_at': datetime.now().isoformat()
        }
# Example usage
if __name__ == "__main__":
    # Initialize scraper
    scraper = ProductScraper(
        base_url='https://example.com',
        output_dir='product_data'
    )
    # Define categories to scrape
    categories = [
        'https://example.com/category1',
        'https://example.com/category2'
    ]
    try:
        # Start scraping
        scraper.scrape_products(categories)
        # Save results
        scraper.save_results()
        # Generate and print report
        report = scraper.generate_report()
        print("\nScraping Report")
        print("===============")
        for key, value in report.items():
            print(f"{key}: {value}")
    except Exception as e:
        scraper.logger.error(f"Scraping failed: {e}")
Best Practices
- Respect robots.txt
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin

def can_fetch(url: str) -> bool:
    """Check the site's robots.txt before requesting a URL."""
    rp = RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return rp.can_fetch('*', url)
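A quick usage sketch that gates the earlier fetch_page helper behind the robots.txt check (the URL is illustrative):
target_url = 'https://example.com/some-page'
if can_fetch(target_url):
    html = fetch_page(target_url)
else:
    logger.warning(f"robots.txt disallows fetching {target_url}")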
- Rate Limiting
from functools import wraps

def rate_limit(delay: float):
    """Decorator that enforces a minimum delay between calls."""
    def decorator(func):
        last_called = [0.0]
        @wraps(func)
        def wrapper(*args, **kwargs):
            elapsed = time.time() - last_called[0]
            if elapsed < delay:
                time.sleep(delay - elapsed)
            result = func(*args, **kwargs)
            last_called[0] = time.time()
            return result
        return wrapper
    return decorator
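Applied to a fetch function, the decorator enforces at least delay seconds between consecutive calls. A short sketch reusing the earlier fetch_page helper (the URLs are illustrative):
@rate_limit(2.0)  # at least 2 seconds between requests
def polite_fetch(url: str) -> Optional[str]:
    return fetch_page(url)

polite_fetch('https://example.com/page1')
polite_fetch('https://example.com/page2')  # sleeps if called too soon after the first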
- Error Handling
def safe_request(url: str) -> Optional[str]:
    """Fetch a URL, logging each failure mode separately."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.Timeout:
        logger.error("Request timed out")
    except requests.HTTPError as e:
        logger.error(f"HTTP error: {e}")
    except requests.RequestException as e:
        logger.error(f"Request failed: {e}")
    return None
Common Patterns
- Pagination Handling
def scrape_paginated_content(base_url: str, max_pages: int = 10):
    """Walk numbered pages until one is empty or fails to load."""
    results = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        content = fetch_page(url)
        if not content:
            break
        # parse_page() is site-specific; see the sketch below
        page_results = parse_page(content)
        if not page_results:
            break
        results.extend(page_results)
    return results
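The helper above assumes a site-specific parse_page function. A minimal sketch, assuming each listing is a hypothetical <div class="item"> containing a link, might look like this:
def parse_page(html: str) -> List[Dict]:
    """Parse one listing page into item dicts (the selectors are illustrative)."""
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    for div in soup.select('div.item'):
        link = div.find('a')
        items.append({
            'title': div.get_text(strip=True),
            'href': link.get('href') if link else None
        })
    return items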
- Data Cleaning
import re

def clean_text(text: str) -> str:
    """Clean scraped text."""
    # Remove extra whitespace
    text = ' '.join(text.split())
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()
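A quick usage example; note that this aggressive cleaning also strips decimal points and other meaningful punctuation, so apply it only to free-text fields:
raw = '  Price:   $19.99!  '
print(clean_text(raw))  # -> 'Price 1999'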
Conclusion
BeautifulSoup provides powerful tools for web scraping:
- Easy-to-use API
- Robust parsing capabilities
- Good documentation
- Active community support
Keep exploring BeautifulSoup's features to build better web scrapers.