Build a RAG-Ready Web Scraper with Python

Use this Python script to build an automated web scraping pipeline. Crawl websites, extract clean text from static and dynamic pages, and prepare the content for RAG applications. Includes rate limiting and configurable depth.

Intermediate · 30 min · 5 steps

The play

Install Dependencies
The Web Scraping Pipeline requires libraries for web requests, HTML parsing, and rendering JavaScript. Install `requests`, `beautifulsoup4`, and `playwright` using pip (`pip install requests beautifulsoup4 playwright`). Playwright also requires browser binaries; download them with `playwright install chromium`.
Run a Basic Static Scrape
Execute a single-page scrape on a static website. Save the starter code as `scraper.py`. The script will fetch the URL, extract the main text content, and save it to a file in the `output/` directory.
Crawl Multiple Pages
Use the `--depth` argument to make the Web Scraping Pipeline follow links on the starting page. It will crawl up to the specified depth, creating a text file for each unique page found within the same domain.
Scrape Dynamic JavaScript Content
For sites that render content with JavaScript (e.g., React, Vue), use the `--js` flag. This tells the script to use a headless browser via Playwright to fully render the page before extracting content.
Set Rate Limiting
Be a considerate web citizen by adding a delay between requests with the `--delay` flag. This prevents overwhelming the server and reduces the risk of being blocked. The value is in seconds.

Starter code

import argparse
import hashlib
import os
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

def get_static_content(url):
    """Fetch a page with a plain HTTP GET and return its HTML as text.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection error, 10-second timeout, or an HTTP error status).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # If the server did not declare a charset, requests falls back to
    # ISO-8859-1 for text responses, which garbles UTF-8 pages. Detect the
    # encoding from the body instead so the extracted text stays clean.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        response.encoding = response.apparent_encoding
    return response.text

def get_dynamic_content(url):
    """Render a page in Chromium via Playwright and return the resulting HTML.

    Args:
        url: Absolute URL to open.

    Returns:
        The page's HTML after navigation has reached the ``networkidle``
        state, or ``None`` if launching, navigating or reading the page
        raised any exception.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch()
            try:
                page = browser.new_page()
                page.goto(url, wait_until='networkidle')
                return page.content()
            finally:
                # Close the browser even when navigation fails so a bad URL
                # does not leave it open until the Playwright context exits.
                browser.close()
    except Exception as e:
        # Deliberately broad: any rendering failure is reported and the
        # crawl continues with the next URL.
        print(f"Error fetching {url} with Playwright: {e}")
        return None

def parse_content(html, base_url):
    """Extract readable text and same-host links from an HTML document.

    Args:
        html: HTML markup of the page.
        base_url: URL the page was fetched from. Relative links are resolved
            against it, and only links whose network location (host and
            port) matches it are kept.

    Returns:
        A ``(text, links)`` tuple. ``text`` is the document text with
        ``<script>`` and ``<style>`` elements removed, pieces joined by
        newlines. ``links`` is a set of absolute URLs with any ``#fragment``
        removed.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Drop non-content elements before extracting text.
    for tag in soup(['script', 'style']):
        tag.decompose()
    text = soup.get_text(separator='\n', strip=True)

    # The base host does not change per link, so parse it once.
    base_netloc = urlparse(base_url).netloc

    links = set()
    for a_tag in soup.find_all('a', href=True):
        target = urlparse(urljoin(base_url, a_tag['href']))
        if target.netloc == base_netloc:
            # A fragment only addresses a position inside a page; keeping it
            # would make "page#a" and "page#b" look like different pages and
            # cause the same content to be fetched and saved repeatedly.
            links.add(target._replace(fragment='').geturl())

    return text, links

def save_content(url, text, output_dir):
    """Write a page's extracted text to a ``.txt`` file in ``output_dir``.

    The file name is the URL without its ``http://``/``https://`` prefix,
    with every character other than letters, digits, ``.``, ``_`` and ``-``
    replaced by ``_``. Names longer than 200 UTF-8 bytes are truncated and
    given a short hash of the full URL so they stay unique and within common
    file-name length limits.

    Args:
        url: URL the text came from; also written as the file's first line.
        text: Extracted page text.
        output_dir: Target directory; created (with parents) if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    stem = url.replace('https://', '').replace('http://', '')
    # Characters such as '?', ':', '*' or '&' are invalid or awkward in file
    # names on some platforms, so map everything outside a safe set to '_'.
    stem = ''.join(ch if ch.isalnum() or ch in '._-' else '_' for ch in stem)

    max_stem_bytes = 200
    encoded = stem.encode('utf-8')
    if len(encoded) > max_stem_bytes:
        # Over-long names make open() fail; shorten and append a digest of
        # the original URL so distinct long URLs do not collide.
        digest = hashlib.sha1(url.encode('utf-8', 'replace')).hexdigest()[:10]
        stem = encoded[:max_stem_bytes].decode('utf-8', 'ignore') + '_' + digest

    filepath = os.path.join(output_dir, stem + '.txt')

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"URL: {url}\n\n")
        f.write(text)
    print(f"Saved content from {url} to {filepath}")

def main(args):
    """Crawl breadth-first from ``args.start_url`` and save each page's text.

    Args:
        args: Parsed command-line namespace providing ``start_url``,
            ``depth`` (maximum link depth, 0 = start page only), ``delay``
            (seconds to wait between requests), ``js`` (render pages with
            Playwright instead of a plain HTTP GET) and ``output_dir``.
    """
    # deque gives O(1) pops from the left; list.pop(0) shifts every element.
    queue = deque([(args.start_url, 0)])
    # URLs are recorded when they are queued, so each one is queued (and
    # therefore fetched) at most once, at the shallowest depth it was found.
    seen = {args.start_url}

    while queue:
        current_url, current_depth = queue.popleft()

        # Only reachable for the start URL when a negative depth is given.
        if current_depth > args.depth:
            continue

        print(f"Crawling: {current_url} at depth {current_depth}")

        fetch = get_dynamic_content if args.js else get_static_content
        html = fetch(current_url)

        if html:
            text, links = parse_content(html, current_url)
            save_content(current_url, text, args.output_dir)

            if current_depth < args.depth:
                for link in links:
                    if link not in seen:
                        seen.add(link)
                        queue.append((link, current_depth + 1))

        # Rate-limit only when another request will actually follow; there
        # is nothing to be polite about after the final page.
        if queue and args.delay > 0:
            time.sleep(args.delay)

if __name__ == '__main__':
    # Command-line entry point: parse the options, validate them, and start
    # the crawl.
    parser = argparse.ArgumentParser(description='Web Scraping Pipeline for RAG')
    parser.add_argument('start_url', type=str, help='The starting URL to crawl.')
    parser.add_argument('--depth', type=int, default=0, help='Crawl depth. 0 means only the start URL.')
    # float rather than int so sub-second delays such as 0.5 are accepted;
    # whole numbers still parse as before.
    parser.add_argument('--delay', type=float, default=0, help='Delay in seconds between requests.')
    parser.add_argument('--js', action='store_true', help='Enable JavaScript rendering via Playwright.')
    parser.add_argument('--output-dir', type=str, default='output', help='Directory to save scraped content.')

    args = parser.parse_args()

    # Reject values that would otherwise fail later (time.sleep raises on a
    # negative delay) or silently crawl nothing (negative depth).
    if args.depth < 0:
        parser.error('--depth must be 0 or greater')
    if args.delay < 0:
        parser.error('--delay must be 0 or greater')

    main(args)