Article
Tags: web-scraping, python, rag, data-extraction, automation, playwright, beautifulsoup
Build a RAG-Ready Web Scraper with Python
Use this Python script to build an automated web scraping pipeline. Crawl websites, extract clean text from static and dynamic pages, and prepare the content for RAG applications. Includes rate limiting and configurable depth.
Intermediate · 30 min · 5 steps
The play
- Install Dependencies: The Web Scraping Pipeline requires libraries for web requests, HTML parsing, and rendering JavaScript. Install `requests`, `beautifulsoup4`, and `playwright` using pip. Playwright also requires browser binaries, which are installed separately with `playwright install`.
- Run a Basic Static Scrape: Execute a single-page scrape on a static website. Save the starter code as `scraper.py`. The script will fetch the URL, extract the main text content, and save it to a file in the `output/` directory.
- Crawl Multiple Pages: Use the `--depth` argument to make the Web Scraping Pipeline follow links on the starting page. It will crawl up to the specified depth, creating a text file for each unique page found within the same domain.
- Scrape Dynamic JavaScript Content: For sites that render content with JavaScript (e.g., React, Vue), use the `--js` flag. This tells the script to use a headless browser via Playwright to fully render the page before extracting content.
- Set Rate Limiting: Be a considerate web citizen by adding a delay between requests with the `--delay` flag. This prevents overwhelming the server and reduces the risk of being blocked. The value is in seconds.
Starter code
import argparse
import hashlib
import os
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def get_static_content(url):
    """Fetch a page over plain HTTP and return its body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``None`` when the request fails or
        the server answers with an error status (the error is printed).
    """
    try:
        reply = requests.get(url, timeout=10)
        # Turn 4xx/5xx answers into exceptions so they share the error path.
        reply.raise_for_status()
    except requests.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    return reply.text
def get_dynamic_content(url):
    """Render a page in headless Chromium and return the resulting HTML.

    Args:
        url: Address of the page to load.

    Returns:
        The page's HTML after navigation has reached the ``networkidle``
        state, or ``None`` if anything goes wrong (the error is printed).
    """
    try:
        with sync_playwright() as pw:
            chromium = pw.chromium.launch()
            tab = chromium.new_page()
            # Wait for network activity to settle so script-rendered
            # content is present in the captured markup.
            tab.goto(url, wait_until='networkidle')
            rendered_html = tab.content()
            chromium.close()
    except Exception as err:
        print(f"Error fetching {url} with Playwright: {err}")
        return None
    return rendered_html
def parse_content(html, base_url):
    """Extract the visible text and the same-host links from an HTML page.

    Args:
        html: Raw HTML markup of the page.
        base_url: URL the page was fetched from. Relative links are
            resolved against it and only links on its host are kept.

    Returns:
        A ``(text, links)`` tuple. ``text`` is the page text with
        ``<script>`` and ``<style>`` content removed, one stripped string
        per line. ``links`` is a set of absolute http(s) URLs on the same
        host as ``base_url``, with any ``#fragment`` removed.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Drop script and style elements so their source is not mistaken for text.
    for tag in soup(['script', 'style']):
        tag.decompose()
    text = soup.get_text(separator='\n', strip=True)

    base_netloc = urlparse(base_url).netloc  # loop-invariant, parse once
    links = set()
    for a_tag in soup.find_all('a', href=True):
        try:
            target = urlparse(urljoin(base_url, a_tag['href']))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip the link
            # rather than abort the whole page.
            continue
        if target.scheme not in ('http', 'https'):
            continue
        if target.netloc != base_netloc:
            continue
        # "#section" anchors point at the same document; strip them so one
        # page is not reported (and later crawled) under several URLs.
        links.add(target._replace(fragment='').geturl())
    return text, links
def save_content(url, text, output_dir):
    """Write a page's text to a ``.txt`` file named after its URL.

    The file starts with a ``URL: <url>`` header line, a blank line, and
    then ``text``. ``output_dir`` is created if it does not exist.

    The filename is the URL without its scheme and with ``/`` replaced by
    ``_``. If that name contains characters outside ``A-Za-z0-9._-`` or is
    too long, those characters are replaced by ``_``, the name is
    truncated, and a short hash of the full URL is appended so distinct
    URLs still map to distinct files.

    Args:
        url: Source URL of the page.
        text: Extracted text to store.
        output_dir: Directory that receives the file.
    """
    max_stem_len = 150  # keeps stem + hash + ".txt" well under 255 bytes

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    stem = url.replace('https://', '').replace('http://', '').replace('/', '_')
    safe_stem = re.sub(r'[^A-Za-z0-9._-]', '_', stem)
    if safe_stem != stem or len(safe_stem) > max_stem_len:
        # Sanitising or truncating can make different URLs collide, so
        # disambiguate with a digest of the original URL.
        digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:10]
        safe_stem = f"{safe_stem[:max_stem_len]}_{digest}"

    filepath = os.path.join(output_dir, safe_stem + '.txt')
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"URL: {url}\n\n")
        f.write(text)
    print(f"Saved content from {url} to {filepath}")
def main(args):
    """Crawl breadth-first from ``args.start_url`` and save each page's text.

    Args:
        args: Parsed command-line namespace providing ``start_url``,
            ``depth`` (maximum link distance from the start page),
            ``delay`` (seconds to wait between requests), ``js`` (render
            pages with Playwright) and ``output_dir``.
    """
    fetch = get_dynamic_content if args.js else get_static_content

    # deque gives O(1) pops from the front; list.pop(0) is O(n).
    queue = deque([(args.start_url, 0)])
    # URLs are recorded when enqueued, not when fetched, so a page linked
    # from many places enters the queue only once. Breadth-first order
    # guarantees the first sighting is at the smallest depth.
    seen = {args.start_url}

    while queue:
        current_url, current_depth = queue.popleft()
        if current_depth > args.depth:
            continue
        print(f"Crawling: {current_url} at depth {current_depth}")

        html = fetch(current_url)
        if html:
            text, links = parse_content(html, current_url)
            save_content(current_url, text, args.output_dir)
            if current_depth < args.depth:
                for link in links:
                    if link not in seen:
                        seen.add(link)
                        queue.append((link, current_depth + 1))

        # Rate-limit only when another request will follow; sleeping after
        # the final page just delays exit.
        if queue:
            time.sleep(args.delay)
if __name__ == '__main__':
    # Command-line entry point: parse the options and start the crawl.
    parser = argparse.ArgumentParser(description='Web Scraping Pipeline for RAG')
    parser.add_argument('start_url', type=str, help='The starting URL to crawl.')
    parser.add_argument('--depth', type=int, default=0, help='Crawl depth. 0 means only the start URL.')
    # float rather than int so sub-second delays such as 0.5 are accepted;
    # whole numbers still parse as before.
    parser.add_argument('--delay', type=float, default=0, help='Delay in seconds between requests (fractions allowed).')
    parser.add_argument('--js', action='store_true', help='Enable JavaScript rendering via Playwright.')
    parser.add_argument('--output-dir', type=str, default='output', help='Directory to save scraped content.')
    args = parser.parse_args()
    # time.sleep() raises ValueError on negative values; reject them here
    # with a usage error instead of failing in the middle of a crawl.
    if args.delay < 0:
        parser.error('--delay must be non-negative')
    main(args)