| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149 |
- #!/usr/bin/env python3
- """
- Test script to investigate PR #144 concerns
- """
- import sys
- import json
- import tempfile
- from pathlib import Path
- from collections import deque
- # Add cli to path
- sys.path.insert(0, str(Path(__file__).parent / 'cli'))
- print("="*60)
- print("PR #144 CONCERN INVESTIGATION")
- print("="*60)
- ## CONCERN 1: Thread Safety
- print("\n1. THREAD SAFETY ANALYSIS")
- print("-" * 40)
- print("✓ Lock created when workers > 1:")
- print(" - Line 54-56: Creates self.lock with threading.Lock()")
- print(" - Only created when self.workers > 1")
- print("\n✓ Protected operations in scrape_page():")
- print(" - print() - Line 295 (with lock)")
- print(" - save_page() - Line 296 (with lock)")
- print(" - pages.append() - Line 297 (with lock)")
- print(" - visited_urls check - Line 301 (with lock)")
- print(" - pending_urls.append() - Line 302 (with lock)")
- print("\n✓ Protected operations in scrape_all():")
- print(" - visited_urls.add() - Line 414 (BEFORE lock!)")
- print(" - save_checkpoint() - Line 431 (with lock)")
- print(" - print() - Line 435 (with lock)")
- print("\n❌ RACE CONDITION FOUND:")
- print(" - Line 414: visited_urls.add(url) is OUTSIDE lock")
- print(" - Line 301: Link check 'if link not in visited_urls' is INSIDE lock")
- print(" - Two threads could add same URL to visited_urls simultaneously")
- print(" - Result: Same URL could be scraped twice")
- ## CONCERN 2: Checkpoint Behavior
- print("\n2. CHECKPOINT WITH WORKERS")
- print("-" * 40)
- print("✓ Checkpoint save is protected:")
- print(" - Line 430-431: Uses lock before save_checkpoint()")
- print(" - save_checkpoint() itself does file I/O (line 103-104)")
- print("\n⚠️ POTENTIAL ISSUE:")
- print(" - pages_scraped counter incremented WITHOUT lock (line 427, 442)")
- print(" - Could miss checkpoints or checkpoint at wrong interval")
- print(" - Multiple threads incrementing same counter = race condition")
- ## CONCERN 3: Error Handling
- print("\n3. ERROR HANDLING IN PARALLEL MODE")
- print("-" * 40)
- print("✓ Exceptions are caught in scrape_page():")
- print(" - Line 319-324: try/except wraps entire method")
- print(" - Errors are printed (with lock if workers > 1)")
- print("\n✓ ThreadPoolExecutor exception handling:")
- print(" - Exceptions stored in Future objects")
- print(" - as_completed() will raise exception when accessed")
- print("\n❌ SILENT FAILURE POSSIBLE:")
- print(" - Line 425-442: Futures are iterated but exceptions not checked")
- print(" - future.result() is never called - exceptions never raised")
- print(" - Failed pages silently disappear")
- ## CONCERN 4: Rate Limiting Semantics
- print("\n4. RATE LIMITING WITH WORKERS")
- print("-" * 40)
- print("✓ Rate limit applied per-worker:")
- print(" - Line 315-317: time.sleep() after each scrape_page()")
- print(" - Each worker sleeps independently")
- print("\n✓ Semantics:")
- print(" - 4 workers, 0.5s rate limit = 8 requests/second total")
- print(" - 1 worker, 0.5s rate limit = 2 requests/second total")
- print(" - This is per-worker, not global rate limiting")
- print("\n⚠️ CONSIDERATION:")
- print(" - Documentation should clarify this is per-worker")
- print(" - Users might expect global rate limit")
- print(" - 10 workers with 0.1s = 100 req/s (very aggressive)")
- ## CONCERN 5: Resource Limits
- print("\n5. RESOURCE LIMITS")
- print("-" * 40)
- print("✓ Worker limit enforced:")
- print(" - Capped at 10 workers (mentioned in PR)")
- print(" - ThreadPoolExecutor bounds threads")
- print("\n❌ NO MEMORY LIMITS:")
- print(" - self.pages list grows unbounded")
- print(" - visited_urls set grows unbounded")
- print(" - 10,000 pages * avg 50KB each = 500MB minimum")
- print(" - Unlimited mode could cause OOM")
- print("\n❌ NO PENDING URL LIMIT:")
- print(" - pending_urls deque grows unbounded")
- print(" - Could have thousands of URLs queued")
- ## CONCERN 6: Streaming Subprocess
- print("\n6. STREAMING SUBPROCESS")
- print("-" * 40)
- print("✓ Good implementation:")
- print(" - Uses select() for non-blocking I/O")
- print(" - Timeout mechanism works (line 60-63)")
- print(" - Kills process on timeout")
- print("\n⚠️ Windows fallback:")
- print(" - Line 83-85: Falls back to sleep() on Windows")
- print(" - Won't stream output on Windows (will appear frozen)")
- print(" - But will still work, just poor UX")
- print("\n✓ Process cleanup:")
- print(" - Line 88: communicate() gets remaining output")
- print(" - process.returncode properly captured")
- print("\n" + "="*60)
- print("SUMMARY OF FINDINGS")
- print("="*60)
- print("\n🚨 CRITICAL ISSUES FOUND:")
- print("1. Race condition on visited_urls.add() (line 414)")
- print("2. pages_scraped counter not thread-safe")
- print("3. Silent exception swallowing in parallel mode")
- print("\n⚠️ MODERATE CONCERNS:")
- print("4. No memory limits for unlimited mode")
- print("5. Per-worker rate limiting may confuse users")
- print("6. Windows streaming falls back to polling")
- print("\n✅ WORKS CORRECTLY:")
- print("7. Lock protects most shared state")
- print("8. Checkpoint saves are protected")
- print("9. save_page() file I/O protected")
- print("10. Timeout mechanism solid")
- print("\n" + "="*60)
|