# test_pr144_concerns.py
  1. #!/usr/bin/env python3
  2. """
  3. Test script to investigate PR #144 concerns
  4. """
  5. import sys
  6. import json
  7. import tempfile
  8. from pathlib import Path
  9. from collections import deque
  10. # Add cli to path
  11. sys.path.insert(0, str(Path(__file__).parent / 'cli'))
  12. print("="*60)
  13. print("PR #144 CONCERN INVESTIGATION")
  14. print("="*60)
  15. ## CONCERN 1: Thread Safety
  16. print("\n1. THREAD SAFETY ANALYSIS")
  17. print("-" * 40)
  18. print("✓ Lock created when workers > 1:")
  19. print(" - Line 54-56: Creates self.lock with threading.Lock()")
  20. print(" - Only created when self.workers > 1")
  21. print("\n✓ Protected operations in scrape_page():")
  22. print(" - print() - Line 295 (with lock)")
  23. print(" - save_page() - Line 296 (with lock)")
  24. print(" - pages.append() - Line 297 (with lock)")
  25. print(" - visited_urls check - Line 301 (with lock)")
  26. print(" - pending_urls.append() - Line 302 (with lock)")
  27. print("\n✓ Protected operations in scrape_all():")
  28. print(" - visited_urls.add() - Line 414 (BEFORE lock!)")
  29. print(" - save_checkpoint() - Line 431 (with lock)")
  30. print(" - print() - Line 435 (with lock)")
  31. print("\n❌ RACE CONDITION FOUND:")
  32. print(" - Line 414: visited_urls.add(url) is OUTSIDE lock")
  33. print(" - Line 301: Link check 'if link not in visited_urls' is INSIDE lock")
  34. print(" - Two threads could add same URL to visited_urls simultaneously")
  35. print(" - Result: Same URL could be scraped twice")
  36. ## CONCERN 2: Checkpoint Behavior
  37. print("\n2. CHECKPOINT WITH WORKERS")
  38. print("-" * 40)
  39. print("✓ Checkpoint save is protected:")
  40. print(" - Line 430-431: Uses lock before save_checkpoint()")
  41. print(" - save_checkpoint() itself does file I/O (line 103-104)")
  42. print("\n⚠️ POTENTIAL ISSUE:")
  43. print(" - pages_scraped counter incremented WITHOUT lock (line 427, 442)")
  44. print(" - Could miss checkpoints or checkpoint at wrong interval")
  45. print(" - Multiple threads incrementing same counter = race condition")
  46. ## CONCERN 3: Error Handling
  47. print("\n3. ERROR HANDLING IN PARALLEL MODE")
  48. print("-" * 40)
  49. print("✓ Exceptions are caught in scrape_page():")
  50. print(" - Line 319-324: try/except wraps entire method")
  51. print(" - Errors are printed (with lock if workers > 1)")
  52. print("\n✓ ThreadPoolExecutor exception handling:")
  53. print(" - Exceptions stored in Future objects")
  54. print(" - as_completed() will raise exception when accessed")
  55. print("\n❌ SILENT FAILURE POSSIBLE:")
  56. print(" - Line 425-442: Futures are iterated but exceptions not checked")
  57. print(" - future.result() is never called - exceptions never raised")
  58. print(" - Failed pages silently disappear")
  59. ## CONCERN 4: Rate Limiting Semantics
  60. print("\n4. RATE LIMITING WITH WORKERS")
  61. print("-" * 40)
  62. print("✓ Rate limit applied per-worker:")
  63. print(" - Line 315-317: time.sleep() after each scrape_page()")
  64. print(" - Each worker sleeps independently")
  65. print("\n✓ Semantics:")
  66. print(" - 4 workers, 0.5s rate limit = 8 requests/second total")
  67. print(" - 1 worker, 0.5s rate limit = 2 requests/second total")
  68. print(" - This is per-worker, not global rate limiting")
  69. print("\n⚠️ CONSIDERATION:")
  70. print(" - Documentation should clarify this is per-worker")
  71. print(" - Users might expect global rate limit")
  72. print(" - 10 workers with 0.1s = 100 req/s (very aggressive)")
  73. ## CONCERN 5: Resource Limits
  74. print("\n5. RESOURCE LIMITS")
  75. print("-" * 40)
  76. print("✓ Worker limit enforced:")
  77. print(" - Capped at 10 workers (mentioned in PR)")
  78. print(" - ThreadPoolExecutor bounds threads")
  79. print("\n❌ NO MEMORY LIMITS:")
  80. print(" - self.pages list grows unbounded")
  81. print(" - visited_urls set grows unbounded")
  82. print(" - 10,000 pages * avg 50KB each = 500MB minimum")
  83. print(" - Unlimited mode could cause OOM")
  84. print("\n❌ NO PENDING URL LIMIT:")
  85. print(" - pending_urls deque grows unbounded")
  86. print(" - Could have thousands of URLs queued")
  87. ## CONCERN 6: Streaming Subprocess
  88. print("\n6. STREAMING SUBPROCESS")
  89. print("-" * 40)
  90. print("✓ Good implementation:")
  91. print(" - Uses select() for non-blocking I/O")
  92. print(" - Timeout mechanism works (line 60-63)")
  93. print(" - Kills process on timeout")
  94. print("\n⚠️ Windows fallback:")
  95. print(" - Line 83-85: Falls back to sleep() on Windows")
  96. print(" - Won't stream output on Windows (will appear frozen)")
  97. print(" - But will still work, just poor UX")
  98. print("\n✓ Process cleanup:")
  99. print(" - Line 88: communicate() gets remaining output")
  100. print(" - process.returncode properly captured")
  101. print("\n" + "="*60)
  102. print("SUMMARY OF FINDINGS")
  103. print("="*60)
  104. print("\n🚨 CRITICAL ISSUES FOUND:")
  105. print("1. Race condition on visited_urls.add() (line 414)")
  106. print("2. pages_scraped counter not thread-safe")
  107. print("3. Silent exception swallowing in parallel mode")
  108. print("\n⚠️ MODERATE CONCERNS:")
  109. print("4. No memory limits for unlimited mode")
  110. print("5. Per-worker rate limiting may confuse users")
  111. print("6. Windows streaming falls back to polling")
  112. print("\n✅ WORKS CORRECTLY:")
  113. print("7. Lock protects most shared state")
  114. print("8. Checkpoint saves are protected")
  115. print("9. save_page() file I/O protected")
  116. print("10. Timeout mechanism solid")
  117. print("\n" + "="*60)