#!/usr/bin/env python3
"""
Tests for parallel scraping, unlimited mode, and rate limiting features (PR #144).

Module: test_parallel_scraping.py
"""
import json
import os
import sys
import tempfile
import threading
import time
import unittest
from collections import deque
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock

from skill_seekers.cli.doc_scraper import DocToSkillConverter
  15. class TestParallelScrapingConfiguration(unittest.TestCase):
  16. """Test parallel scraping configuration and initialization"""
  17. def setUp(self):
  18. """Save original working directory"""
  19. self.original_cwd = os.getcwd()
  20. def tearDown(self):
  21. """Restore original working directory"""
  22. os.chdir(self.original_cwd)
  23. def test_single_worker_default(self):
  24. """Test default is single-worker mode"""
  25. config = {
  26. 'name': 'test',
  27. 'base_url': 'https://example.com/',
  28. 'selectors': {'main_content': 'article'},
  29. 'max_pages': 10
  30. }
  31. with tempfile.TemporaryDirectory() as tmpdir:
  32. os.chdir(tmpdir)
  33. converter = DocToSkillConverter(config, dry_run=True)
  34. self.assertEqual(converter.workers, 1)
  35. self.assertFalse(hasattr(converter, 'lock'))
  36. def test_multiple_workers_creates_lock(self):
  37. """Test multiple workers creates thread lock"""
  38. config = {
  39. 'name': 'test',
  40. 'base_url': 'https://example.com/',
  41. 'selectors': {'main_content': 'article'},
  42. 'max_pages': 10,
  43. 'workers': 4
  44. }
  45. with tempfile.TemporaryDirectory() as tmpdir:
  46. os.chdir(tmpdir)
  47. converter = DocToSkillConverter(config, dry_run=True)
  48. self.assertEqual(converter.workers, 4)
  49. self.assertTrue(hasattr(converter, 'lock'))
  50. def test_workers_from_config(self):
  51. """Test workers parameter is read from config"""
  52. config = {
  53. 'name': 'test',
  54. 'base_url': 'https://example.com/',
  55. 'selectors': {'main_content': 'article'},
  56. 'workers': 8
  57. }
  58. with tempfile.TemporaryDirectory() as tmpdir:
  59. os.chdir(tmpdir)
  60. converter = DocToSkillConverter(config, dry_run=True)
  61. self.assertEqual(converter.workers, 8)
  62. class TestUnlimitedMode(unittest.TestCase):
  63. """Test unlimited scraping mode"""
  64. def setUp(self):
  65. """Save original working directory"""
  66. self.original_cwd = os.getcwd()
  67. def tearDown(self):
  68. """Restore original working directory"""
  69. os.chdir(self.original_cwd)
  70. def test_unlimited_with_none(self):
  71. """Test max_pages: None enables unlimited mode"""
  72. config = {
  73. 'name': 'test',
  74. 'base_url': 'https://example.com/',
  75. 'selectors': {'main_content': 'article'},
  76. 'max_pages': None
  77. }
  78. with tempfile.TemporaryDirectory() as tmpdir:
  79. os.chdir(tmpdir)
  80. converter = DocToSkillConverter(config, dry_run=True)
  81. self.assertIsNone(converter.config.get('max_pages'))
  82. def test_unlimited_with_minus_one(self):
  83. """Test max_pages: -1 enables unlimited mode"""
  84. config = {
  85. 'name': 'test',
  86. 'base_url': 'https://example.com/',
  87. 'selectors': {'main_content': 'article'},
  88. 'max_pages': -1
  89. }
  90. with tempfile.TemporaryDirectory() as tmpdir:
  91. os.chdir(tmpdir)
  92. converter = DocToSkillConverter(config, dry_run=True)
  93. self.assertEqual(converter.config.get('max_pages'), -1)
  94. def test_limited_mode_default(self):
  95. """Test default max_pages is limited"""
  96. config = {
  97. 'name': 'test',
  98. 'base_url': 'https://example.com/',
  99. 'selectors': {'main_content': 'article'}
  100. }
  101. with tempfile.TemporaryDirectory() as tmpdir:
  102. os.chdir(tmpdir)
  103. converter = DocToSkillConverter(config, dry_run=True)
  104. max_pages = converter.config.get('max_pages', 500)
  105. self.assertIsNotNone(max_pages)
  106. self.assertGreater(max_pages, 0)
  107. class TestRateLimiting(unittest.TestCase):
  108. """Test rate limiting configuration"""
  109. def setUp(self):
  110. """Save original working directory"""
  111. self.original_cwd = os.getcwd()
  112. def tearDown(self):
  113. """Restore original working directory"""
  114. os.chdir(self.original_cwd)
  115. def test_rate_limit_from_config(self):
  116. """Test rate_limit is read from config"""
  117. config = {
  118. 'name': 'test',
  119. 'base_url': 'https://example.com/',
  120. 'selectors': {'main_content': 'article'},
  121. 'rate_limit': 0.1
  122. }
  123. with tempfile.TemporaryDirectory() as tmpdir:
  124. os.chdir(tmpdir)
  125. converter = DocToSkillConverter(config, dry_run=True)
  126. self.assertEqual(converter.config.get('rate_limit'), 0.1)
  127. def test_rate_limit_default(self):
  128. """Test default rate_limit is 0.5"""
  129. config = {
  130. 'name': 'test',
  131. 'base_url': 'https://example.com/',
  132. 'selectors': {'main_content': 'article'}
  133. }
  134. with tempfile.TemporaryDirectory() as tmpdir:
  135. os.chdir(tmpdir)
  136. converter = DocToSkillConverter(config, dry_run=True)
  137. self.assertEqual(converter.config.get('rate_limit', 0.5), 0.5)
  138. def test_zero_rate_limit_disables(self):
  139. """Test rate_limit: 0 disables rate limiting"""
  140. config = {
  141. 'name': 'test',
  142. 'base_url': 'https://example.com/',
  143. 'selectors': {'main_content': 'article'},
  144. 'rate_limit': 0
  145. }
  146. with tempfile.TemporaryDirectory() as tmpdir:
  147. os.chdir(tmpdir)
  148. converter = DocToSkillConverter(config, dry_run=True)
  149. self.assertEqual(converter.config.get('rate_limit'), 0)
  150. class TestThreadSafety(unittest.TestCase):
  151. """Test thread-safety fixes"""
  152. def setUp(self):
  153. """Save original working directory"""
  154. self.original_cwd = os.getcwd()
  155. def tearDown(self):
  156. """Restore original working directory"""
  157. os.chdir(self.original_cwd)
  158. def test_lock_protects_visited_urls(self):
  159. """Test visited_urls operations are protected by lock"""
  160. config = {
  161. 'name': 'test',
  162. 'base_url': 'https://example.com/',
  163. 'selectors': {'main_content': 'article'},
  164. 'workers': 4
  165. }
  166. with tempfile.TemporaryDirectory() as tmpdir:
  167. os.chdir(tmpdir)
  168. converter = DocToSkillConverter(config, dry_run=True)
  169. # Verify lock exists
  170. self.assertTrue(hasattr(converter, 'lock'))
  171. # Verify it's a threading.Lock
  172. import threading
  173. self.assertIsInstance(converter.lock, type(threading.Lock()))
  174. def test_single_worker_no_lock(self):
  175. """Test single worker doesn't create unnecessary lock"""
  176. config = {
  177. 'name': 'test',
  178. 'base_url': 'https://example.com/',
  179. 'selectors': {'main_content': 'article'},
  180. 'workers': 1
  181. }
  182. with tempfile.TemporaryDirectory() as tmpdir:
  183. os.chdir(tmpdir)
  184. converter = DocToSkillConverter(config, dry_run=True)
  185. self.assertFalse(hasattr(converter, 'lock'))
  186. class TestScrapingModes(unittest.TestCase):
  187. """Test different scraping mode combinations"""
  188. def setUp(self):
  189. """Save original working directory"""
  190. self.original_cwd = os.getcwd()
  191. def tearDown(self):
  192. """Restore original working directory"""
  193. os.chdir(self.original_cwd)
  194. def test_single_threaded_limited(self):
  195. """Test traditional single-threaded limited mode"""
  196. config = {
  197. 'name': 'test',
  198. 'base_url': 'https://example.com/',
  199. 'selectors': {'main_content': 'article'},
  200. 'max_pages': 10,
  201. 'workers': 1
  202. }
  203. with tempfile.TemporaryDirectory() as tmpdir:
  204. os.chdir(tmpdir)
  205. converter = DocToSkillConverter(config, dry_run=True)
  206. self.assertEqual(converter.workers, 1)
  207. self.assertEqual(converter.config.get('max_pages'), 10)
  208. def test_parallel_limited(self):
  209. """Test parallel scraping with page limit"""
  210. config = {
  211. 'name': 'test',
  212. 'base_url': 'https://example.com/',
  213. 'selectors': {'main_content': 'article'},
  214. 'max_pages': 100,
  215. 'workers': 4
  216. }
  217. with tempfile.TemporaryDirectory() as tmpdir:
  218. os.chdir(tmpdir)
  219. converter = DocToSkillConverter(config, dry_run=True)
  220. self.assertEqual(converter.workers, 4)
  221. self.assertEqual(converter.config.get('max_pages'), 100)
  222. self.assertTrue(hasattr(converter, 'lock'))
  223. def test_parallel_unlimited(self):
  224. """Test parallel scraping with unlimited pages"""
  225. config = {
  226. 'name': 'test',
  227. 'base_url': 'https://example.com/',
  228. 'selectors': {'main_content': 'article'},
  229. 'max_pages': None,
  230. 'workers': 8
  231. }
  232. with tempfile.TemporaryDirectory() as tmpdir:
  233. os.chdir(tmpdir)
  234. converter = DocToSkillConverter(config, dry_run=True)
  235. self.assertEqual(converter.workers, 8)
  236. self.assertIsNone(converter.config.get('max_pages'))
  237. self.assertTrue(hasattr(converter, 'lock'))
  238. def test_fast_scraping_mode(self):
  239. """Test fast scraping with low rate limit and workers"""
  240. config = {
  241. 'name': 'test',
  242. 'base_url': 'https://example.com/',
  243. 'selectors': {'main_content': 'article'},
  244. 'rate_limit': 0.1,
  245. 'workers': 8,
  246. 'max_pages': 1000
  247. }
  248. with tempfile.TemporaryDirectory() as tmpdir:
  249. os.chdir(tmpdir)
  250. converter = DocToSkillConverter(config, dry_run=True)
  251. self.assertEqual(converter.workers, 8)
  252. self.assertEqual(converter.config.get('rate_limit'), 0.1)
  253. class TestDryRunWithNewFeatures(unittest.TestCase):
  254. """Test dry-run mode works with new features"""
  255. def setUp(self):
  256. """Save original working directory"""
  257. self.original_cwd = os.getcwd()
  258. def tearDown(self):
  259. """Restore original working directory"""
  260. os.chdir(self.original_cwd)
  261. def test_dry_run_with_parallel(self):
  262. """Test dry-run with parallel workers"""
  263. config = {
  264. 'name': 'test',
  265. 'base_url': 'https://example.com/',
  266. 'selectors': {'main_content': 'article'},
  267. 'workers': 4
  268. }
  269. with tempfile.TemporaryDirectory() as tmpdir:
  270. os.chdir(tmpdir)
  271. converter = DocToSkillConverter(config, dry_run=True)
  272. self.assertTrue(converter.dry_run)
  273. self.assertEqual(converter.workers, 4)
  274. def test_dry_run_with_unlimited(self):
  275. """Test dry-run with unlimited mode"""
  276. config = {
  277. 'name': 'test',
  278. 'base_url': 'https://example.com/',
  279. 'selectors': {'main_content': 'article'},
  280. 'max_pages': None
  281. }
  282. with tempfile.TemporaryDirectory() as tmpdir:
  283. os.chdir(tmpdir)
  284. converter = DocToSkillConverter(config, dry_run=True)
  285. self.assertTrue(converter.dry_run)
  286. self.assertIsNone(converter.config.get('max_pages'))
  287. if __name__ == '__main__':
  288. unittest.main()