test_async_scraping.py

#!/usr/bin/env python3
"""
Tests for async scraping functionality
Tests the async/await implementation for parallel web scraping
"""
import sys
import os
import unittest
import asyncio
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, AsyncMock, MagicMock
from collections import deque

from skill_seekers.cli.doc_scraper import DocToSkillConverter


class TestAsyncConfiguration(unittest.TestCase):
    """Test async mode configuration and initialization"""

    def setUp(self):
        """Save original working directory"""
        self.original_cwd = os.getcwd()

    def tearDown(self):
        """Restore original working directory"""
        os.chdir(self.original_cwd)

    def test_async_mode_default_false(self):
        """Test async mode is disabled by default"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'max_pages': 10
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                self.assertFalse(converter.async_mode)
            finally:
                os.chdir(self.original_cwd)

    def test_async_mode_enabled_from_config(self):
        """Test async mode can be enabled via config"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'max_pages': 10,
            'async_mode': True
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                self.assertTrue(converter.async_mode)
            finally:
                os.chdir(self.original_cwd)

    def test_async_mode_with_workers(self):
        """Test async mode works with multiple workers"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'workers': 4,
            'async_mode': True
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                self.assertTrue(converter.async_mode)
                self.assertEqual(converter.workers, 4)
            finally:
                os.chdir(self.original_cwd)


class TestAsyncScrapeMethods(unittest.TestCase):
    """Test async scraping methods exist and have correct signatures"""

    def setUp(self):
        """Set up test fixtures"""
        self.original_cwd = os.getcwd()

    def tearDown(self):
        """Clean up"""
        os.chdir(self.original_cwd)

    def test_scrape_page_async_exists(self):
        """Test scrape_page_async method exists"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'}
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                self.assertTrue(hasattr(converter, 'scrape_page_async'))
                self.assertTrue(asyncio.iscoroutinefunction(converter.scrape_page_async))
            finally:
                os.chdir(self.original_cwd)

    def test_scrape_all_async_exists(self):
        """Test scrape_all_async method exists"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'}
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                self.assertTrue(hasattr(converter, 'scrape_all_async'))
                self.assertTrue(asyncio.iscoroutinefunction(converter.scrape_all_async))
            finally:
                os.chdir(self.original_cwd)


class TestAsyncRouting(unittest.TestCase):
    """Test that scrape_all() correctly routes to async version"""

    def setUp(self):
        """Set up test fixtures"""
        self.original_cwd = os.getcwd()

    def tearDown(self):
        """Clean up"""
        os.chdir(self.original_cwd)

    def test_scrape_all_routes_to_async_when_enabled(self):
        """Test scrape_all calls async version when async_mode=True"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': True,
            'max_pages': 1
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                # Mock scrape_all_async to verify it gets called
                with patch.object(converter, 'scrape_all_async', new_callable=AsyncMock) as mock_async:
                    converter.scrape_all()
                    # Verify async version was called
                    mock_async.assert_called_once()
            finally:
                os.chdir(self.original_cwd)

    def test_scrape_all_uses_sync_when_async_disabled(self):
        """Test scrape_all uses sync version when async_mode=False"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': False,
            'max_pages': 1
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                # Mock scrape_all_async to verify it does NOT get called
                with patch.object(converter, 'scrape_all_async', new_callable=AsyncMock) as mock_async:
                    with patch.object(converter, '_try_llms_txt', return_value=False):
                        converter.scrape_all()
                        # Verify async version was NOT called
                        mock_async.assert_not_called()
            finally:
                os.chdir(self.original_cwd)


class TestAsyncDryRun(unittest.TestCase):
    """Test async scraping in dry-run mode"""

    def setUp(self):
        """Set up test fixtures"""
        self.original_cwd = os.getcwd()

    def tearDown(self):
        """Clean up"""
        os.chdir(self.original_cwd)

    def test_async_dry_run_completes(self):
        """Test async dry run completes without errors"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': True,
            'max_pages': 5
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                # Mock _try_llms_txt to skip llms.txt detection
                with patch.object(converter, '_try_llms_txt', return_value=False):
                    # Should complete without errors
                    converter.scrape_all()
                # Verify dry run mode was used
                self.assertTrue(converter.dry_run)
            finally:
                os.chdir(self.original_cwd)


class TestAsyncErrorHandling(unittest.TestCase):
    """Test error handling in async scraping"""

    def setUp(self):
        """Set up test fixtures"""
        self.original_cwd = os.getcwd()

    def tearDown(self):
        """Clean up"""
        os.chdir(self.original_cwd)

    def test_async_handles_http_errors(self):
        """Test async scraping handles HTTP errors gracefully"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': True,
            'workers': 2,
            'max_pages': 1
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=False)
                # Mock httpx to simulate errors
                import httpx

                async def run_test():
                    semaphore = asyncio.Semaphore(2)
                    async with httpx.AsyncClient() as client:
                        # Mock client.get to raise exception
                        with patch.object(client, 'get', side_effect=httpx.HTTPError("Test error")):
                            # Should not raise exception, just log error
                            await converter.scrape_page_async('https://example.com/test', semaphore, client)

                # Run async test
                asyncio.run(run_test())
                # If we got here without exception, test passed
            finally:
                os.chdir(self.original_cwd)


class TestAsyncPerformance(unittest.TestCase):
    """Test async performance characteristics"""

    def test_async_uses_semaphore_for_concurrency_control(self):
        """Test async mode uses semaphore instead of threading lock"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': True,
            'workers': 4
        }
        original_cwd = os.getcwd()
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=True)
                # Async mode should NOT create a threading lock
                # (async uses asyncio.Semaphore instead)
                self.assertTrue(converter.async_mode)
            finally:
                os.chdir(original_cwd)


class TestAsyncLlmsTxtIntegration(unittest.TestCase):
    """Test async mode with llms.txt detection"""

    def test_async_respects_llms_txt(self):
        """Test async mode respects llms.txt and skips HTML scraping"""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': True
        }
        original_cwd = os.getcwd()
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(config, dry_run=False)
                # Mock _try_llms_txt to return True (llms.txt found)
                with patch.object(converter, '_try_llms_txt', return_value=True):
                    with patch.object(converter, 'save_summary'):
                        converter.scrape_all()
                        # If llms.txt succeeded, async scraping should be skipped
                        # Verify by checking that pages were not scraped
                        self.assertEqual(len(converter.visited_urls), 0)
            finally:
                os.chdir(original_cwd)


if __name__ == '__main__':
    unittest.main()
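
# Usage sketch (assumption: pytest is available in the environment): besides the
# unittest entry point above, the same tests can be collected and run with pytest,
# which discovers unittest.TestCase subclasses automatically, e.g.
#   python -m pytest test_async_scraping.py -v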