#!/usr/bin/env python3
"""
Integration tests for doc_scraper
Tests complete workflows and dry-run mode
"""
import sys
import os
import unittest
import json
import tempfile
import shutil
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from skill_seekers.cli.doc_scraper import DocToSkillConverter, load_config, validate_config
  16. class TestDryRunMode(unittest.TestCase):
  17. """Test dry-run mode functionality"""
  18. def setUp(self):
  19. """Set up test configuration"""
  20. self.config = {
  21. 'name': 'test-dry-run',
  22. 'base_url': 'https://example.com/',
  23. 'selectors': {
  24. 'main_content': 'article',
  25. 'title': 'h1',
  26. 'code_blocks': 'pre code'
  27. },
  28. 'url_patterns': {
  29. 'include': [],
  30. 'exclude': []
  31. },
  32. 'rate_limit': 0.1,
  33. 'max_pages': 10
  34. }
  35. def test_dry_run_no_directories_created(self):
  36. """Test that dry-run mode doesn't create directories"""
  37. converter = DocToSkillConverter(self.config, dry_run=True)
  38. # Check directories were NOT created
  39. data_dir = Path(f"output/{self.config['name']}_data")
  40. skill_dir = Path(f"output/{self.config['name']}")
  41. self.assertFalse(data_dir.exists(), "Dry-run should not create data directory")
  42. self.assertFalse(skill_dir.exists(), "Dry-run should not create skill directory")
  43. def test_dry_run_flag_set(self):
  44. """Test that dry_run flag is properly set"""
  45. converter = DocToSkillConverter(self.config, dry_run=True)
  46. self.assertTrue(converter.dry_run)
  47. converter_normal = DocToSkillConverter(self.config, dry_run=False)
  48. self.assertFalse(converter_normal.dry_run)
  49. # Clean up
  50. shutil.rmtree(f"output/{self.config['name']}_data", ignore_errors=True)
  51. shutil.rmtree(f"output/{self.config['name']}", ignore_errors=True)
  52. def test_normal_mode_creates_directories(self):
  53. """Test that normal mode creates directories"""
  54. converter = DocToSkillConverter(self.config, dry_run=False)
  55. # Check directories WERE created
  56. data_dir = Path(f"output/{self.config['name']}_data")
  57. skill_dir = Path(f"output/{self.config['name']}")
  58. self.assertTrue(data_dir.exists(), "Normal mode should create data directory")
  59. self.assertTrue(skill_dir.exists(), "Normal mode should create skill directory")
  60. # Clean up
  61. shutil.rmtree(data_dir, ignore_errors=True)
  62. shutil.rmtree(skill_dir, ignore_errors=True)
  63. class TestConfigLoading(unittest.TestCase):
  64. """Test configuration loading and validation"""
  65. def setUp(self):
  66. """Set up temporary directory for test configs"""
  67. self.temp_dir = tempfile.mkdtemp()
  68. def tearDown(self):
  69. """Clean up temporary directory"""
  70. shutil.rmtree(self.temp_dir, ignore_errors=True)
  71. def test_load_valid_config(self):
  72. """Test loading a valid configuration file"""
  73. config_data = {
  74. 'name': 'test-config',
  75. 'base_url': 'https://example.com/',
  76. 'selectors': {
  77. 'main_content': 'article',
  78. 'title': 'h1',
  79. 'code_blocks': 'pre code'
  80. },
  81. 'rate_limit': 0.5,
  82. 'max_pages': 100
  83. }
  84. config_path = Path(self.temp_dir) / 'test.json'
  85. with open(config_path, 'w') as f:
  86. json.dump(config_data, f)
  87. loaded_config = load_config(str(config_path))
  88. self.assertEqual(loaded_config['name'], 'test-config')
  89. self.assertEqual(loaded_config['base_url'], 'https://example.com/')
  90. def test_load_invalid_json(self):
  91. """Test loading an invalid JSON file"""
  92. config_path = Path(self.temp_dir) / 'invalid.json'
  93. with open(config_path, 'w') as f:
  94. f.write('{ invalid json }')
  95. with self.assertRaises(SystemExit):
  96. load_config(str(config_path))
  97. def test_load_nonexistent_file(self):
  98. """Test loading a nonexistent file"""
  99. config_path = Path(self.temp_dir) / 'nonexistent.json'
  100. with self.assertRaises(SystemExit):
  101. load_config(str(config_path))
  102. def test_load_config_with_validation_errors(self):
  103. """Test loading a config with validation errors"""
  104. config_data = {
  105. 'name': 'invalid@name', # Invalid name
  106. 'base_url': 'example.com' # Missing protocol
  107. }
  108. config_path = Path(self.temp_dir) / 'invalid_config.json'
  109. with open(config_path, 'w') as f:
  110. json.dump(config_data, f)
  111. with self.assertRaises(SystemExit):
  112. load_config(str(config_path))
  113. class TestRealConfigFiles(unittest.TestCase):
  114. """Test that real config files in the repository are valid"""
  115. def test_godot_config(self):
  116. """Test Godot config is valid"""
  117. config_path = 'configs/godot.json'
  118. if os.path.exists(config_path):
  119. config = load_config(config_path)
  120. errors, _ = validate_config(config)
  121. self.assertEqual(len(errors), 0, f"Godot config should be valid, got errors: {errors}")
  122. def test_react_config(self):
  123. """Test React config is valid"""
  124. config_path = 'configs/react.json'
  125. if os.path.exists(config_path):
  126. config = load_config(config_path)
  127. errors, _ = validate_config(config)
  128. self.assertEqual(len(errors), 0, f"React config should be valid, got errors: {errors}")
  129. def test_vue_config(self):
  130. """Test Vue config is valid"""
  131. config_path = 'configs/vue.json'
  132. if os.path.exists(config_path):
  133. config = load_config(config_path)
  134. errors, _ = validate_config(config)
  135. self.assertEqual(len(errors), 0, f"Vue config should be valid, got errors: {errors}")
  136. def test_django_config(self):
  137. """Test Django config is valid"""
  138. config_path = 'configs/django.json'
  139. if os.path.exists(config_path):
  140. config = load_config(config_path)
  141. errors, _ = validate_config(config)
  142. self.assertEqual(len(errors), 0, f"Django config should be valid, got errors: {errors}")
  143. def test_fastapi_config(self):
  144. """Test FastAPI config is valid"""
  145. config_path = 'configs/fastapi.json'
  146. if os.path.exists(config_path):
  147. config = load_config(config_path)
  148. errors, _ = validate_config(config)
  149. self.assertEqual(len(errors), 0, f"FastAPI config should be valid, got errors: {errors}")
  150. def test_steam_economy_config(self):
  151. """Test Steam Economy config is valid"""
  152. config_path = 'configs/steam-economy-complete.json'
  153. if os.path.exists(config_path):
  154. config = load_config(config_path)
  155. errors, _ = validate_config(config)
  156. self.assertEqual(len(errors), 0, f"Steam Economy config should be valid, got errors: {errors}")
  157. class TestURLProcessing(unittest.TestCase):
  158. """Test URL processing and validation"""
  159. def test_url_normalization(self):
  160. """Test URL normalization in converter"""
  161. config = {
  162. 'name': 'test',
  163. 'base_url': 'https://example.com/',
  164. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
  165. 'url_patterns': {'include': [], 'exclude': []},
  166. 'rate_limit': 0.1,
  167. 'max_pages': 10
  168. }
  169. converter = DocToSkillConverter(config, dry_run=True)
  170. # Base URL should be stored correctly
  171. self.assertEqual(converter.base_url, 'https://example.com/')
  172. def test_start_urls_fallback(self):
  173. """Test that start_urls defaults to base_url"""
  174. config = {
  175. 'name': 'test',
  176. 'base_url': 'https://example.com/',
  177. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
  178. 'rate_limit': 0.1,
  179. 'max_pages': 10
  180. }
  181. converter = DocToSkillConverter(config, dry_run=True)
  182. # Should have base_url in pending_urls
  183. self.assertEqual(len(converter.pending_urls), 1)
  184. self.assertEqual(converter.pending_urls[0], 'https://example.com/')
  185. def test_multiple_start_urls(self):
  186. """Test multiple start URLs"""
  187. config = {
  188. 'name': 'test',
  189. 'base_url': 'https://example.com/',
  190. 'start_urls': [
  191. 'https://example.com/guide/',
  192. 'https://example.com/api/',
  193. 'https://example.com/tutorial/'
  194. ],
  195. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
  196. 'rate_limit': 0.1,
  197. 'max_pages': 10
  198. }
  199. converter = DocToSkillConverter(config, dry_run=True)
  200. # Should have all start URLs in pending_urls
  201. self.assertEqual(len(converter.pending_urls), 3)
  202. class TestLlmsTxtIntegration(unittest.TestCase):
  203. """Test llms.txt integration into scraping workflow"""
  204. def test_scraper_has_llms_txt_attributes(self):
  205. """Test that scraper has llms.txt detection attributes"""
  206. config = {
  207. 'name': 'test-llms',
  208. 'base_url': 'https://hono.dev/docs',
  209. 'selectors': {
  210. 'main_content': 'article',
  211. 'title': 'h1',
  212. 'code_blocks': 'pre code'
  213. },
  214. 'max_pages': 50
  215. }
  216. scraper = DocToSkillConverter(config, dry_run=True)
  217. # Should have llms.txt attributes
  218. self.assertFalse(scraper.llms_txt_detected)
  219. self.assertIsNone(scraper.llms_txt_variant)
  220. def test_scraper_has_try_llms_txt_method(self):
  221. """Test that scraper has _try_llms_txt method"""
  222. config = {
  223. 'name': 'test-llms',
  224. 'base_url': 'https://hono.dev/docs',
  225. 'selectors': {
  226. 'main_content': 'article',
  227. 'title': 'h1',
  228. 'code_blocks': 'pre code'
  229. },
  230. 'max_pages': 50
  231. }
  232. scraper = DocToSkillConverter(config, dry_run=True)
  233. # Should have _try_llms_txt method
  234. self.assertTrue(hasattr(scraper, '_try_llms_txt'))
  235. self.assertTrue(callable(getattr(scraper, '_try_llms_txt')))
  236. class TestContentExtraction(unittest.TestCase):
  237. """Test content extraction functionality"""
  238. def setUp(self):
  239. """Set up test converter"""
  240. config = {
  241. 'name': 'test',
  242. 'base_url': 'https://example.com/',
  243. 'selectors': {
  244. 'main_content': 'article',
  245. 'title': 'h1',
  246. 'code_blocks': 'pre code'
  247. },
  248. 'rate_limit': 0.1,
  249. 'max_pages': 10
  250. }
  251. self.converter = DocToSkillConverter(config, dry_run=True)
  252. def test_extract_empty_content(self):
  253. """Test extracting from empty HTML"""
  254. from bs4 import BeautifulSoup
  255. html = '<html><body></body></html>'
  256. soup = BeautifulSoup(html, 'html.parser')
  257. page = self.converter.extract_content(soup, 'https://example.com/test')
  258. self.assertEqual(page['url'], 'https://example.com/test')
  259. self.assertEqual(page['title'], '')
  260. self.assertEqual(page['content'], '')
  261. self.assertEqual(len(page['code_samples']), 0)
  262. def test_extract_basic_content(self):
  263. """Test extracting basic content"""
  264. from bs4 import BeautifulSoup
  265. html = '''
  266. <html>
  267. <head><title>Test Page</title></head>
  268. <body>
  269. <article>
  270. <h1>Page Title</h1>
  271. <p>This is some content.</p>
  272. <p>This is more content with sufficient length to be included.</p>
  273. <pre><code class="language-python">print("hello")</code></pre>
  274. </article>
  275. </body>
  276. </html>
  277. '''
  278. soup = BeautifulSoup(html, 'html.parser')
  279. page = self.converter.extract_content(soup, 'https://example.com/test')
  280. self.assertEqual(page['url'], 'https://example.com/test')
  281. self.assertIn('Page Title', page['title'])
  282. self.assertIn('content', page['content'].lower())
  283. self.assertGreater(len(page['code_samples']), 0)
  284. self.assertEqual(page['code_samples'][0]['language'], 'python')
class TestFullLlmsTxtWorkflow(unittest.TestCase):
    """Test complete llms.txt workflow with mocked HTTP requests"""

    def setUp(self):
        """Set up test configuration and temporary directory"""
        self.temp_dir = tempfile.mkdtemp()
        # 'llms_txt_url' is set explicitly, so the scraper should report the
        # 'explicit' variant (asserted in test_full_llms_txt_workflow).
        self.config = {
            'name': 'test-e2e-llms',
            'base_url': 'https://hono.dev/docs',
            'llms_txt_url': 'https://hono.dev/llms-full.txt',
            'selectors': {
                'main_content': 'article',
                'title': 'h1',
                'code_blocks': 'pre code'
            },
            'max_pages': 50
        }
        # Sample llms.txt content for testing: three '#' top-level sections
        # with fenced code blocks in bash/javascript/typescript, so language
        # detection on code samples can be exercised.
        self.sample_llms_content = """# Getting Started
Welcome to the framework documentation. This is the introduction section.
## Installation
To install the framework, run the following command:
```bash
npm install hono
```
## Quick Start
Create a simple application:
```javascript
import { Hono } from 'hono'
const app = new Hono()
app.get('/', (c) => {
return c.text('Hello World!')
})
export default app
```
# API Reference
This section covers the API documentation for the framework.
## Context
The context object provides request and response handling:
```typescript
interface Context {
req: Request
res: Response
text: (text: string) => Response
}
```
# Middleware
Middleware functions run before route handlers.
## Built-in Middleware
The framework provides several built-in middleware functions:
```javascript
import { logger, cors } from 'hono/middleware'
app.use('*', logger())
app.use('*', cors())
```
"""

    def tearDown(self):
        """Clean up temporary directory and test output"""
        shutil.rmtree(self.temp_dir, ignore_errors=True)
        # Clean up test output directories
        shutil.rmtree(f"output/{self.config['name']}_data", ignore_errors=True)
        shutil.rmtree(f"output/{self.config['name']}", ignore_errors=True)

    def test_full_llms_txt_workflow(self):
        """Test complete workflow: config -> scrape (llms.txt) -> build -> verify"""
        from unittest.mock import patch, MagicMock
        import requests  # NOTE(review): unused — the patch below intercepts HTTP; consider removing

        # Mock the requests.get call for downloading llms.txt
        # NOTE(review): the patch target uses the 'cli.' package prefix while
        # the top-of-file import uses 'skill_seekers.cli.' — confirm both
        # names resolve to the same module object under the sys.path hack.
        with patch('cli.llms_txt_downloader.requests.get') as mock_get:
            # Configure mock response
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.text = self.sample_llms_content
            mock_response.raise_for_status = MagicMock()
            mock_get.return_value = mock_response
            # Create scraper and scrape (dry_run=False so output dirs exist)
            scraper = DocToSkillConverter(self.config, dry_run=False)
            scraper.scrape_all()
            # Verify llms.txt was detected
            self.assertTrue(scraper.llms_txt_detected,
                            "llms.txt should be detected")
            self.assertEqual(scraper.llms_txt_variant, 'explicit',
                             "Should use explicit variant from config")
            # Verify pages were parsed
            self.assertGreater(len(scraper.pages), 0,
                               "Should have parsed pages from llms.txt")
            # Verify page structure
            self.assertTrue(all('title' in page for page in scraper.pages),
                            "All pages should have titles")
            self.assertTrue(all('content' in page for page in scraper.pages),
                            "All pages should have content")
            self.assertTrue(any(len(page.get('code_samples', [])) > 0
                                for page in scraper.pages),
                            "At least one page should have code samples")
            # Verify code samples have language detection
            pages_with_code = [p for p in scraper.pages
                               if len(p.get('code_samples', [])) > 0]
            if pages_with_code:
                sample = pages_with_code[0]['code_samples'][0]
                self.assertIn('language', sample,
                              "Code samples should have language field")
                self.assertIn('code', sample,
                              "Code samples should have code field")
            # Build skill from the scraped pages
            scraper.build_skill()
            # Verify SKILL.md exists
            skill_md_path = Path(f"output/{self.config['name']}/SKILL.md")
            self.assertTrue(skill_md_path.exists(),
                            "SKILL.md should be created")
            # Verify SKILL.md content
            skill_content = skill_md_path.read_text()
            self.assertIn(self.config['name'], skill_content,
                          "SKILL.md should contain skill name")
            self.assertGreater(len(skill_content), 100,
                               "SKILL.md should have substantial content")
            # Verify references directory exists
            refs_dir = Path(f"output/{self.config['name']}/references")
            self.assertTrue(refs_dir.exists(),
                            "references directory should exist")
            # Verify at least index.md was created
            index_md = refs_dir / 'index.md'
            self.assertTrue(index_md.exists(),
                            "references/index.md should exist")
            # Verify reference files have content
            ref_files = list(refs_dir.glob('*.md'))
            self.assertGreater(len(ref_files), 0,
                               "Should have at least one reference file")
            # Verify data directory was created and has summary
            data_dir = Path(f"output/{self.config['name']}_data")
            self.assertTrue(data_dir.exists(),
                            "Data directory should exist")
            summary_path = data_dir / 'summary.json'
            self.assertTrue(summary_path.exists(),
                            "summary.json should exist")
            # Verify summary content records the llms.txt detection
            with open(summary_path) as f:
                summary = json.load(f)
            self.assertEqual(summary['name'], self.config['name'])
            self.assertGreater(summary['total_pages'], 0)
            self.assertIn('llms_txt_detected', summary)
            self.assertTrue(summary['llms_txt_detected'])

    def test_multi_variant_download(self):
        """Test downloading all 3 llms.txt variants"""
        from unittest.mock import patch, Mock
        config = {
            'name': 'test-multi-variant',
            'base_url': 'https://hono.dev/docs',
            'selectors': {
                'main_content': 'article',
                'title': 'h1',
                'code_blocks': 'pre code'
            },
            'max_pages': 50
        }
        # Mock all 3 variants (distinct lengths so truncation can be checked)
        sample_full = "# Full\n" + "x" * 1000
        sample_standard = "# Standard\n" + "x" * 200
        sample_small = "# Small\n" + "x" * 500
        # NOTE(review): same 'cli.' vs 'skill_seekers.cli.' patch-target
        # question as above — confirm these resolve to the imported modules.
        with patch('cli.llms_txt_detector.requests.head') as mock_head, \
             patch('cli.llms_txt_downloader.requests.get') as mock_get:
            # Mock detection (all exist)
            mock_head_response = Mock()
            mock_head_response.status_code = 200
            mock_head.return_value = mock_head_response
            # Mock downloads
            def mock_download(url, **kwargs):
                # Serve variant-specific content based on the requested URL.
                response = Mock()
                response.status_code = 200
                if 'llms-full.txt' in url:
                    response.text = sample_full
                elif 'llms-small.txt' in url:
                    response.text = sample_small
                else:  # llms.txt
                    response.text = sample_standard
                response.raise_for_status = Mock()
                return response
            mock_get.side_effect = mock_download
            # Run scraper (aliased import of the same converter class)
            from skill_seekers.cli.doc_scraper import DocToSkillConverter as DocumentationScraper
            scraper = DocumentationScraper(config, dry_run=False)
            # NOTE(review): return value unused — presumably truthy on success; confirm
            result = scraper._try_llms_txt()
            # Verify all 3 files created
            refs_dir = Path(f"output/{config['name']}/references")
            self.assertTrue(refs_dir.exists(), "references directory should exist")
            self.assertTrue((refs_dir / 'llms-full.md').exists(), "llms-full.md should exist")
            self.assertTrue((refs_dir / 'llms.md').exists(), "llms.md should exist")
            self.assertTrue((refs_dir / 'llms-small.md').exists(), "llms-small.md should exist")
            # Verify content not truncated
            full_content = (refs_dir / 'llms-full.md').read_text()
            self.assertEqual(len(full_content), len(sample_full))
        # Clean up (this config is not self.config, so tearDown won't cover it)
        shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True)
        shutil.rmtree(f"output/{config['name']}", ignore_errors=True)
  476. def test_no_content_truncation():
  477. """Test that content is NOT truncated in reference files"""
  478. from unittest.mock import Mock
  479. import tempfile
  480. config = {
  481. 'name': 'test-no-truncate',
  482. 'base_url': 'https://example.com/docs',
  483. 'selectors': {
  484. 'main_content': 'article',
  485. 'title': 'h1',
  486. 'code_blocks': 'pre code'
  487. },
  488. 'max_pages': 50
  489. }
  490. # Create scraper with long content
  491. from skill_seekers.cli.doc_scraper import DocToSkillConverter
  492. scraper = DocToSkillConverter(config, dry_run=False)
  493. # Create page with content > 2500 chars
  494. long_content = "x" * 5000
  495. long_code = "y" * 1000
  496. pages = [{
  497. 'title': 'Long Page',
  498. 'url': 'https://example.com/long',
  499. 'content': long_content,
  500. 'code_samples': [
  501. {'code': long_code, 'language': 'python'}
  502. ],
  503. 'headings': []
  504. }]
  505. # Create reference file
  506. scraper.create_reference_file('test', pages)
  507. # Verify no truncation
  508. ref_file = Path(f"output/{config['name']}/references/test.md")
  509. with open(ref_file, 'r') as f:
  510. content = f.read()
  511. assert long_content in content # Full content included
  512. assert long_code in content # Full code included
  513. assert '[Content truncated]' not in content
  514. assert '...' not in content or content.count('...') == 0
  515. # Clean up
  516. shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True)
  517. shutil.rmtree(f"output/{config['name']}", ignore_errors=True)
# Allow running this file directly; unittest discovers the TestCase classes.
# NOTE(review): unittest.main() does not collect the module-level
# test_no_content_truncation() function — only pytest runs it.
if __name__ == '__main__':
    unittest.main()