#!/usr/bin/env python3
"""
Tests for Unified Multi-Source Scraper

Covers:
- Config validation (unified vs legacy)
- Conflict detection
- Rule-based merging
- Skill building
"""
  10. import os
  11. import sys
  12. import json
  13. import pytest
  14. import tempfile
  15. from pathlib import Path
  16. from skill_seekers.cli.config_validator import ConfigValidator, validate_config
  17. from skill_seekers.cli.conflict_detector import ConflictDetector, Conflict
  18. from skill_seekers.cli.merge_sources import RuleBasedMerger
  19. from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
# ===========================
# Config Validation Tests
# ===========================
  23. def test_detect_unified_format():
  24. """Test unified format detection"""
  25. import tempfile
  26. import json
  27. unified_config = {
  28. "name": "test",
  29. "description": "Test skill",
  30. "sources": [
  31. {"type": "documentation", "base_url": "https://example.com"}
  32. ]
  33. }
  34. legacy_config = {
  35. "name": "test",
  36. "description": "Test skill",
  37. "base_url": "https://example.com"
  38. }
  39. # Test unified detection
  40. with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
  41. json.dump(unified_config, f)
  42. config_path = f.name
  43. try:
  44. validator = ConfigValidator(config_path)
  45. assert validator.is_unified == True
  46. finally:
  47. os.unlink(config_path)
  48. # Test legacy detection
  49. with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
  50. json.dump(legacy_config, f)
  51. config_path = f.name
  52. try:
  53. validator = ConfigValidator(config_path)
  54. assert validator.is_unified == False
  55. finally:
  56. os.unlink(config_path)
  57. def test_validate_unified_sources():
  58. """Test source type validation"""
  59. config = {
  60. "name": "test",
  61. "description": "Test",
  62. "sources": [
  63. {"type": "documentation", "base_url": "https://example.com"},
  64. {"type": "github", "repo": "user/repo"},
  65. {"type": "pdf", "path": "/path/to.pdf"}
  66. ]
  67. }
  68. validator = ConfigValidator(config)
  69. validator.validate()
  70. assert len(validator.config['sources']) == 3
  71. def test_validate_invalid_source_type():
  72. """Test invalid source type raises error"""
  73. config = {
  74. "name": "test",
  75. "description": "Test",
  76. "sources": [
  77. {"type": "invalid_type", "url": "https://example.com"}
  78. ]
  79. }
  80. validator = ConfigValidator(config)
  81. with pytest.raises(ValueError, match="Invalid type"):
  82. validator.validate()
  83. def test_needs_api_merge():
  84. """Test API merge detection"""
  85. # Config with both docs and GitHub code
  86. config_needs_merge = {
  87. "name": "test",
  88. "description": "Test",
  89. "sources": [
  90. {"type": "documentation", "base_url": "https://example.com", "extract_api": True},
  91. {"type": "github", "repo": "user/repo", "include_code": True}
  92. ]
  93. }
  94. validator = ConfigValidator(config_needs_merge)
  95. assert validator.needs_api_merge() == True
  96. # Config with only docs
  97. config_no_merge = {
  98. "name": "test",
  99. "description": "Test",
  100. "sources": [
  101. {"type": "documentation", "base_url": "https://example.com"}
  102. ]
  103. }
  104. validator = ConfigValidator(config_no_merge)
  105. assert validator.needs_api_merge() == False
  106. def test_backward_compatibility():
  107. """Test legacy config conversion"""
  108. legacy_config = {
  109. "name": "test",
  110. "description": "Test skill",
  111. "base_url": "https://example.com",
  112. "selectors": {"main_content": "article"},
  113. "max_pages": 100
  114. }
  115. validator = ConfigValidator(legacy_config)
  116. unified = validator.convert_legacy_to_unified()
  117. assert 'sources' in unified
  118. assert len(unified['sources']) == 1
  119. assert unified['sources'][0]['type'] == 'documentation'
  120. assert unified['sources'][0]['base_url'] == 'https://example.com'
# ===========================
# Conflict Detection Tests
# ===========================
  124. def test_detect_missing_in_docs():
  125. """Test detection of APIs missing in documentation"""
  126. docs_data = {
  127. 'pages': [
  128. {
  129. 'url': 'https://example.com/api',
  130. 'apis': [
  131. {
  132. 'name': 'documented_func',
  133. 'parameters': [{'name': 'x', 'type': 'int'}],
  134. 'return_type': 'str'
  135. }
  136. ]
  137. }
  138. ]
  139. }
  140. github_data = {
  141. 'code_analysis': {
  142. 'analyzed_files': [
  143. {
  144. 'functions': [
  145. {
  146. 'name': 'undocumented_func',
  147. 'parameters': [{'name': 'y', 'type_hint': 'float'}],
  148. 'return_type': 'bool'
  149. }
  150. ]
  151. }
  152. ]
  153. }
  154. }
  155. detector = ConflictDetector(docs_data, github_data)
  156. conflicts = detector._find_missing_in_docs()
  157. assert len(conflicts) > 0
  158. assert any(c.type == 'missing_in_docs' for c in conflicts)
  159. assert any(c.api_name == 'undocumented_func' for c in conflicts)
  160. def test_detect_missing_in_code():
  161. """Test detection of APIs missing in code"""
  162. docs_data = {
  163. 'pages': [
  164. {
  165. 'url': 'https://example.com/api',
  166. 'apis': [
  167. {
  168. 'name': 'obsolete_func',
  169. 'parameters': [{'name': 'x', 'type': 'int'}],
  170. 'return_type': 'str'
  171. }
  172. ]
  173. }
  174. ]
  175. }
  176. github_data = {
  177. 'code_analysis': {
  178. 'analyzed_files': []
  179. }
  180. }
  181. detector = ConflictDetector(docs_data, github_data)
  182. conflicts = detector._find_missing_in_code()
  183. assert len(conflicts) > 0
  184. assert any(c.type == 'missing_in_code' for c in conflicts)
  185. assert any(c.api_name == 'obsolete_func' for c in conflicts)
  186. def test_detect_signature_mismatch():
  187. """Test detection of signature mismatches"""
  188. docs_data = {
  189. 'pages': [
  190. {
  191. 'url': 'https://example.com/api',
  192. 'apis': [
  193. {
  194. 'name': 'func',
  195. 'parameters': [{'name': 'x', 'type': 'int'}],
  196. 'return_type': 'str'
  197. }
  198. ]
  199. }
  200. ]
  201. }
  202. github_data = {
  203. 'code_analysis': {
  204. 'analyzed_files': [
  205. {
  206. 'functions': [
  207. {
  208. 'name': 'func',
  209. 'parameters': [
  210. {'name': 'x', 'type_hint': 'int'},
  211. {'name': 'y', 'type_hint': 'bool', 'default': 'False'}
  212. ],
  213. 'return_type': 'str'
  214. }
  215. ]
  216. }
  217. ]
  218. }
  219. }
  220. detector = ConflictDetector(docs_data, github_data)
  221. conflicts = detector._find_signature_mismatches()
  222. assert len(conflicts) > 0
  223. assert any(c.type == 'signature_mismatch' for c in conflicts)
  224. assert any(c.api_name == 'func' for c in conflicts)
  225. def test_conflict_severity():
  226. """Test conflict severity assignment"""
  227. # High severity: missing_in_code
  228. conflict_high = Conflict(
  229. type='missing_in_code',
  230. severity='high',
  231. api_name='test',
  232. docs_info={'name': 'test'},
  233. code_info=None,
  234. difference='API documented but not in code'
  235. )
  236. assert conflict_high.severity == 'high'
  237. # Medium severity: missing_in_docs
  238. conflict_medium = Conflict(
  239. type='missing_in_docs',
  240. severity='medium',
  241. api_name='test',
  242. docs_info=None,
  243. code_info={'name': 'test'},
  244. difference='API in code but not documented'
  245. )
  246. assert conflict_medium.severity == 'medium'
# ===========================
# Merge Tests
# ===========================
  250. def test_rule_based_merge_docs_only():
  251. """Test rule-based merge for docs-only APIs"""
  252. docs_data = {
  253. 'pages': [
  254. {
  255. 'url': 'https://example.com/api',
  256. 'apis': [
  257. {
  258. 'name': 'docs_only_api',
  259. 'parameters': [{'name': 'x', 'type': 'int'}],
  260. 'return_type': 'str'
  261. }
  262. ]
  263. }
  264. ]
  265. }
  266. github_data = {'code_analysis': {'analyzed_files': []}}
  267. detector = ConflictDetector(docs_data, github_data)
  268. conflicts = detector.detect_all_conflicts()
  269. merger = RuleBasedMerger(docs_data, github_data, conflicts)
  270. merged = merger.merge_all()
  271. assert 'apis' in merged
  272. assert 'docs_only_api' in merged['apis']
  273. assert merged['apis']['docs_only_api']['status'] == 'docs_only'
  274. def test_rule_based_merge_code_only():
  275. """Test rule-based merge for code-only APIs"""
  276. docs_data = {'pages': []}
  277. github_data = {
  278. 'code_analysis': {
  279. 'analyzed_files': [
  280. {
  281. 'functions': [
  282. {
  283. 'name': 'code_only_api',
  284. 'parameters': [{'name': 'y', 'type_hint': 'float'}],
  285. 'return_type': 'bool'
  286. }
  287. ]
  288. }
  289. ]
  290. }
  291. }
  292. detector = ConflictDetector(docs_data, github_data)
  293. conflicts = detector.detect_all_conflicts()
  294. merger = RuleBasedMerger(docs_data, github_data, conflicts)
  295. merged = merger.merge_all()
  296. assert 'apis' in merged
  297. assert 'code_only_api' in merged['apis']
  298. assert merged['apis']['code_only_api']['status'] == 'code_only'
  299. def test_rule_based_merge_matched():
  300. """Test rule-based merge for matched APIs"""
  301. docs_data = {
  302. 'pages': [
  303. {
  304. 'url': 'https://example.com/api',
  305. 'apis': [
  306. {
  307. 'name': 'matched_api',
  308. 'parameters': [{'name': 'x', 'type': 'int'}],
  309. 'return_type': 'str'
  310. }
  311. ]
  312. }
  313. ]
  314. }
  315. github_data = {
  316. 'code_analysis': {
  317. 'analyzed_files': [
  318. {
  319. 'functions': [
  320. {
  321. 'name': 'matched_api',
  322. 'parameters': [{'name': 'x', 'type_hint': 'int'}],
  323. 'return_type': 'str'
  324. }
  325. ]
  326. }
  327. ]
  328. }
  329. }
  330. detector = ConflictDetector(docs_data, github_data)
  331. conflicts = detector.detect_all_conflicts()
  332. merger = RuleBasedMerger(docs_data, github_data, conflicts)
  333. merged = merger.merge_all()
  334. assert 'apis' in merged
  335. assert 'matched_api' in merged['apis']
  336. assert merged['apis']['matched_api']['status'] == 'matched'
  337. def test_merge_summary():
  338. """Test merge summary statistics"""
  339. docs_data = {
  340. 'pages': [
  341. {
  342. 'url': 'https://example.com/api',
  343. 'apis': [
  344. {'name': 'api1', 'parameters': [], 'return_type': 'str'},
  345. {'name': 'api2', 'parameters': [], 'return_type': 'int'}
  346. ]
  347. }
  348. ]
  349. }
  350. github_data = {
  351. 'code_analysis': {
  352. 'analyzed_files': [
  353. {
  354. 'functions': [
  355. {'name': 'api3', 'parameters': [], 'return_type': 'bool'}
  356. ]
  357. }
  358. ]
  359. }
  360. }
  361. detector = ConflictDetector(docs_data, github_data)
  362. conflicts = detector.detect_all_conflicts()
  363. merger = RuleBasedMerger(docs_data, github_data, conflicts)
  364. merged = merger.merge_all()
  365. assert 'summary' in merged
  366. assert merged['summary']['total_apis'] == 3
  367. assert merged['summary']['docs_only'] == 2
  368. assert merged['summary']['code_only'] == 1
# ===========================
# Skill Builder Tests
# ===========================
  372. def test_skill_builder_basic():
  373. """Test basic skill building"""
  374. config = {
  375. 'name': 'test_skill',
  376. 'description': 'Test skill description',
  377. 'sources': [
  378. {'type': 'documentation', 'base_url': 'https://example.com'}
  379. ]
  380. }
  381. scraped_data = {
  382. 'documentation': {
  383. 'pages': [],
  384. 'data_file': '/tmp/test.json'
  385. }
  386. }
  387. with tempfile.TemporaryDirectory() as tmpdir:
  388. # Override output directory
  389. builder = UnifiedSkillBuilder(config, scraped_data)
  390. builder.skill_dir = tmpdir
  391. builder._generate_skill_md()
  392. # Check SKILL.md was created
  393. skill_md = Path(tmpdir) / 'SKILL.md'
  394. assert skill_md.exists()
  395. content = skill_md.read_text()
  396. assert 'test_skill' in content.lower()
  397. assert 'Test skill description' in content
  398. def test_skill_builder_with_conflicts():
  399. """Test skill building with conflicts"""
  400. config = {
  401. 'name': 'test_skill',
  402. 'description': 'Test',
  403. 'sources': [
  404. {'type': 'documentation', 'base_url': 'https://example.com'},
  405. {'type': 'github', 'repo': 'user/repo'}
  406. ]
  407. }
  408. scraped_data = {}
  409. conflicts = [
  410. Conflict(
  411. type='missing_in_code',
  412. severity='high',
  413. api_name='test_api',
  414. docs_info={'name': 'test_api'},
  415. code_info=None,
  416. difference='Test difference'
  417. )
  418. ]
  419. with tempfile.TemporaryDirectory() as tmpdir:
  420. builder = UnifiedSkillBuilder(config, scraped_data, conflicts=conflicts)
  421. builder.skill_dir = tmpdir
  422. builder._generate_skill_md()
  423. skill_md = Path(tmpdir) / 'SKILL.md'
  424. content = skill_md.read_text()
  425. assert '1 conflicts detected' in content
  426. assert 'missing_in_code' in content
  427. def test_skill_builder_merged_apis():
  428. """Test skill building with merged APIs"""
  429. config = {
  430. 'name': 'test',
  431. 'description': 'Test',
  432. 'sources': []
  433. }
  434. scraped_data = {}
  435. merged_data = {
  436. 'apis': {
  437. 'test_api': {
  438. 'name': 'test_api',
  439. 'status': 'matched',
  440. 'merged_signature': 'test_api(x: int) -> str',
  441. 'merged_description': 'Test API',
  442. 'source': 'both'
  443. }
  444. }
  445. }
  446. with tempfile.TemporaryDirectory() as tmpdir:
  447. builder = UnifiedSkillBuilder(config, scraped_data, merged_data=merged_data)
  448. builder.skill_dir = tmpdir
  449. content = builder._format_merged_apis()
  450. assert '✅ Verified APIs' in content
  451. assert 'test_api' in content
# ===========================
# Integration Tests
# ===========================
  455. def test_full_workflow_unified_config():
  456. """Test complete workflow with unified config"""
  457. # Create test config
  458. config = {
  459. "name": "test_unified",
  460. "description": "Test unified workflow",
  461. "merge_mode": "rule-based",
  462. "sources": [
  463. {
  464. "type": "documentation",
  465. "base_url": "https://example.com",
  466. "extract_api": True
  467. },
  468. {
  469. "type": "github",
  470. "repo": "user/repo",
  471. "include_code": True,
  472. "code_analysis_depth": "surface"
  473. }
  474. ]
  475. }
  476. # Validate config
  477. validator = ConfigValidator(config)
  478. validator.validate()
  479. assert validator.is_unified == True
  480. assert validator.needs_api_merge() == True
  481. def test_config_file_validation():
  482. """Test validation from config file"""
  483. with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
  484. config = {
  485. "name": "test",
  486. "description": "Test",
  487. "sources": [
  488. {"type": "documentation", "base_url": "https://example.com"}
  489. ]
  490. }
  491. json.dump(config, f)
  492. config_path = f.name
  493. try:
  494. validator = validate_config(config_path)
  495. assert validator.is_unified == True
  496. finally:
  497. os.unlink(config_path)
  498. # Run tests
  499. if __name__ == '__main__':
  500. pytest.main([__file__, '-v'])