| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603 |
- #!/usr/bin/env python3
- """
- Tests for Unified Multi-Source Scraper
- Covers:
- - Config validation (unified vs legacy)
- - Conflict detection
- - Rule-based merging
- - Skill building
- """
- import os
- import sys
- import json
- import pytest
- import tempfile
- from pathlib import Path
- from skill_seekers.cli.config_validator import ConfigValidator, validate_config
- from skill_seekers.cli.conflict_detector import ConflictDetector, Conflict
- from skill_seekers.cli.merge_sources import RuleBasedMerger
- from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
- # ===========================
- # Config Validation Tests
- # ===========================
def test_detect_unified_format():
    """Check that ConfigValidator tells unified config files from legacy ones.

    Each config is written to a temporary JSON file and a validator is built
    from that file's path. ``is_unified`` is expected to be truthy for the
    config that carries a ``sources`` list and falsy for the flat legacy
    config.
    """
    unified_config = {
        "name": "test",
        "description": "Test skill",
        "sources": [
            {"type": "documentation", "base_url": "https://example.com"},
        ],
    }
    legacy_config = {
        "name": "test",
        "description": "Test skill",
        "base_url": "https://example.com",
    }

    def is_unified_on_disk(config):
        """Dump *config* to a temp JSON file and return the validator's ``is_unified``."""
        # delete=False lets the file be closed (and therefore flushed) before
        # its path is handed to the validator; it is removed explicitly below.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(config, f)
            config_path = f.name
        try:
            return ConfigValidator(config_path).is_unified
        finally:
            os.unlink(config_path)

    # Unified detection
    assert is_unified_on_disk(unified_config)
    # Legacy detection
    assert not is_unified_on_disk(legacy_config)
def test_validate_unified_sources():
    """A config with documentation, GitHub and PDF sources validates cleanly.

    ``validate()`` must not raise, and the validator's config must still hold
    all three source entries afterwards.
    """
    sources = [
        {"type": "documentation", "base_url": "https://example.com"},
        {"type": "github", "repo": "user/repo"},
        {"type": "pdf", "path": "/path/to.pdf"},
    ]
    config = {"name": "test", "description": "Test", "sources": sources}

    validator = ConfigValidator(config)
    validator.validate()

    kept_sources = validator.config['sources']
    assert len(kept_sources) == 3
def test_validate_invalid_source_type():
    """``validate()`` rejects a source whose ``type`` is not a known one."""
    bad_source = {"type": "invalid_type", "url": "https://example.com"}
    validator = ConfigValidator({
        "name": "test",
        "description": "Test",
        "sources": [bad_source],
    })

    # The error is expected from validate() itself, not from construction,
    # so only that call sits inside the raises-context.
    with pytest.raises(ValueError, match="Invalid type"):
        validator.validate()
def test_needs_api_merge():
    """Check when ``needs_api_merge()`` reports that an API merge is required.

    A config combining a documentation source (``extract_api``) with a GitHub
    source (``include_code``) is expected to need a merge; a config with a
    documentation source alone is expected not to.
    """
    # Config with both docs and GitHub code
    config_needs_merge = {
        "name": "test",
        "description": "Test",
        "sources": [
            {"type": "documentation", "base_url": "https://example.com", "extract_api": True},
            {"type": "github", "repo": "user/repo", "include_code": True},
        ],
    }
    assert ConfigValidator(config_needs_merge).needs_api_merge()

    # Config with only docs
    config_no_merge = {
        "name": "test",
        "description": "Test",
        "sources": [
            {"type": "documentation", "base_url": "https://example.com"},
        ],
    }
    assert not ConfigValidator(config_no_merge).needs_api_merge()
def test_backward_compatibility():
    """A flat legacy config converts to a unified config with one docs source.

    The converted config must contain a ``sources`` list holding exactly one
    entry of type ``documentation`` that carries the legacy ``base_url``.
    """
    legacy_config = {
        "name": "test",
        "description": "Test skill",
        "base_url": "https://example.com",
        "selectors": {"main_content": "article"},
        "max_pages": 100,
    }

    unified = ConfigValidator(legacy_config).convert_legacy_to_unified()

    assert 'sources' in unified
    converted_sources = unified['sources']
    assert len(converted_sources) == 1

    only_source = converted_sources[0]
    assert only_source['type'] == 'documentation'
    assert only_source['base_url'] == 'https://example.com'
- # ===========================
- # Conflict Detection Tests
- # ===========================
def test_detect_missing_in_docs():
    """A function found only in the analysed code is flagged ``missing_in_docs``.

    The docs describe ``documented_func`` while the code analysis contains
    only ``undocumented_func``; the latter must show up among the conflicts.
    """
    documented_api = {
        'name': 'documented_func',
        'parameters': [{'name': 'x', 'type': 'int'}],
        'return_type': 'str',
    }
    undocumented_function = {
        'name': 'undocumented_func',
        'parameters': [{'name': 'y', 'type_hint': 'float'}],
        'return_type': 'bool',
    }
    docs_data = {
        'pages': [{'url': 'https://example.com/api', 'apis': [documented_api]}],
    }
    github_data = {
        'code_analysis': {
            'analyzed_files': [{'functions': [undocumented_function]}],
        },
    }

    found = ConflictDetector(docs_data, github_data)._find_missing_in_docs()

    assert len(found) > 0
    assert 'missing_in_docs' in [conflict.type for conflict in found]
    assert 'undocumented_func' in [conflict.api_name for conflict in found]
def test_detect_missing_in_code():
    """A documented API with no analysed code is flagged ``missing_in_code``.

    The docs describe ``obsolete_func`` and the code analysis lists no files,
    so that API must show up among the conflicts.
    """
    obsolete_api = {
        'name': 'obsolete_func',
        'parameters': [{'name': 'x', 'type': 'int'}],
        'return_type': 'str',
    }
    docs_data = {
        'pages': [{'url': 'https://example.com/api', 'apis': [obsolete_api]}],
    }
    github_data = {'code_analysis': {'analyzed_files': []}}

    found = ConflictDetector(docs_data, github_data)._find_missing_in_code()

    assert len(found) > 0
    assert 'missing_in_code' in [conflict.type for conflict in found]
    assert 'obsolete_func' in [conflict.api_name for conflict in found]
def test_detect_signature_mismatch():
    """Differing parameter lists for the same name yield ``signature_mismatch``.

    ``func`` is documented with a single parameter ``x`` while the analysed
    code declares ``x`` plus a defaulted ``y``; a conflict for ``func`` must
    be reported.
    """
    documented_func = {
        'name': 'func',
        'parameters': [{'name': 'x', 'type': 'int'}],
        'return_type': 'str',
    }
    implemented_func = {
        'name': 'func',
        'parameters': [
            {'name': 'x', 'type_hint': 'int'},
            {'name': 'y', 'type_hint': 'bool', 'default': 'False'},
        ],
        'return_type': 'str',
    }
    docs_data = {
        'pages': [{'url': 'https://example.com/api', 'apis': [documented_func]}],
    }
    github_data = {
        'code_analysis': {
            'analyzed_files': [{'functions': [implemented_func]}],
        },
    }

    found = ConflictDetector(docs_data, github_data)._find_signature_mismatches()

    assert len(found) > 0
    assert 'signature_mismatch' in [conflict.type for conflict in found]
    assert 'func' in [conflict.api_name for conflict in found]
def test_conflict_severity():
    """A ``Conflict`` exposes the severity it was constructed with."""
    # High severity: missing_in_code
    documented_only = Conflict(
        type='missing_in_code',
        severity='high',
        api_name='test',
        docs_info={'name': 'test'},
        code_info=None,
        difference='API documented but not in code',
    )
    assert documented_only.severity == 'high'

    # Medium severity: missing_in_docs
    implemented_only = Conflict(
        type='missing_in_docs',
        severity='medium',
        api_name='test',
        docs_info=None,
        code_info={'name': 'test'},
        difference='API in code but not documented',
    )
    assert implemented_only.severity == 'medium'
- # ===========================
- # Merge Tests
- # ===========================
def test_rule_based_merge_docs_only():
    """An API that appears only in the docs is merged with status ``docs_only``.

    Conflicts are detected first and passed to the rule-based merger together
    with the docs and code data.
    """
    api_entry = {
        'name': 'docs_only_api',
        'parameters': [{'name': 'x', 'type': 'int'}],
        'return_type': 'str',
    }
    docs_data = {
        'pages': [{'url': 'https://example.com/api', 'apis': [api_entry]}],
    }
    github_data = {'code_analysis': {'analyzed_files': []}}

    conflicts = ConflictDetector(docs_data, github_data).detect_all_conflicts()
    merged = RuleBasedMerger(docs_data, github_data, conflicts).merge_all()

    assert 'apis' in merged
    merged_apis = merged['apis']
    assert 'docs_only_api' in merged_apis
    assert merged_apis['docs_only_api']['status'] == 'docs_only'
def test_rule_based_merge_code_only():
    """An API that appears only in the code is merged with status ``code_only``.

    The docs contain no pages; the code analysis contains one function.
    """
    code_function = {
        'name': 'code_only_api',
        'parameters': [{'name': 'y', 'type_hint': 'float'}],
        'return_type': 'bool',
    }
    docs_data = {'pages': []}
    github_data = {
        'code_analysis': {
            'analyzed_files': [{'functions': [code_function]}],
        },
    }

    conflicts = ConflictDetector(docs_data, github_data).detect_all_conflicts()
    merged = RuleBasedMerger(docs_data, github_data, conflicts).merge_all()

    assert 'apis' in merged
    merged_apis = merged['apis']
    assert 'code_only_api' in merged_apis
    assert merged_apis['code_only_api']['status'] == 'code_only'
def test_rule_based_merge_matched():
    """An API with the same signature in docs and code merges as ``matched``."""
    documented_api = {
        'name': 'matched_api',
        'parameters': [{'name': 'x', 'type': 'int'}],
        'return_type': 'str',
    }
    implemented_api = {
        'name': 'matched_api',
        'parameters': [{'name': 'x', 'type_hint': 'int'}],
        'return_type': 'str',
    }
    docs_data = {
        'pages': [{'url': 'https://example.com/api', 'apis': [documented_api]}],
    }
    github_data = {
        'code_analysis': {
            'analyzed_files': [{'functions': [implemented_api]}],
        },
    }

    conflicts = ConflictDetector(docs_data, github_data).detect_all_conflicts()
    merged = RuleBasedMerger(docs_data, github_data, conflicts).merge_all()

    assert 'apis' in merged
    merged_apis = merged['apis']
    assert 'matched_api' in merged_apis
    assert merged_apis['matched_api']['status'] == 'matched'
def test_merge_summary():
    """The merge result's ``summary`` counts APIs by where they were found.

    Two APIs exist only in the docs and one only in the code, so the summary
    must report three APIs in total, two ``docs_only`` and one ``code_only``.
    """
    documented_apis = [
        {'name': 'api1', 'parameters': [], 'return_type': 'str'},
        {'name': 'api2', 'parameters': [], 'return_type': 'int'},
    ]
    implemented_functions = [
        {'name': 'api3', 'parameters': [], 'return_type': 'bool'},
    ]
    docs_data = {
        'pages': [{'url': 'https://example.com/api', 'apis': documented_apis}],
    }
    github_data = {
        'code_analysis': {
            'analyzed_files': [{'functions': implemented_functions}],
        },
    }

    conflicts = ConflictDetector(docs_data, github_data).detect_all_conflicts()
    merged = RuleBasedMerger(docs_data, github_data, conflicts).merge_all()

    assert 'summary' in merged
    summary = merged['summary']
    assert summary['total_apis'] == 3
    assert summary['docs_only'] == 2
    assert summary['code_only'] == 1
- # ===========================
- # Skill Builder Tests
- # ===========================
def test_skill_builder_basic():
    """``_generate_skill_md()`` writes a SKILL.md naming and describing the skill.

    The builder's ``skill_dir`` is redirected to a temporary directory, and
    the generated file is checked for the skill name (case-insensitively) and
    the configured description.
    """
    config = {
        'name': 'test_skill',
        'description': 'Test skill description',
        'sources': [
            {'type': 'documentation', 'base_url': 'https://example.com'},
        ],
    }
    scraped_data = {
        'documentation': {'pages': [], 'data_file': '/tmp/test.json'},
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        builder = UnifiedSkillBuilder(config, scraped_data)
        # Override the output directory so the file lands in the temp dir.
        builder.skill_dir = tmpdir
        builder._generate_skill_md()

        # SKILL.md must have been created; read it before the dir is removed.
        skill_md = Path(tmpdir, 'SKILL.md')
        assert skill_md.exists()

        content = skill_md.read_text()
        assert 'test_skill' in content.lower()
        assert 'Test skill description' in content
def test_skill_builder_with_conflicts():
    """The generated SKILL.md reports the conflicts handed to the builder.

    One ``missing_in_code`` conflict is supplied; the file must mention the
    conflict count and the conflict type.
    """
    config = {
        'name': 'test_skill',
        'description': 'Test',
        'sources': [
            {'type': 'documentation', 'base_url': 'https://example.com'},
            {'type': 'github', 'repo': 'user/repo'},
        ],
    }
    scraped_data = {}
    single_conflict = Conflict(
        type='missing_in_code',
        severity='high',
        api_name='test_api',
        docs_info={'name': 'test_api'},
        code_info=None,
        difference='Test difference',
    )
    conflicts = [single_conflict]

    with tempfile.TemporaryDirectory() as tmpdir:
        builder = UnifiedSkillBuilder(config, scraped_data, conflicts=conflicts)
        # Redirect output into the temp dir before generating the file.
        builder.skill_dir = tmpdir
        builder._generate_skill_md()

        content = Path(tmpdir, 'SKILL.md').read_text()
        assert '1 conflicts detected' in content
        assert 'missing_in_code' in content
def test_skill_builder_merged_apis():
    """``_format_merged_apis()`` lists a matched API under the verified heading.

    The builder receives merged data with one ``matched`` API; the formatted
    text must contain the '✅ Verified APIs' heading text and the API's name.
    """
    config = {'name': 'test', 'description': 'Test', 'sources': []}
    scraped_data = {}
    matched_entry = {
        'name': 'test_api',
        'status': 'matched',
        'merged_signature': 'test_api(x: int) -> str',
        'merged_description': 'Test API',
        'source': 'both',
    }
    merged_data = {'apis': {'test_api': matched_entry}}

    with tempfile.TemporaryDirectory() as tmpdir:
        builder = UnifiedSkillBuilder(config, scraped_data, merged_data=merged_data)
        builder.skill_dir = tmpdir
        content = builder._format_merged_apis()

        assert '✅ Verified APIs' in content
        assert 'test_api' in content
- # ===========================
- # Integration Tests
- # ===========================
def test_full_workflow_unified_config():
    """Validate a full unified config and check the flags derived from it.

    The config pairs a documentation source (``extract_api``) with a GitHub
    source (``include_code``) under the ``rule-based`` merge mode. It must
    validate without raising, be detected as unified, and be reported as
    needing an API merge.
    """
    # Create test config
    config = {
        "name": "test_unified",
        "description": "Test unified workflow",
        "merge_mode": "rule-based",
        "sources": [
            {
                "type": "documentation",
                "base_url": "https://example.com",
                "extract_api": True,
            },
            {
                "type": "github",
                "repo": "user/repo",
                "include_code": True,
                "code_analysis_depth": "surface",
            },
        ],
    }

    # Validate config
    validator = ConfigValidator(config)
    validator.validate()

    assert validator.is_unified
    assert validator.needs_api_merge()
def test_config_file_validation():
    """``validate_config()`` accepts a config file path and detects unified format.

    A unified config is written to a temporary JSON file; the object returned
    by ``validate_config(path)`` must report ``is_unified`` as truthy.
    """
    config = {
        "name": "test",
        "description": "Test",
        "sources": [
            {"type": "documentation", "base_url": "https://example.com"},
        ],
    }

    # delete=False lets the file be closed (and therefore flushed) before its
    # path is handed to validate_config; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
        json.dump(config, f)
        config_path = f.name

    try:
        validator = validate_config(config_path)
        assert validator.is_unified
    finally:
        os.unlink(config_path)
# Run tests
if __name__ == '__main__':
    # pytest.main returns the run's exit status; pass it to sys.exit so that
    # running this file directly fails visibly (non-zero) when a test fails.
    sys.exit(pytest.main([__file__, '-v']))
|