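# Stray extraction text (it duplicates the HTML fixture text used in
# TestPatternExtraction) and is not part of the module:
#   Example: Here's how to use it
#   print("hello")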
#!/usr/bin/env python3
"""
Test suite for doc_scraper core features
Tests URL validation, language detection, pattern extraction, and categorization
"""
import sys
import os
import unittest
from unittest.mock import Mock, MagicMock
from bs4 import BeautifulSoup
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from skill_seekers.cli.doc_scraper import DocToSkillConverter
class TestURLValidation(unittest.TestCase):
    """Exercise ``DocToSkillConverter.is_valid_url``.

    The shared converter built in ``setUp`` uses the base URL
    ``https://docs.example.com/`` with include patterns ``/guide/`` and
    ``/api/`` and exclude patterns ``/blog/`` and ``/about/``.
    """

    def setUp(self):
        """Build a dry-run converter with include and exclude URL patterns."""
        url_patterns = {
            'include': ['/guide/', '/api/'],
            'exclude': ['/blog/', '/about/'],
        }
        selectors = {
            'main_content': 'article',
            'title': 'h1',
            'code_blocks': 'pre code',
        }
        self.config = {
            'name': 'test',
            'base_url': 'https://docs.example.com/',
            'url_patterns': url_patterns,
            'selectors': selectors,
            'rate_limit': 0.1,
            'max_pages': 10,
        }
        self.converter = DocToSkillConverter(self.config, dry_run=True)

    def test_valid_url_with_include_pattern(self):
        """A URL containing the '/guide/' include pattern is expected to pass."""
        accepted = self.converter.is_valid_url(
            'https://docs.example.com/guide/getting-started'
        )
        self.assertTrue(accepted)

    def test_valid_url_with_api_pattern(self):
        """A URL containing the '/api/' include pattern is expected to pass."""
        accepted = self.converter.is_valid_url(
            'https://docs.example.com/api/reference'
        )
        self.assertTrue(accepted)

    def test_invalid_url_with_exclude_pattern(self):
        """A URL containing the '/blog/' exclude pattern is expected to fail."""
        accepted = self.converter.is_valid_url(
            'https://docs.example.com/blog/announcement'
        )
        self.assertFalse(accepted)

    def test_invalid_url_different_domain(self):
        """A URL on another domain is expected to fail despite its '/guide/' path."""
        accepted = self.converter.is_valid_url(
            'https://other-site.com/guide/tutorial'
        )
        self.assertFalse(accepted)

    def test_invalid_url_no_include_match(self):
        """A URL under the base URL that matches no include pattern is expected to fail."""
        accepted = self.converter.is_valid_url(
            'https://docs.example.com/download/installer'
        )
        self.assertFalse(accepted)

    def test_url_validation_no_patterns(self):
        """With empty include/exclude lists only the base URL decides the outcome."""
        unrestricted = {
            'name': 'test',
            'base_url': 'https://docs.example.com/',
            'url_patterns': {'include': [], 'exclude': []},
            'selectors': {
                'main_content': 'article',
                'title': 'h1',
                'code_blocks': 'pre',
            },
            'rate_limit': 0.1,
            'max_pages': 10,
        }
        open_converter = DocToSkillConverter(unrestricted, dry_run=True)

        # Anything under base_url is accepted; other hosts are still rejected.
        same_site = open_converter.is_valid_url('https://docs.example.com/anything')
        self.assertTrue(same_site)
        other_site = open_converter.is_valid_url('https://other.com/anything')
        self.assertFalse(other_site)
class TestLanguageDetection(unittest.TestCase):
    """Test language detection from code blocks.

    Every test parses a small HTML fixture with BeautifulSoup, takes the
    first ``<code>`` element and passes it, together with the code text, to
    ``DocToSkillConverter.detect_language``. The fixtures must therefore
    contain a real ``<code>`` element (optionally wrapped in ``<pre>``);
    plain text without markup would make ``find('code')`` return ``None``.
    """

    def setUp(self):
        """Set up a dry-run converter with a minimal configuration."""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
            'rate_limit': 0.1,
            'max_pages': 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def _code_element(self, html):
        """Parse *html* and return its first ``<code>`` element.

        Fails the test right away when the fixture contains no ``<code>``
        tag, so a broken fixture is reported as such instead of surfacing
        later as an ``AttributeError`` on ``None``.
        """
        elem = BeautifulSoup(html, 'html.parser').find('code')
        self.assertIsNotNone(elem, 'HTML fixture must contain a <code> element')
        return elem

    def _detect_from_text(self, html):
        """Detect the language of the first ``<code>`` in *html* using its own text."""
        elem = self._code_element(html)
        return self.converter.detect_language(elem, elem.get_text())

    # --- detection driven by class attributes -----------------------------

    def test_detect_language_from_class(self):
        """Test language detection from CSS class"""
        html = '<code class="language-python">print("hello")</code>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, 'print("hello")')
        self.assertEqual(lang, 'python')

    def test_detect_language_from_lang_class(self):
        """Test language detection from lang- prefix"""
        html = '<code class="lang-javascript">console.log("hello")</code>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, 'console.log("hello")')
        self.assertEqual(lang, 'javascript')

    def test_detect_language_from_parent(self):
        """Test language detection from parent pre element"""
        # The class lives on the wrapping <pre>, not on the <code> itself.
        html = '<pre class="language-cpp"><code>int main() {}</code></pre>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, 'int main() {}')
        self.assertEqual(lang, 'cpp')

    # --- detection driven by code content ---------------------------------

    def test_detect_python_from_heuristics(self):
        """Test Python detection from code content"""
        html = '<code>import os\nfrom pathlib import Path</code>'
        self.assertEqual(self._detect_from_text(html), 'python')

    def test_detect_python_from_def(self):
        """Test Python detection from def keyword"""
        html = '<code>def my_function():\n pass</code>'
        self.assertEqual(self._detect_from_text(html), 'python')

    def test_detect_javascript_from_const(self):
        """Test JavaScript detection from const keyword"""
        html = '<code>const myVar = 10;</code>'
        self.assertEqual(self._detect_from_text(html), 'javascript')

    def test_detect_javascript_from_arrow(self):
        """Test JavaScript detection from arrow function"""
        html = '<code>const add = (a, b) => a + b;</code>'
        self.assertEqual(self._detect_from_text(html), 'javascript')

    def test_detect_gdscript(self):
        """Test GDScript detection"""
        html = '<code>func _ready():\n var x = 5</code>'
        self.assertEqual(self._detect_from_text(html), 'gdscript')

    def test_detect_cpp(self):
        """Test C++ detection"""
        # The header name is entity-escaped so the HTML parser keeps
        # "<iostream>" as text instead of treating it as a tag.
        html = '<code>#include &lt;iostream&gt;\nint main() { return 0; }</code>'
        self.assertEqual(self._detect_from_text(html), 'cpp')

    def test_detect_unknown(self):
        """Test unknown language detection"""
        html = '<code>some random text without clear indicators</code>'
        self.assertEqual(self._detect_from_text(html), 'unknown')

    # --- alternative class spellings --------------------------------------

    def test_detect_brush_pattern_in_pre(self):
        """Test brush: pattern in pre element"""
        html = '<pre class="brush: python"><code>x</code></pre>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, 'x')
        self.assertEqual(lang, 'python', 'Should detect python from brush: python pattern')

    def test_detect_bare_class_in_pre(self):
        """Test bare class name in pre element"""
        html = '<pre class="python"><code>x</code></pre>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, 'x')
        self.assertEqual(lang, 'python', 'Should detect python from bare class name')

    def test_detect_bare_class_in_code(self):
        """Test bare class name in code element"""
        html = '<code class="python">x</code>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, 'x')
        self.assertEqual(lang, 'python', 'Should detect python from bare class name')

    # --- C# ----------------------------------------------------------------

    def test_detect_csharp_from_using_system(self):
        """Test C# detection from 'using System' keyword"""
        html = '<code>using System;\nnamespace MyApp { }</code>'
        lang = self._detect_from_text(html)
        self.assertEqual(lang, 'csharp', 'Should detect C# from using System')

    def test_detect_csharp_from_namespace(self):
        """Test C# detection from 'namespace' keyword"""
        html = '<code>namespace MyNamespace\n{\n public class Test { }\n}</code>'
        lang = self._detect_from_text(html)
        self.assertEqual(lang, 'csharp', 'Should detect C# from namespace')

    def test_detect_csharp_from_property_syntax(self):
        """Test C# detection from property syntax"""
        html = '<code>public string Name { get; set; }</code>'
        lang = self._detect_from_text(html)
        self.assertEqual(lang, 'csharp', 'Should detect C# from { get; set; } syntax')

    def test_detect_csharp_from_public_class(self):
        """Test C# detection from 'public class' keyword"""
        html = '<code>public class MyClass\n{\n private int value;\n}</code>'
        lang = self._detect_from_text(html)
        self.assertEqual(lang, 'csharp', 'Should detect C# from public class')

    def test_detect_csharp_from_private_class(self):
        """Test C# detection from 'private class' keyword"""
        html = '<code>private class Helper { }</code>'
        lang = self._detect_from_text(html)
        self.assertEqual(lang, 'csharp', 'Should detect C# from private class')

    def test_detect_csharp_from_public_static_void(self):
        """Test C# detection from 'public static void' keyword"""
        html = '<code>public static void Main(string[] args)\n{\n Console.WriteLine("Test");\n}</code>'
        lang = self._detect_from_text(html)
        self.assertEqual(lang, 'csharp', 'Should detect C# from public static void')

    def test_detect_csharp_from_class_attribute(self):
        """Test C# detection from CSS class attribute"""
        html = '<code class="language-csharp">var x = 5;</code>'
        lang = self._detect_from_text(html)
        self.assertEqual(lang, 'csharp', 'Should detect C# from language-csharp class')
class TestPatternExtraction(unittest.TestCase):
"""Test pattern extraction from documentation"""
def setUp(self):
    """Build the dry-run converter shared by the pattern-extraction tests."""
    selectors = {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'}
    settings = {
        'name': 'test',
        'base_url': 'https://example.com/',
        'selectors': selectors,
        'rate_limit': 0.1,
        'max_pages': 10,
    }
    self.converter = DocToSkillConverter(settings, dry_run=True)
def test_extract_pattern_with_example_marker(self):
"""Test pattern extraction with 'Example:' marker"""
html = '''
Example: Here's how to use it
print("hello")
Usage: Call this function like so
my_function(arg)
Example {i}: Test
code_{i}'
html += 'Content with links
Link 1 Link 2 Link 3