# test_scraper_features.py (21 KB)
  1. #!/usr/bin/env python3
  2. """
  3. Test suite for doc_scraper core features
  4. Tests URL validation, language detection, pattern extraction, and categorization
  5. """
  6. import sys
  7. import os
  8. import unittest
  9. from unittest.mock import Mock, MagicMock
  10. from bs4 import BeautifulSoup
  11. # Add parent directory to path
  12. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  13. from skill_seekers.cli.doc_scraper import DocToSkillConverter
  14. class TestURLValidation(unittest.TestCase):
  15. """Test URL validation logic"""
  16. def setUp(self):
  17. """Set up test converter"""
  18. self.config = {
  19. 'name': 'test',
  20. 'base_url': 'https://docs.example.com/',
  21. 'url_patterns': {
  22. 'include': ['/guide/', '/api/'],
  23. 'exclude': ['/blog/', '/about/']
  24. },
  25. 'selectors': {
  26. 'main_content': 'article',
  27. 'title': 'h1',
  28. 'code_blocks': 'pre code'
  29. },
  30. 'rate_limit': 0.1,
  31. 'max_pages': 10
  32. }
  33. self.converter = DocToSkillConverter(self.config, dry_run=True)
  34. def test_valid_url_with_include_pattern(self):
  35. """Test URL matching include pattern"""
  36. url = 'https://docs.example.com/guide/getting-started'
  37. self.assertTrue(self.converter.is_valid_url(url))
  38. def test_valid_url_with_api_pattern(self):
  39. """Test URL matching API pattern"""
  40. url = 'https://docs.example.com/api/reference'
  41. self.assertTrue(self.converter.is_valid_url(url))
  42. def test_invalid_url_with_exclude_pattern(self):
  43. """Test URL matching exclude pattern"""
  44. url = 'https://docs.example.com/blog/announcement'
  45. self.assertFalse(self.converter.is_valid_url(url))
  46. def test_invalid_url_different_domain(self):
  47. """Test URL from different domain"""
  48. url = 'https://other-site.com/guide/tutorial'
  49. self.assertFalse(self.converter.is_valid_url(url))
  50. def test_invalid_url_no_include_match(self):
  51. """Test URL not matching any include pattern"""
  52. url = 'https://docs.example.com/download/installer'
  53. self.assertFalse(self.converter.is_valid_url(url))
  54. def test_url_validation_no_patterns(self):
  55. """Test URL validation with no include/exclude patterns"""
  56. config = {
  57. 'name': 'test',
  58. 'base_url': 'https://docs.example.com/',
  59. 'url_patterns': {
  60. 'include': [],
  61. 'exclude': []
  62. },
  63. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
  64. 'rate_limit': 0.1,
  65. 'max_pages': 10
  66. }
  67. converter = DocToSkillConverter(config, dry_run=True)
  68. # Should accept any URL under base_url
  69. self.assertTrue(converter.is_valid_url('https://docs.example.com/anything'))
  70. self.assertFalse(converter.is_valid_url('https://other.com/anything'))
  71. class TestLanguageDetection(unittest.TestCase):
  72. """Test language detection from code blocks"""
  73. def setUp(self):
  74. """Set up test converter"""
  75. config = {
  76. 'name': 'test',
  77. 'base_url': 'https://example.com/',
  78. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
  79. 'rate_limit': 0.1,
  80. 'max_pages': 10
  81. }
  82. self.converter = DocToSkillConverter(config, dry_run=True)
  83. def test_detect_language_from_class(self):
  84. """Test language detection from CSS class"""
  85. html = '<code class="language-python">print("hello")</code>'
  86. elem = BeautifulSoup(html, 'html.parser').find('code')
  87. lang = self.converter.detect_language(elem, 'print("hello")')
  88. self.assertEqual(lang, 'python')
  89. def test_detect_language_from_lang_class(self):
  90. """Test language detection from lang- prefix"""
  91. html = '<code class="lang-javascript">console.log("hello")</code>'
  92. elem = BeautifulSoup(html, 'html.parser').find('code')
  93. lang = self.converter.detect_language(elem, 'console.log("hello")')
  94. self.assertEqual(lang, 'javascript')
  95. def test_detect_language_from_parent(self):
  96. """Test language detection from parent pre element"""
  97. html = '<pre class="language-cpp"><code>int main() {}</code></pre>'
  98. elem = BeautifulSoup(html, 'html.parser').find('code')
  99. lang = self.converter.detect_language(elem, 'int main() {}')
  100. self.assertEqual(lang, 'cpp')
  101. def test_detect_python_from_heuristics(self):
  102. """Test Python detection from code content"""
  103. html = '<code>import os\nfrom pathlib import Path</code>'
  104. elem = BeautifulSoup(html, 'html.parser').find('code')
  105. code = elem.get_text()
  106. lang = self.converter.detect_language(elem, code)
  107. self.assertEqual(lang, 'python')
  108. def test_detect_python_from_def(self):
  109. """Test Python detection from def keyword"""
  110. html = '<code>def my_function():\n pass</code>'
  111. elem = BeautifulSoup(html, 'html.parser').find('code')
  112. code = elem.get_text()
  113. lang = self.converter.detect_language(elem, code)
  114. self.assertEqual(lang, 'python')
  115. def test_detect_javascript_from_const(self):
  116. """Test JavaScript detection from const keyword"""
  117. html = '<code>const myVar = 10;</code>'
  118. elem = BeautifulSoup(html, 'html.parser').find('code')
  119. code = elem.get_text()
  120. lang = self.converter.detect_language(elem, code)
  121. self.assertEqual(lang, 'javascript')
  122. def test_detect_javascript_from_arrow(self):
  123. """Test JavaScript detection from arrow function"""
  124. html = '<code>const add = (a, b) => a + b;</code>'
  125. elem = BeautifulSoup(html, 'html.parser').find('code')
  126. code = elem.get_text()
  127. lang = self.converter.detect_language(elem, code)
  128. self.assertEqual(lang, 'javascript')
  129. def test_detect_gdscript(self):
  130. """Test GDScript detection"""
  131. html = '<code>func _ready():\n var x = 5</code>'
  132. elem = BeautifulSoup(html, 'html.parser').find('code')
  133. code = elem.get_text()
  134. lang = self.converter.detect_language(elem, code)
  135. self.assertEqual(lang, 'gdscript')
  136. def test_detect_cpp(self):
  137. """Test C++ detection"""
  138. html = '<code>#include <iostream>\nint main() { return 0; }</code>'
  139. elem = BeautifulSoup(html, 'html.parser').find('code')
  140. code = elem.get_text()
  141. lang = self.converter.detect_language(elem, code)
  142. self.assertEqual(lang, 'cpp')
  143. def test_detect_unknown(self):
  144. """Test unknown language detection"""
  145. html = '<code>some random text without clear indicators</code>'
  146. elem = BeautifulSoup(html, 'html.parser').find('code')
  147. code = elem.get_text()
  148. lang = self.converter.detect_language(elem, code)
  149. self.assertEqual(lang, 'unknown')
  150. def test_detect_brush_pattern_in_pre(self):
  151. """Test brush: pattern in pre element"""
  152. html = '<pre class="brush: python"><code>x</code></pre>'
  153. elem = BeautifulSoup(html, 'html.parser').find('code')
  154. lang = self.converter.detect_language(elem, 'x')
  155. self.assertEqual(lang, 'python', 'Should detect python from brush: python pattern')
  156. def test_detect_bare_class_in_pre(self):
  157. """Test bare class name in pre element"""
  158. html = '<pre class="python"><code>x</code></pre>'
  159. elem = BeautifulSoup(html, 'html.parser').find('code')
  160. lang = self.converter.detect_language(elem, 'x')
  161. self.assertEqual(lang, 'python', 'Should detect python from bare class name')
  162. def test_detect_bare_class_in_code(self):
  163. """Test bare class name in code element"""
  164. html = '<code class="python">x</code>'
  165. elem = BeautifulSoup(html, 'html.parser').find('code')
  166. lang = self.converter.detect_language(elem, 'x')
  167. self.assertEqual(lang, 'python', 'Should detect python from bare class name')
  168. def test_detect_csharp_from_using_system(self):
  169. """Test C# detection from 'using System' keyword"""
  170. html = '<code>using System;\nnamespace MyApp { }</code>'
  171. elem = BeautifulSoup(html, 'html.parser').find('code')
  172. code = elem.get_text()
  173. lang = self.converter.detect_language(elem, code)
  174. self.assertEqual(lang, 'csharp', 'Should detect C# from using System')
  175. def test_detect_csharp_from_namespace(self):
  176. """Test C# detection from 'namespace' keyword"""
  177. html = '<code>namespace MyNamespace\n{\n public class Test { }\n}</code>'
  178. elem = BeautifulSoup(html, 'html.parser').find('code')
  179. code = elem.get_text()
  180. lang = self.converter.detect_language(elem, code)
  181. self.assertEqual(lang, 'csharp', 'Should detect C# from namespace')
  182. def test_detect_csharp_from_property_syntax(self):
  183. """Test C# detection from property syntax"""
  184. html = '<code>public string Name { get; set; }</code>'
  185. elem = BeautifulSoup(html, 'html.parser').find('code')
  186. code = elem.get_text()
  187. lang = self.converter.detect_language(elem, code)
  188. self.assertEqual(lang, 'csharp', 'Should detect C# from { get; set; } syntax')
  189. def test_detect_csharp_from_public_class(self):
  190. """Test C# detection from 'public class' keyword"""
  191. html = '<code>public class MyClass\n{\n private int value;\n}</code>'
  192. elem = BeautifulSoup(html, 'html.parser').find('code')
  193. code = elem.get_text()
  194. lang = self.converter.detect_language(elem, code)
  195. self.assertEqual(lang, 'csharp', 'Should detect C# from public class')
  196. def test_detect_csharp_from_private_class(self):
  197. """Test C# detection from 'private class' keyword"""
  198. html = '<code>private class Helper { }</code>'
  199. elem = BeautifulSoup(html, 'html.parser').find('code')
  200. code = elem.get_text()
  201. lang = self.converter.detect_language(elem, code)
  202. self.assertEqual(lang, 'csharp', 'Should detect C# from private class')
  203. def test_detect_csharp_from_public_static_void(self):
  204. """Test C# detection from 'public static void' keyword"""
  205. html = '<code>public static void Main(string[] args)\n{\n Console.WriteLine("Test");\n}</code>'
  206. elem = BeautifulSoup(html, 'html.parser').find('code')
  207. code = elem.get_text()
  208. lang = self.converter.detect_language(elem, code)
  209. self.assertEqual(lang, 'csharp', 'Should detect C# from public static void')
  210. def test_detect_csharp_from_class_attribute(self):
  211. """Test C# detection from CSS class attribute"""
  212. html = '<code class="language-csharp">var x = 5;</code>'
  213. elem = BeautifulSoup(html, 'html.parser').find('code')
  214. code = elem.get_text()
  215. lang = self.converter.detect_language(elem, code)
  216. self.assertEqual(lang, 'csharp', 'Should detect C# from language-csharp class')
  217. class TestPatternExtraction(unittest.TestCase):
  218. """Test pattern extraction from documentation"""
  219. def setUp(self):
  220. """Set up test converter"""
  221. config = {
  222. 'name': 'test',
  223. 'base_url': 'https://example.com/',
  224. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
  225. 'rate_limit': 0.1,
  226. 'max_pages': 10
  227. }
  228. self.converter = DocToSkillConverter(config, dry_run=True)
  229. def test_extract_pattern_with_example_marker(self):
  230. """Test pattern extraction with 'Example:' marker"""
  231. html = '''
  232. <article>
  233. <p>Example: Here's how to use it</p>
  234. <pre><code>print("hello")</code></pre>
  235. </article>
  236. '''
  237. soup = BeautifulSoup(html, 'html.parser')
  238. main = soup.find('article')
  239. patterns = self.converter.extract_patterns(main, [])
  240. self.assertGreater(len(patterns), 0)
  241. self.assertIn('example', patterns[0]['description'].lower())
  242. def test_extract_pattern_with_usage_marker(self):
  243. """Test pattern extraction with 'Usage:' marker"""
  244. html = '''
  245. <article>
  246. <p>Usage: Call this function like so</p>
  247. <pre><code>my_function(arg)</code></pre>
  248. </article>
  249. '''
  250. soup = BeautifulSoup(html, 'html.parser')
  251. main = soup.find('article')
  252. patterns = self.converter.extract_patterns(main, [])
  253. self.assertGreater(len(patterns), 0)
  254. self.assertIn('usage', patterns[0]['description'].lower())
  255. def test_extract_pattern_limit(self):
  256. """Test pattern extraction limits to 5 patterns"""
  257. html = '<article>'
  258. for i in range(10):
  259. html += f'<p>Example {i}: Test</p><pre><code>code_{i}</code></pre>'
  260. html += '</article>'
  261. soup = BeautifulSoup(html, 'html.parser')
  262. main = soup.find('article')
  263. patterns = self.converter.extract_patterns(main, [])
  264. self.assertLessEqual(len(patterns), 5, "Should limit to 5 patterns max")
  265. class TestCategorization(unittest.TestCase):
  266. """Test smart categorization logic"""
  267. def setUp(self):
  268. """Set up test converter"""
  269. config = {
  270. 'name': 'test',
  271. 'base_url': 'https://example.com/',
  272. 'categories': {
  273. 'getting_started': ['intro', 'tutorial', 'getting-started'],
  274. 'api': ['api', 'reference', 'class'],
  275. 'guides': ['guide', 'how-to']
  276. },
  277. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
  278. 'rate_limit': 0.1,
  279. 'max_pages': 10
  280. }
  281. self.converter = DocToSkillConverter(config, dry_run=True)
  282. def test_categorize_by_url(self):
  283. """Test categorization based on URL"""
  284. pages = [{
  285. 'url': 'https://example.com/api/reference',
  286. 'title': 'Some Title',
  287. 'content': 'Some content'
  288. }]
  289. categories = self.converter.smart_categorize(pages)
  290. # Should categorize to 'api' based on URL containing 'api'
  291. self.assertIn('api', categories)
  292. self.assertEqual(len(categories['api']), 1)
  293. def test_categorize_by_title(self):
  294. """Test categorization based on title"""
  295. pages = [{
  296. 'url': 'https://example.com/docs/page',
  297. 'title': 'API Reference Documentation',
  298. 'content': 'Some content'
  299. }]
  300. categories = self.converter.smart_categorize(pages)
  301. self.assertIn('api', categories)
  302. self.assertEqual(len(categories['api']), 1)
  303. def test_categorize_by_content(self):
  304. """Test categorization based on content (lower priority)"""
  305. pages = [{
  306. 'url': 'https://example.com/docs/page',
  307. 'title': 'Some Page',
  308. 'content': 'This is a tutorial for beginners. An intro to the system.'
  309. }]
  310. categories = self.converter.smart_categorize(pages)
  311. # Should categorize based on 'tutorial' and 'intro' in content
  312. self.assertIn('getting_started', categories)
  313. def test_categorize_to_other(self):
  314. """Test pages that don't match any category go to 'other'"""
  315. pages = [{
  316. 'url': 'https://example.com/random/page',
  317. 'title': 'Random Page',
  318. 'content': 'Random content with no keywords'
  319. }]
  320. categories = self.converter.smart_categorize(pages)
  321. self.assertIn('other', categories)
  322. self.assertEqual(len(categories['other']), 1)
  323. def test_empty_categories_removed(self):
  324. """Test empty categories are removed"""
  325. pages = [{
  326. 'url': 'https://example.com/api/reference',
  327. 'title': 'API Reference',
  328. 'content': 'API documentation'
  329. }]
  330. categories = self.converter.smart_categorize(pages)
  331. # Only 'api' should exist, not empty 'guides' or 'getting_started'
  332. # (categories with no pages are removed)
  333. self.assertIn('api', categories)
  334. self.assertNotIn('guides', categories)
  335. class TestLinkExtraction(unittest.TestCase):
  336. """Test link extraction and anchor fragment handling"""
  337. def setUp(self):
  338. """Set up test converter"""
  339. config = {
  340. 'name': 'test',
  341. 'base_url': 'https://example.com/',
  342. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre code'},
  343. 'url_patterns': {
  344. 'include': [],
  345. 'exclude': []
  346. },
  347. 'rate_limit': 0.1,
  348. 'max_pages': 10
  349. }
  350. self.converter = DocToSkillConverter(config, dry_run=True)
  351. def test_extract_links_strips_anchor_fragments(self):
  352. """Test that anchor fragments (#anchor) are stripped from extracted links"""
  353. html = '''
  354. <article>
  355. <h1>Test Page</h1>
  356. <p>Content with links</p>
  357. <a href="https://example.com/docs/page.html#section1">Link 1</a>
  358. <a href="https://example.com/docs/page.html#section2">Link 2</a>
  359. <a href="https://example.com/docs/other.html">Link 3</a>
  360. </article>
  361. '''
  362. soup = BeautifulSoup(html, 'html.parser')
  363. page = self.converter.extract_content(soup, 'https://example.com/')
  364. # Should have 2 unique URLs (page.html and other.html), not 3
  365. # The two links with different anchors should be deduplicated
  366. self.assertEqual(len(page['links']), 2)
  367. self.assertIn('https://example.com/docs/page.html', page['links'])
  368. self.assertIn('https://example.com/docs/other.html', page['links'])
  369. def test_extract_links_no_anchor_duplicates(self):
  370. """Test that multiple anchor links to same page don't create duplicates"""
  371. html = '''
  372. <article>
  373. <h1>Test Page</h1>
  374. <a href="https://example.com/docs/api.html#cb1-1">Anchor 1</a>
  375. <a href="https://example.com/docs/api.html#cb1-2">Anchor 2</a>
  376. <a href="https://example.com/docs/api.html#cb1-3">Anchor 3</a>
  377. <a href="https://example.com/docs/api.html#cb1-4">Anchor 4</a>
  378. <a href="https://example.com/docs/api.html#cb1-5">Anchor 5</a>
  379. </article>
  380. '''
  381. soup = BeautifulSoup(html, 'html.parser')
  382. page = self.converter.extract_content(soup, 'https://example.com/')
  383. # All 5 links point to the same page, should result in only 1 URL
  384. self.assertEqual(len(page['links']), 1)
  385. self.assertEqual(page['links'][0], 'https://example.com/docs/api.html')
  386. def test_extract_links_preserves_query_params(self):
  387. """Test that query parameters are preserved when stripping anchors"""
  388. html = '''
  389. <article>
  390. <h1>Test Page</h1>
  391. <a href="https://example.com/search?q=test#result1">Search Result</a>
  392. </article>
  393. '''
  394. soup = BeautifulSoup(html, 'html.parser')
  395. page = self.converter.extract_content(soup, 'https://example.com/')
  396. # Query params should be preserved, only anchor stripped
  397. self.assertEqual(len(page['links']), 1)
  398. self.assertEqual(page['links'][0], 'https://example.com/search?q=test')
  399. def test_extract_links_relative_urls_with_anchors(self):
  400. """Test that relative URLs with anchors are handled correctly"""
  401. html = '''
  402. <article>
  403. <h1>Test Page</h1>
  404. <a href="/docs/guide.html#intro">Relative Link 1</a>
  405. <a href="/docs/guide.html#advanced">Relative Link 2</a>
  406. <a href="/docs/tutorial.html#start">Relative Link 3</a>
  407. </article>
  408. '''
  409. soup = BeautifulSoup(html, 'html.parser')
  410. page = self.converter.extract_content(soup, 'https://example.com/')
  411. # Should have 2 unique URLs (guide.html and tutorial.html)
  412. self.assertEqual(len(page['links']), 2)
  413. self.assertIn('https://example.com/docs/guide.html', page['links'])
  414. self.assertIn('https://example.com/docs/tutorial.html', page['links'])
  415. class TestTextCleaning(unittest.TestCase):
  416. """Test text cleaning utility"""
  417. def setUp(self):
  418. """Set up test converter"""
  419. config = {
  420. 'name': 'test',
  421. 'base_url': 'https://example.com/',
  422. 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre'},
  423. 'rate_limit': 0.1,
  424. 'max_pages': 10
  425. }
  426. self.converter = DocToSkillConverter(config, dry_run=True)
  427. def test_clean_multiple_spaces(self):
  428. """Test cleaning multiple spaces"""
  429. text = "Hello world test"
  430. cleaned = self.converter.clean_text(text)
  431. self.assertEqual(cleaned, "Hello world test")
  432. def test_clean_newlines(self):
  433. """Test cleaning newlines"""
  434. text = "Hello\n\nworld\ntest"
  435. cleaned = self.converter.clean_text(text)
  436. self.assertEqual(cleaned, "Hello world test")
  437. def test_clean_tabs(self):
  438. """Test cleaning tabs"""
  439. text = "Hello\t\tworld\ttest"
  440. cleaned = self.converter.clean_text(text)
  441. self.assertEqual(cleaned, "Hello world test")
  442. def test_clean_strip_whitespace(self):
  443. """Test stripping leading/trailing whitespace"""
  444. text = " Hello world "
  445. cleaned = self.converter.clean_text(text)
  446. self.assertEqual(cleaned, "Hello world")
  447. if __name__ == '__main__':
  448. unittest.main()