# test_skip_llms_txt.py
"""Tests for the skip_llms_txt configuration option.

This config option allows users to explicitly skip llms.txt detection and fetching,
which is useful when:
- A site's llms.txt is incomplete or incorrect
- You need specific pages that are not listed in llms.txt
- You want to force HTML scraping
"""
  8. import os
  9. import tempfile
  10. import unittest
  11. import logging
  12. from unittest.mock import patch, Mock, MagicMock
  13. from skill_seekers.cli.doc_scraper import DocToSkillConverter
  14. class TestSkipLlmsTxtConfig(unittest.TestCase):
  15. """Test skip_llms_txt configuration option."""
  16. def test_default_skip_llms_txt_is_false(self):
  17. """Test that skip_llms_txt defaults to False when not specified."""
  18. config = {
  19. 'name': 'test',
  20. 'base_url': 'https://example.com/',
  21. 'selectors': {'main_content': 'article'}
  22. }
  23. converter = DocToSkillConverter(config, dry_run=True)
  24. self.assertFalse(converter.skip_llms_txt)
  25. def test_skip_llms_txt_can_be_set_true(self):
  26. """Test that skip_llms_txt can be explicitly set to True."""
  27. config = {
  28. 'name': 'test',
  29. 'base_url': 'https://example.com/',
  30. 'selectors': {'main_content': 'article'},
  31. 'skip_llms_txt': True
  32. }
  33. converter = DocToSkillConverter(config, dry_run=True)
  34. self.assertTrue(converter.skip_llms_txt)
  35. def test_skip_llms_txt_can_be_set_false(self):
  36. """Test that skip_llms_txt can be explicitly set to False."""
  37. config = {
  38. 'name': 'test',
  39. 'base_url': 'https://example.com/',
  40. 'selectors': {'main_content': 'article'},
  41. 'skip_llms_txt': False
  42. }
  43. converter = DocToSkillConverter(config, dry_run=True)
  44. self.assertFalse(converter.skip_llms_txt)
  45. class TestSkipLlmsTxtSyncBehavior(unittest.TestCase):
  46. """Test skip_llms_txt behavior in sync scraping mode."""
  47. def test_llms_txt_tried_when_not_skipped(self):
  48. """Test that _try_llms_txt is called when skip_llms_txt is False."""
  49. config = {
  50. 'name': 'test',
  51. 'base_url': 'https://example.com/',
  52. 'selectors': {'main_content': 'article'},
  53. 'skip_llms_txt': False
  54. }
  55. original_cwd = os.getcwd()
  56. with tempfile.TemporaryDirectory() as tmpdir:
  57. try:
  58. os.chdir(tmpdir)
  59. converter = DocToSkillConverter(config, dry_run=False)
  60. with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try:
  61. with patch.object(converter, 'scrape_page'):
  62. with patch.object(converter, 'save_summary'):
  63. converter.scrape_all()
  64. mock_try.assert_called_once()
  65. finally:
  66. os.chdir(original_cwd)
  67. def test_llms_txt_skipped_when_skip_true(self):
  68. """Test that _try_llms_txt is NOT called when skip_llms_txt is True."""
  69. config = {
  70. 'name': 'test',
  71. 'base_url': 'https://example.com/',
  72. 'selectors': {'main_content': 'article'},
  73. 'skip_llms_txt': True
  74. }
  75. original_cwd = os.getcwd()
  76. with tempfile.TemporaryDirectory() as tmpdir:
  77. try:
  78. os.chdir(tmpdir)
  79. converter = DocToSkillConverter(config, dry_run=False)
  80. with patch.object(converter, '_try_llms_txt') as mock_try:
  81. with patch.object(converter, 'scrape_page'):
  82. with patch.object(converter, 'save_summary'):
  83. converter.scrape_all()
  84. mock_try.assert_not_called()
  85. finally:
  86. os.chdir(original_cwd)
  87. def test_llms_txt_skipped_in_dry_run_mode(self):
  88. """Test that _try_llms_txt is NOT called in dry-run mode regardless of skip setting."""
  89. config = {
  90. 'name': 'test',
  91. 'base_url': 'https://example.com/',
  92. 'selectors': {'main_content': 'article'},
  93. 'skip_llms_txt': False # Even when False
  94. }
  95. original_cwd = os.getcwd()
  96. with tempfile.TemporaryDirectory() as tmpdir:
  97. try:
  98. os.chdir(tmpdir)
  99. converter = DocToSkillConverter(config, dry_run=True)
  100. with patch.object(converter, '_try_llms_txt') as mock_try:
  101. with patch.object(converter, 'save_summary'):
  102. converter.scrape_all()
  103. mock_try.assert_not_called()
  104. finally:
  105. os.chdir(original_cwd)
  106. class TestSkipLlmsTxtAsyncBehavior(unittest.TestCase):
  107. """Test skip_llms_txt behavior in async scraping mode."""
  108. def test_async_llms_txt_tried_when_not_skipped(self):
  109. """Test that _try_llms_txt is called in async mode when skip_llms_txt is False."""
  110. config = {
  111. 'name': 'test',
  112. 'base_url': 'https://example.com/',
  113. 'selectors': {'main_content': 'article'},
  114. 'async_mode': True,
  115. 'skip_llms_txt': False
  116. }
  117. original_cwd = os.getcwd()
  118. with tempfile.TemporaryDirectory() as tmpdir:
  119. try:
  120. os.chdir(tmpdir)
  121. converter = DocToSkillConverter(config, dry_run=False)
  122. with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try:
  123. with patch.object(converter, 'scrape_page_async', return_value=None):
  124. with patch.object(converter, 'save_summary'):
  125. converter.scrape_all()
  126. mock_try.assert_called_once()
  127. finally:
  128. os.chdir(original_cwd)
  129. def test_async_llms_txt_skipped_when_skip_true(self):
  130. """Test that _try_llms_txt is NOT called in async mode when skip_llms_txt is True."""
  131. config = {
  132. 'name': 'test',
  133. 'base_url': 'https://example.com/',
  134. 'selectors': {'main_content': 'article'},
  135. 'async_mode': True,
  136. 'skip_llms_txt': True
  137. }
  138. original_cwd = os.getcwd()
  139. with tempfile.TemporaryDirectory() as tmpdir:
  140. try:
  141. os.chdir(tmpdir)
  142. converter = DocToSkillConverter(config, dry_run=False)
  143. with patch.object(converter, '_try_llms_txt') as mock_try:
  144. with patch.object(converter, 'scrape_page_async', return_value=None):
  145. with patch.object(converter, 'save_summary'):
  146. converter.scrape_all()
  147. mock_try.assert_not_called()
  148. finally:
  149. os.chdir(original_cwd)
  150. class TestSkipLlmsTxtWithRealConfig(unittest.TestCase):
  151. """Test skip_llms_txt with real-world config patterns."""
  152. def test_telegram_bots_config_pattern(self):
  153. """Test the telegram-bots config pattern which uses skip_llms_txt."""
  154. config = {
  155. 'name': 'telegram-bots',
  156. 'description': 'Telegram bot documentation',
  157. 'base_url': 'https://core.telegram.org/bots',
  158. 'skip_llms_txt': True, # Telegram doesn't have useful llms.txt
  159. 'start_urls': [
  160. 'https://core.telegram.org/bots',
  161. 'https://core.telegram.org/bots/api'
  162. ],
  163. 'selectors': {
  164. 'main_content': '#dev_page_content, main, article',
  165. 'title': 'h1, title',
  166. 'code_blocks': 'pre code, pre'
  167. }
  168. }
  169. converter = DocToSkillConverter(config, dry_run=True)
  170. self.assertTrue(converter.skip_llms_txt)
  171. self.assertEqual(converter.name, 'telegram-bots')
  172. def test_skip_llms_txt_with_multiple_start_urls(self):
  173. """Test skip_llms_txt works correctly with multiple start URLs."""
  174. config = {
  175. 'name': 'test-multi',
  176. 'base_url': 'https://example.com/',
  177. 'selectors': {'main_content': 'article'},
  178. 'skip_llms_txt': True,
  179. 'start_urls': [
  180. 'https://example.com/docs/',
  181. 'https://example.com/api/',
  182. 'https://example.com/guide/'
  183. ]
  184. }
  185. converter = DocToSkillConverter(config, dry_run=True)
  186. self.assertTrue(converter.skip_llms_txt)
  187. # start_urls are stored in pending_urls deque
  188. self.assertEqual(len(converter.pending_urls), 3)
  189. class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
  190. """Test edge cases for skip_llms_txt."""
  191. def test_skip_llms_txt_with_int_zero_logs_warning(self):
  192. """Test that integer 0 logs warning and defaults to False."""
  193. config = {
  194. 'name': 'test',
  195. 'base_url': 'https://example.com/',
  196. 'selectors': {'main_content': 'article'},
  197. 'skip_llms_txt': 0 # Invalid type
  198. }
  199. with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm:
  200. converter = DocToSkillConverter(config, dry_run=True)
  201. self.assertFalse(converter.skip_llms_txt)
  202. self.assertTrue(any('Invalid value' in log and '0' in log for log in cm.output))
  203. def test_skip_llms_txt_with_int_one_logs_warning(self):
  204. """Test that integer 1 logs warning and defaults to False."""
  205. config = {
  206. 'name': 'test',
  207. 'base_url': 'https://example.com/',
  208. 'selectors': {'main_content': 'article'},
  209. 'skip_llms_txt': 1 # Invalid type
  210. }
  211. with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm:
  212. converter = DocToSkillConverter(config, dry_run=True)
  213. self.assertFalse(converter.skip_llms_txt)
  214. self.assertTrue(any('Invalid value' in log and '1' in log for log in cm.output))
  215. def test_skip_llms_txt_with_string_logs_warning(self):
  216. """Test that string values log warning and default to False."""
  217. config = {
  218. 'name': 'test',
  219. 'base_url': 'https://example.com/',
  220. 'selectors': {'main_content': 'article'},
  221. 'skip_llms_txt': "true" # Invalid type
  222. }
  223. with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm:
  224. converter = DocToSkillConverter(config, dry_run=True)
  225. self.assertFalse(converter.skip_llms_txt)
  226. self.assertTrue(any('Invalid value' in log and 'true' in log for log in cm.output))
  227. def test_skip_llms_txt_with_none_logs_warning(self):
  228. """Test that None logs warning and defaults to False."""
  229. config = {
  230. 'name': 'test',
  231. 'base_url': 'https://example.com/',
  232. 'selectors': {'main_content': 'article'},
  233. 'skip_llms_txt': None # Invalid type
  234. }
  235. with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm:
  236. converter = DocToSkillConverter(config, dry_run=True)
  237. self.assertFalse(converter.skip_llms_txt)
  238. self.assertTrue(any('Invalid value' in log and 'None' in log for log in cm.output))
  239. def test_scraping_proceeds_when_llms_txt_skipped(self):
  240. """Test that HTML scraping proceeds normally when llms.txt is skipped."""
  241. config = {
  242. 'name': 'test',
  243. 'base_url': 'https://example.com/',
  244. 'selectors': {'main_content': 'article'},
  245. 'skip_llms_txt': True
  246. }
  247. original_cwd = os.getcwd()
  248. with tempfile.TemporaryDirectory() as tmpdir:
  249. try:
  250. os.chdir(tmpdir)
  251. converter = DocToSkillConverter(config, dry_run=False)
  252. # Track if scrape_page was called
  253. scrape_called = []
  254. def mock_scrape(url):
  255. scrape_called.append(url)
  256. return None
  257. with patch.object(converter, 'scrape_page', side_effect=mock_scrape):
  258. with patch.object(converter, 'save_summary'):
  259. converter.scrape_all()
  260. # Should have attempted to scrape the base URL
  261. self.assertTrue(len(scrape_called) > 0)
  262. finally:
  263. os.chdir(original_cwd)
  264. if __name__ == '__main__':
  265. unittest.main()