| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318 |
- """Tests for skip_llms_txt configuration option.
- This config option allows users to explicitly skip llms.txt detection and fetching,
- which is useful when:
- - A site's llms.txt is incomplete or incorrect
- - You need specific pages not in llms.txt
- - You want to force HTML scraping
- """
- import os
- import tempfile
- import unittest
- import logging
- from unittest.mock import patch, Mock, MagicMock
- from skill_seekers.cli.doc_scraper import DocToSkillConverter
class TestSkipLlmsTxtConfig(unittest.TestCase):
    """Check how the ``skip_llms_txt`` config key maps onto the converter attribute."""

    def _build_converter(self, **extra):
        """Return a dry-run converter for a minimal config plus ``extra`` keys.

        A fresh dict is built on every call so tests never share config state.
        """
        settings = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        settings.update(extra)
        return DocToSkillConverter(settings, dry_run=True)

    def test_default_skip_llms_txt_is_false(self):
        """A config without the ``skip_llms_txt`` key yields a falsy attribute."""
        subject = self._build_converter()
        self.assertFalse(subject.skip_llms_txt)

    def test_skip_llms_txt_can_be_set_true(self):
        """An explicit ``skip_llms_txt: True`` is reflected on the converter."""
        subject = self._build_converter(skip_llms_txt=True)
        self.assertTrue(subject.skip_llms_txt)

    def test_skip_llms_txt_can_be_set_false(self):
        """An explicit ``skip_llms_txt: False`` is reflected on the converter."""
        subject = self._build_converter(skip_llms_txt=False)
        self.assertFalse(subject.skip_llms_txt)
class TestSkipLlmsTxtSyncBehavior(unittest.TestCase):
    """Test skip_llms_txt behavior in sync scraping mode.

    Every test runs with the process working directory switched to a
    throwaway temporary directory (see ``setUp``).
    NOTE(review): presumably this is because the converter creates output
    relative to the cwd -- confirm against DocToSkillConverter.
    """

    def setUp(self):
        """Switch into a fresh temporary working directory for the test.

        Cleanups run last-in-first-out, so the original cwd is restored
        *before* the temporary directory is deleted; the directory we are
        standing in is never removed from under us.
        """
        original_cwd = os.getcwd()
        tmpdir = tempfile.TemporaryDirectory()
        self.addCleanup(tmpdir.cleanup)
        os.chdir(tmpdir.name)
        self.addCleanup(os.chdir, original_cwd)

    @staticmethod
    def _make_config(skip_llms_txt):
        """Return a minimal sync-mode config with the given ``skip_llms_txt`` value."""
        return {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': skip_llms_txt,
        }

    def test_llms_txt_tried_when_not_skipped(self):
        """Test that _try_llms_txt is called when skip_llms_txt is False."""
        converter = DocToSkillConverter(self._make_config(False), dry_run=False)

        # _try_llms_txt returns False so scrape_all continues past the
        # llms.txt attempt; page scraping and summary writing are stubbed out.
        with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try, \
                patch.object(converter, 'scrape_page'), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()

        mock_try.assert_called_once()

    def test_llms_txt_skipped_when_skip_true(self):
        """Test that _try_llms_txt is NOT called when skip_llms_txt is True."""
        converter = DocToSkillConverter(self._make_config(True), dry_run=False)

        with patch.object(converter, '_try_llms_txt') as mock_try, \
                patch.object(converter, 'scrape_page'), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()

        mock_try.assert_not_called()

    def test_llms_txt_skipped_in_dry_run_mode(self):
        """Test that _try_llms_txt is NOT called in dry-run mode regardless of skip setting."""
        # skip_llms_txt is False here on purpose: dry-run alone must be
        # enough to suppress the llms.txt attempt.
        converter = DocToSkillConverter(self._make_config(False), dry_run=True)

        # scrape_page is deliberately left unpatched, as in the original test.
        with patch.object(converter, '_try_llms_txt') as mock_try, \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()

        mock_try.assert_not_called()
class TestSkipLlmsTxtAsyncBehavior(unittest.TestCase):
    """Test skip_llms_txt behavior in async scraping mode.

    Every test runs with the process working directory switched to a
    throwaway temporary directory (see ``setUp``).
    NOTE(review): presumably this is because the converter creates output
    relative to the cwd -- confirm against DocToSkillConverter.
    """

    def setUp(self):
        """Switch into a fresh temporary working directory for the test.

        Cleanups run last-in-first-out, so the original cwd is restored
        *before* the temporary directory is deleted.
        """
        original_cwd = os.getcwd()
        tmpdir = tempfile.TemporaryDirectory()
        self.addCleanup(tmpdir.cleanup)
        os.chdir(tmpdir.name)
        self.addCleanup(os.chdir, original_cwd)

    @staticmethod
    def _make_config(skip_llms_txt):
        """Return a minimal async-mode config with the given ``skip_llms_txt`` value."""
        return {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': True,
            'skip_llms_txt': skip_llms_txt,
        }

    def test_async_llms_txt_tried_when_not_skipped(self):
        """Test that _try_llms_txt is called in async mode when skip_llms_txt is False."""
        converter = DocToSkillConverter(self._make_config(False), dry_run=False)

        # _try_llms_txt returns False so scrape_all continues past the
        # llms.txt attempt; async page scraping and summary writing are stubbed.
        with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try, \
                patch.object(converter, 'scrape_page_async', return_value=None), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()

        mock_try.assert_called_once()

    def test_async_llms_txt_skipped_when_skip_true(self):
        """Test that _try_llms_txt is NOT called in async mode when skip_llms_txt is True."""
        converter = DocToSkillConverter(self._make_config(True), dry_run=False)

        with patch.object(converter, '_try_llms_txt') as mock_try, \
                patch.object(converter, 'scrape_page_async', return_value=None), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()

        mock_try.assert_not_called()
class TestSkipLlmsTxtWithRealConfig(unittest.TestCase):
    """Exercise skip_llms_txt with configs shaped like real-world ones."""

    def test_telegram_bots_config_pattern(self):
        """A telegram-bots style config keeps skip_llms_txt enabled and its name."""
        entry_points = [
            'https://core.telegram.org/bots',
            'https://core.telegram.org/bots/api',
        ]
        css_selectors = {
            'main_content': '#dev_page_content, main, article',
            'title': 'h1, title',
            'code_blocks': 'pre code, pre',
        }
        telegram_config = {
            'name': 'telegram-bots',
            'description': 'Telegram bot documentation',
            'base_url': 'https://core.telegram.org/bots',
            'skip_llms_txt': True,  # Telegram doesn't have useful llms.txt
            'start_urls': entry_points,
            'selectors': css_selectors,
        }

        subject = DocToSkillConverter(telegram_config, dry_run=True)

        self.assertTrue(subject.skip_llms_txt)
        self.assertEqual(subject.name, 'telegram-bots')

    def test_skip_llms_txt_with_multiple_start_urls(self):
        """skip_llms_txt stays enabled and all three start URLs are queued."""
        seed_urls = [
            'https://example.com/docs/',
            'https://example.com/api/',
            'https://example.com/guide/',
        ]
        multi_config = {
            'name': 'test-multi',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': True,
            'start_urls': seed_urls,
        }

        subject = DocToSkillConverter(multi_config, dry_run=True)

        self.assertTrue(subject.skip_llms_txt)
        # start_urls are stored in pending_urls deque
        queued_count = len(subject.pending_urls)
        self.assertEqual(queued_count, 3)
class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
    """Test edge cases for skip_llms_txt."""

    # Logger on which the invalid-value warning is expected to be emitted.
    _SCRAPER_LOGGER = 'skill_seekers.cli.doc_scraper'

    def _assert_invalid_value_falls_back(self, bad_value, expected_fragment):
        """Assert that a non-bool ``skip_llms_txt`` warns and falls back to False.

        Args:
            bad_value: The invalid value stored under ``skip_llms_txt``.
            expected_fragment: Text that must appear, together with
                ``'Invalid value'``, in at least one captured log line.
        """
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': bad_value,  # Invalid type
        }

        # assertLogs itself fails the test if no WARNING (or higher) record
        # is emitted on the scraper logger while the converter is built.
        with self.assertLogs(self._SCRAPER_LOGGER, level='WARNING') as captured:
            converter = DocToSkillConverter(config, dry_run=True)

        self.assertFalse(converter.skip_llms_txt)
        self.assertTrue(
            any(
                'Invalid value' in line and expected_fragment in line
                for line in captured.output
            ),
            msg=(
                f"no 'Invalid value' warning mentioning {expected_fragment!r} "
                f"found in {captured.output!r}"
            ),
        )

    def test_skip_llms_txt_with_int_zero_logs_warning(self):
        """Test that integer 0 logs warning and defaults to False."""
        self._assert_invalid_value_falls_back(0, '0')

    def test_skip_llms_txt_with_int_one_logs_warning(self):
        """Test that integer 1 logs warning and defaults to False."""
        self._assert_invalid_value_falls_back(1, '1')

    def test_skip_llms_txt_with_string_logs_warning(self):
        """Test that string values log warning and default to False."""
        self._assert_invalid_value_falls_back("true", 'true')

    def test_skip_llms_txt_with_none_logs_warning(self):
        """Test that None logs warning and defaults to False."""
        self._assert_invalid_value_falls_back(None, 'None')

    def test_scraping_proceeds_when_llms_txt_skipped(self):
        """Test that HTML scraping proceeds normally when llms.txt is skipped."""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': True,
        }

        # Run from a throwaway working directory. Cleanups are LIFO, so the
        # original cwd is restored before the temporary directory is removed.
        original_cwd = os.getcwd()
        tmpdir = tempfile.TemporaryDirectory()
        self.addCleanup(tmpdir.cleanup)
        os.chdir(tmpdir.name)
        self.addCleanup(os.chdir, original_cwd)

        converter = DocToSkillConverter(config, dry_run=False)

        # Track every URL handed to scrape_page.
        scraped_urls = []

        def record_url(url):
            scraped_urls.append(url)
            return None

        with patch.object(converter, 'scrape_page', side_effect=record_url), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()

        # Should have attempted to scrape the base URL
        self.assertTrue(scraped_urls, msg='scrape_page was never called')
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|