test_llms_txt_downloader.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. import pytest
  2. from unittest.mock import patch, Mock
  3. import requests
  4. from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
  5. def test_successful_download():
  6. """Test successful download with valid markdown content"""
  7. downloader = LlmsTxtDownloader("https://example.com/llms.txt")
  8. mock_response = Mock()
  9. mock_response.text = "# Header\n\nSome content with markdown patterns.\n\n## Subheader\n\n- List item\n- Another item\n\n```python\ncode_block()\n```\n" + "x" * 200
  10. mock_response.raise_for_status = Mock()
  11. with patch('requests.get', return_value=mock_response) as mock_get:
  12. content = downloader.download()
  13. assert content is not None
  14. assert len(content) > 100
  15. assert isinstance(content, str)
  16. assert "# Header" in content
  17. mock_get.assert_called_once()
  18. def test_timeout_with_retry():
  19. """Test timeout scenario with retry logic"""
  20. downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
  21. with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get:
  22. with patch('time.sleep') as mock_sleep: # Mock sleep to speed up test
  23. content = downloader.download()
  24. assert content is None
  25. assert mock_get.call_count == 2 # Should retry once (2 total attempts)
  26. assert mock_sleep.call_count == 1 # Should sleep once between retries
  27. def test_empty_content_rejection():
  28. """Test rejection of content shorter than 100 chars"""
  29. downloader = LlmsTxtDownloader("https://example.com/llms.txt")
  30. mock_response = Mock()
  31. mock_response.text = "# Short"
  32. mock_response.raise_for_status = Mock()
  33. with patch('requests.get', return_value=mock_response):
  34. content = downloader.download()
  35. assert content is None
  36. def test_non_markdown_rejection():
  37. """Test rejection of content that doesn't look like markdown"""
  38. downloader = LlmsTxtDownloader("https://example.com/llms.txt")
  39. mock_response = Mock()
  40. mock_response.text = "Plain text without any markdown patterns at all. " * 10
  41. mock_response.raise_for_status = Mock()
  42. with patch('requests.get', return_value=mock_response):
  43. content = downloader.download()
  44. assert content is None
  45. def test_http_error_handling():
  46. """Test handling of HTTP errors (404, 500, etc.)"""
  47. downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
  48. mock_response = Mock()
  49. mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found")
  50. with patch('requests.get', return_value=mock_response) as mock_get:
  51. with patch('time.sleep'):
  52. content = downloader.download()
  53. assert content is None
  54. assert mock_get.call_count == 2 # Should retry once
  55. def test_exponential_backoff():
  56. """Test that exponential backoff delays are correct"""
  57. downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=3)
  58. with patch('requests.get', side_effect=requests.Timeout("Connection timeout")):
  59. with patch('time.sleep') as mock_sleep:
  60. content = downloader.download()
  61. assert content is None
  62. # Should sleep with delays: 1s, 2s (2^0, 2^1)
  63. assert mock_sleep.call_count == 2
  64. mock_sleep.assert_any_call(1) # First retry delay
  65. mock_sleep.assert_any_call(2) # Second retry delay
  66. def test_markdown_validation():
  67. """Test markdown pattern detection"""
  68. downloader = LlmsTxtDownloader("https://example.com/llms.txt")
  69. # Test various markdown patterns
  70. assert downloader._is_markdown("# Header")
  71. assert downloader._is_markdown("## Subheader")
  72. assert downloader._is_markdown("```code```")
  73. assert downloader._is_markdown("- list item")
  74. assert downloader._is_markdown("* bullet point")
  75. assert downloader._is_markdown("`inline code`")
  76. # Test non-markdown content
  77. assert not downloader._is_markdown("Plain text without any markdown patterns")
  78. def test_custom_timeout():
  79. """Test custom timeout parameter"""
  80. downloader = LlmsTxtDownloader("https://example.com/llms.txt", timeout=10)
  81. mock_response = Mock()
  82. mock_response.text = "# Header\n\nContent " * 50
  83. mock_response.raise_for_status = Mock()
  84. with patch('requests.get', return_value=mock_response) as mock_get:
  85. content = downloader.download()
  86. assert content is not None
  87. # Verify timeout was passed to requests.get
  88. call_kwargs = mock_get.call_args[1]
  89. assert call_kwargs['timeout'] == 10
  90. def test_custom_max_retries():
  91. """Test custom max_retries parameter"""
  92. downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=5)
  93. with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get:
  94. with patch('time.sleep'):
  95. content = downloader.download()
  96. assert content is None
  97. assert mock_get.call_count == 5 # Should attempt 5 times
  98. def test_user_agent_header():
  99. """Test that custom user agent is set"""
  100. downloader = LlmsTxtDownloader("https://example.com/llms.txt")
  101. mock_response = Mock()
  102. mock_response.text = "# Header\n\nContent " * 50
  103. mock_response.raise_for_status = Mock()
  104. with patch('requests.get', return_value=mock_response) as mock_get:
  105. content = downloader.download()
  106. assert content is not None
  107. # Verify custom user agent was passed
  108. call_kwargs = mock_get.call_args[1]
  109. assert call_kwargs['headers']['User-Agent'] == 'Skill-Seekers-llms.txt-Reader/1.0'
  110. def test_get_proper_filename():
  111. """Test filename conversion from .txt to .md"""
  112. downloader = LlmsTxtDownloader("https://hono.dev/llms-full.txt")
  113. filename = downloader.get_proper_filename()
  114. assert filename == "llms-full.md"
  115. assert not filename.endswith('.txt')
  116. def test_get_proper_filename_standard():
  117. """Test standard variant naming"""
  118. downloader = LlmsTxtDownloader("https://hono.dev/llms.txt")
  119. filename = downloader.get_proper_filename()
  120. assert filename == "llms.md"
  121. def test_get_proper_filename_small():
  122. """Test small variant naming"""
  123. downloader = LlmsTxtDownloader("https://hono.dev/llms-small.txt")
  124. filename = downloader.get_proper_filename()
  125. assert filename == "llms-small.md"