test_excluded_dirs_config.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
"""Tests for configurable directory exclusions in the GitHub scraper.

Covers Issue #203: make EXCLUDED_DIRS configurable.
"""
  4. import unittest
  5. from unittest.mock import patch, Mock
  6. from skill_seekers.cli.github_scraper import GitHubScraper, EXCLUDED_DIRS
  7. class TestExcludedDirsDefaults(unittest.TestCase):
  8. """Test default EXCLUDED_DIRS behavior (backward compatibility)."""
  9. @patch('skill_seekers.cli.github_scraper.Github')
  10. def test_defaults_when_no_config(self, mock_github):
  11. """Test that default exclusions are used when no config provided."""
  12. config = {
  13. 'repo': 'owner/repo'
  14. }
  15. scraper = GitHubScraper(config)
  16. # Should use default EXCLUDED_DIRS
  17. self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS)
  18. @patch('skill_seekers.cli.github_scraper.Github')
  19. def test_defaults_exclude_common_dirs(self, mock_github):
  20. """Test that default exclusions work correctly."""
  21. config = {
  22. 'repo': 'owner/repo'
  23. }
  24. scraper = GitHubScraper(config)
  25. # Test common directories are excluded
  26. self.assertTrue(scraper.should_exclude_dir('venv'))
  27. self.assertTrue(scraper.should_exclude_dir('node_modules'))
  28. self.assertTrue(scraper.should_exclude_dir('__pycache__'))
  29. self.assertTrue(scraper.should_exclude_dir('.git'))
  30. self.assertTrue(scraper.should_exclude_dir('build'))
  31. # Test normal directories are not excluded
  32. self.assertFalse(scraper.should_exclude_dir('src'))
  33. self.assertFalse(scraper.should_exclude_dir('tests'))
  34. self.assertFalse(scraper.should_exclude_dir('docs'))
  35. @patch('skill_seekers.cli.github_scraper.Github')
  36. def test_dot_directories_always_excluded(self, mock_github):
  37. """Test that directories starting with '.' are always excluded."""
  38. config = {
  39. 'repo': 'owner/repo'
  40. }
  41. scraper = GitHubScraper(config)
  42. # Dot directories should be excluded (even if not in EXCLUDED_DIRS)
  43. self.assertTrue(scraper.should_exclude_dir('.hidden'))
  44. self.assertTrue(scraper.should_exclude_dir('.cache'))
  45. self.assertTrue(scraper.should_exclude_dir('.vscode'))
  46. class TestExcludedDirsAdditional(unittest.TestCase):
  47. """Test exclude_dirs_additional (extend mode)."""
  48. @patch('skill_seekers.cli.github_scraper.Github')
  49. def test_extend_with_additional_dirs(self, mock_github):
  50. """Test adding custom exclusions to defaults."""
  51. config = {
  52. 'repo': 'owner/repo',
  53. 'exclude_dirs_additional': ['proprietary', 'vendor', 'third_party']
  54. }
  55. scraper = GitHubScraper(config)
  56. # Should include both defaults and additional
  57. self.assertIn('venv', scraper.excluded_dirs) # Default
  58. self.assertIn('node_modules', scraper.excluded_dirs) # Default
  59. self.assertIn('proprietary', scraper.excluded_dirs) # Additional
  60. self.assertIn('vendor', scraper.excluded_dirs) # Additional
  61. self.assertIn('third_party', scraper.excluded_dirs) # Additional
  62. # Verify total count
  63. self.assertEqual(
  64. len(scraper.excluded_dirs),
  65. len(EXCLUDED_DIRS) + 3
  66. )
  67. @patch('skill_seekers.cli.github_scraper.Github')
  68. def test_extend_excludes_additional_dirs(self, mock_github):
  69. """Test that additional directories are actually excluded."""
  70. config = {
  71. 'repo': 'owner/repo',
  72. 'exclude_dirs_additional': ['legacy', 'deprecated']
  73. }
  74. scraper = GitHubScraper(config)
  75. # Additional dirs should be excluded
  76. self.assertTrue(scraper.should_exclude_dir('legacy'))
  77. self.assertTrue(scraper.should_exclude_dir('deprecated'))
  78. # Default dirs still excluded
  79. self.assertTrue(scraper.should_exclude_dir('venv'))
  80. self.assertTrue(scraper.should_exclude_dir('node_modules'))
  81. # Normal dirs not excluded
  82. self.assertFalse(scraper.should_exclude_dir('src'))
  83. @patch('skill_seekers.cli.github_scraper.Github')
  84. def test_extend_with_empty_list(self, mock_github):
  85. """Test that empty additional list works correctly."""
  86. config = {
  87. 'repo': 'owner/repo',
  88. 'exclude_dirs_additional': []
  89. }
  90. scraper = GitHubScraper(config)
  91. # Should just have defaults
  92. self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS)
  93. class TestExcludedDirsReplace(unittest.TestCase):
  94. """Test exclude_dirs (replace mode)."""
  95. @patch('skill_seekers.cli.github_scraper.Github')
  96. def test_replace_with_custom_list(self, mock_github):
  97. """Test replacing default exclusions entirely."""
  98. config = {
  99. 'repo': 'owner/repo',
  100. 'exclude_dirs': ['node_modules', 'custom_vendor']
  101. }
  102. scraper = GitHubScraper(config)
  103. # Should ONLY have specified dirs
  104. self.assertEqual(scraper.excluded_dirs, {'node_modules', 'custom_vendor'})
  105. self.assertEqual(len(scraper.excluded_dirs), 2)
  106. @patch('skill_seekers.cli.github_scraper.Github')
  107. def test_replace_excludes_only_specified_dirs(self, mock_github):
  108. """Test that only specified directories are excluded in replace mode."""
  109. config = {
  110. 'repo': 'owner/repo',
  111. 'exclude_dirs': ['node_modules', '.git']
  112. }
  113. scraper = GitHubScraper(config)
  114. # Specified dirs should be excluded
  115. self.assertTrue(scraper.should_exclude_dir('node_modules'))
  116. # Note: .git would be excluded anyway due to dot prefix
  117. self.assertTrue(scraper.should_exclude_dir('.git'))
  118. # Default dirs NOT in our list should NOT be excluded
  119. self.assertFalse(scraper.should_exclude_dir('venv'))
  120. self.assertFalse(scraper.should_exclude_dir('__pycache__'))
  121. self.assertFalse(scraper.should_exclude_dir('build'))
  122. # Normal dirs still not excluded
  123. self.assertFalse(scraper.should_exclude_dir('src'))
  124. @patch('skill_seekers.cli.github_scraper.Github')
  125. def test_replace_with_empty_list(self, mock_github):
  126. """Test that empty replace list allows all directories (except dot-prefixed)."""
  127. config = {
  128. 'repo': 'owner/repo',
  129. 'exclude_dirs': []
  130. }
  131. scraper = GitHubScraper(config)
  132. # No explicit exclusions
  133. self.assertEqual(scraper.excluded_dirs, set())
  134. # Nothing explicitly excluded
  135. self.assertFalse(scraper.should_exclude_dir('venv'))
  136. self.assertFalse(scraper.should_exclude_dir('node_modules'))
  137. self.assertFalse(scraper.should_exclude_dir('build'))
  138. # But dot dirs still excluded (different logic)
  139. self.assertTrue(scraper.should_exclude_dir('.git'))
  140. self.assertTrue(scraper.should_exclude_dir('.hidden'))
  141. class TestExcludedDirsPrecedence(unittest.TestCase):
  142. """Test precedence when both options provided."""
  143. @patch('skill_seekers.cli.github_scraper.Github')
  144. def test_replace_takes_precedence_over_additional(self, mock_github):
  145. """Test that exclude_dirs takes precedence over exclude_dirs_additional."""
  146. config = {
  147. 'repo': 'owner/repo',
  148. 'exclude_dirs': ['only', 'these'], # Replace mode
  149. 'exclude_dirs_additional': ['ignored'] # Should be ignored
  150. }
  151. scraper = GitHubScraper(config)
  152. # Should use replace mode (exclude_dirs), ignore additional
  153. self.assertEqual(scraper.excluded_dirs, {'only', 'these'})
  154. self.assertNotIn('ignored', scraper.excluded_dirs)
  155. self.assertNotIn('venv', scraper.excluded_dirs) # Defaults also ignored
  156. class TestExcludedDirsEdgeCases(unittest.TestCase):
  157. """Test edge cases and error handling."""
  158. @patch('skill_seekers.cli.github_scraper.Github')
  159. def test_duplicate_exclusions_in_additional(self, mock_github):
  160. """Test that duplicates in additional list are handled (set deduplication)."""
  161. config = {
  162. 'repo': 'owner/repo',
  163. 'exclude_dirs_additional': ['venv', 'custom', 'venv'] # venv is duplicate (default + listed)
  164. }
  165. scraper = GitHubScraper(config)
  166. # Should deduplicate automatically (using set)
  167. self.assertIn('venv', scraper.excluded_dirs)
  168. self.assertIn('custom', scraper.excluded_dirs)
  169. # Count should account for deduplication
  170. self.assertEqual(
  171. len(scraper.excluded_dirs),
  172. len(EXCLUDED_DIRS) + 1 # Only 'custom' is truly additional
  173. )
  174. @patch('skill_seekers.cli.github_scraper.Github')
  175. def test_case_sensitive_exclusions(self, mock_github):
  176. """Test that exclusions are case-sensitive."""
  177. config = {
  178. 'repo': 'owner/repo',
  179. 'exclude_dirs': ['Venv', 'NODE_MODULES']
  180. }
  181. scraper = GitHubScraper(config)
  182. # Case-sensitive matching
  183. self.assertTrue(scraper.should_exclude_dir('Venv'))
  184. self.assertTrue(scraper.should_exclude_dir('NODE_MODULES'))
  185. self.assertFalse(scraper.should_exclude_dir('venv')) # Different case
  186. self.assertFalse(scraper.should_exclude_dir('node_modules')) # Different case
  187. class TestExcludedDirsWithLocalRepo(unittest.TestCase):
  188. """Test exclude_dirs integration with local_repo_path."""
  189. @patch('skill_seekers.cli.github_scraper.Github')
  190. def test_exclude_dirs_with_local_repo_path(self, mock_github):
  191. """Test that exclude_dirs works when local_repo_path is provided."""
  192. config = {
  193. 'repo': 'owner/repo',
  194. 'local_repo_path': '/tmp/test/repo',
  195. 'exclude_dirs_additional': ['proprietary', 'internal']
  196. }
  197. scraper = GitHubScraper(config)
  198. # Should have both defaults and additional
  199. self.assertIn('venv', scraper.excluded_dirs)
  200. self.assertIn('proprietary', scraper.excluded_dirs)
  201. self.assertIn('internal', scraper.excluded_dirs)
  202. # Test exclusion works
  203. self.assertTrue(scraper.should_exclude_dir('proprietary'))
  204. self.assertTrue(scraper.should_exclude_dir('internal'))
  205. self.assertTrue(scraper.should_exclude_dir('venv'))
  206. @patch('skill_seekers.cli.github_scraper.Github')
  207. def test_replace_mode_with_local_repo_path(self, mock_github):
  208. """Test that replace mode works with local_repo_path."""
  209. config = {
  210. 'repo': 'owner/repo',
  211. 'local_repo_path': '/tmp/test/repo',
  212. 'exclude_dirs': ['only_this']
  213. }
  214. scraper = GitHubScraper(config)
  215. # Should ONLY have specified dir
  216. self.assertEqual(scraper.excluded_dirs, {'only_this'})
  217. self.assertTrue(scraper.should_exclude_dir('only_this'))
  218. self.assertFalse(scraper.should_exclude_dir('venv'))
  219. class TestExcludedDirsLogging(unittest.TestCase):
  220. """Test logging output for exclude_dirs configuration."""
  221. @patch('skill_seekers.cli.github_scraper.Github')
  222. @patch('skill_seekers.cli.github_scraper.logger')
  223. def test_extend_mode_logs_info(self, mock_logger, mock_github):
  224. """Test that extend mode logs INFO level message."""
  225. config = {
  226. 'repo': 'owner/repo',
  227. 'exclude_dirs_additional': ['custom1', 'custom2']
  228. }
  229. scraper = GitHubScraper(config)
  230. # Should have logged INFO message
  231. # Check that info was called with a message about adding custom exclusions
  232. info_calls = [str(call) for call in mock_logger.info.call_args_list]
  233. self.assertTrue(any('Added 2 custom directory exclusions' in call for call in info_calls))
  234. @patch('skill_seekers.cli.github_scraper.Github')
  235. @patch('skill_seekers.cli.github_scraper.logger')
  236. def test_replace_mode_logs_warning(self, mock_logger, mock_github):
  237. """Test that replace mode logs WARNING level message."""
  238. config = {
  239. 'repo': 'owner/repo',
  240. 'exclude_dirs': ['only', 'these']
  241. }
  242. scraper = GitHubScraper(config)
  243. # Should have logged WARNING message
  244. warning_calls = [str(call) for call in mock_logger.warning.call_args_list]
  245. self.assertTrue(any('Using custom directory exclusions' in call and 'defaults overridden' in call for call in warning_calls))
  246. @patch('skill_seekers.cli.github_scraper.Github')
  247. @patch('skill_seekers.cli.github_scraper.logger')
  248. def test_no_config_no_logging(self, mock_logger, mock_github):
  249. """Test that default mode doesn't log exclude_dirs messages."""
  250. config = {
  251. 'repo': 'owner/repo'
  252. }
  253. scraper = GitHubScraper(config)
  254. # Should NOT have logged any exclude_dirs messages
  255. info_calls = [str(call) for call in mock_logger.info.call_args_list]
  256. warning_calls = [str(call) for call in mock_logger.warning.call_args_list]
  257. # Filter for exclude_dirs related messages
  258. exclude_info = [c for c in info_calls if 'directory exclusion' in c]
  259. exclude_warnings = [c for c in warning_calls if 'directory exclusion' in c]
  260. self.assertEqual(len(exclude_info), 0)
  261. self.assertEqual(len(exclude_warnings), 0)
  262. class TestExcludedDirsTypeHandling(unittest.TestCase):
  263. """Test type handling for exclude_dirs configuration."""
  264. @patch('skill_seekers.cli.github_scraper.Github')
  265. def test_exclude_dirs_with_tuple(self, mock_github):
  266. """Test that tuples are converted to sets correctly."""
  267. config = {
  268. 'repo': 'owner/repo',
  269. 'exclude_dirs': ('node_modules', 'build') # Tuple instead of list
  270. }
  271. scraper = GitHubScraper(config)
  272. # Should work with tuples (set() accepts tuples)
  273. self.assertEqual(scraper.excluded_dirs, {'node_modules', 'build'})
  274. @patch('skill_seekers.cli.github_scraper.Github')
  275. def test_exclude_dirs_additional_with_set(self, mock_github):
  276. """Test that sets work correctly for exclude_dirs_additional."""
  277. config = {
  278. 'repo': 'owner/repo',
  279. 'exclude_dirs_additional': {'custom1', 'custom2'} # Set instead of list
  280. }
  281. scraper = GitHubScraper(config)
  282. # Should work with sets
  283. self.assertIn('custom1', scraper.excluded_dirs)
  284. self.assertIn('custom2', scraper.excluded_dirs)
  285. self.assertIn('venv', scraper.excluded_dirs) # Defaults still there
  286. if __name__ == '__main__':
  287. unittest.main()