| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352 |
- #!/usr/bin/env python3
- """
- Tests for parallel scraping, unlimited mode, and rate limiting features (PR #144)
- """
- import sys
- import os
- import unittest
- import tempfile
- import json
- import time
- from pathlib import Path
- from unittest.mock import Mock, patch, MagicMock
- from collections import deque
- from skill_seekers.cli.doc_scraper import DocToSkillConverter
class TestParallelScrapingConfiguration(unittest.TestCase):
    """Test parallel scraping configuration and initialization.

    Each test builds a ``DocToSkillConverter`` with ``dry_run=True`` from
    inside a throwaway working directory and inspects the ``workers`` and
    ``lock`` attributes of the resulting object.
    """

    def setUp(self):
        """Save the original working directory and enter a fresh temp dir."""
        self.original_cwd = os.getcwd()
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run after tearDown(), so the directory is only removed
        # once the process has stepped back out of it.  Removing the current
        # working directory fails on some platforms (e.g. Windows).
        self.addCleanup(scratch.cleanup)
        os.chdir(scratch.name)

    def tearDown(self):
        """Restore original working directory"""
        os.chdir(self.original_cwd)

    def _make_converter(self, **overrides):
        """Return a dry-run converter built from the base config plus *overrides*."""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        config.update(overrides)
        return DocToSkillConverter(config, dry_run=True)

    def test_single_worker_default(self):
        """Test default is single-worker mode"""
        converter = self._make_converter(max_pages=10)
        self.assertEqual(converter.workers, 1)
        self.assertFalse(hasattr(converter, 'lock'))

    def test_multiple_workers_creates_lock(self):
        """Test multiple workers creates thread lock"""
        converter = self._make_converter(max_pages=10, workers=4)
        self.assertEqual(converter.workers, 4)
        self.assertTrue(hasattr(converter, 'lock'))

    def test_workers_from_config(self):
        """Test workers parameter is read from config"""
        converter = self._make_converter(workers=8)
        self.assertEqual(converter.workers, 8)
class TestUnlimitedMode(unittest.TestCase):
    """Test unlimited scraping mode.

    These tests only check the ``max_pages`` value that the converter exposes
    through ``converter.config``; no scraping is performed (``dry_run=True``).
    """

    def setUp(self):
        """Save the original working directory and enter a fresh temp dir."""
        self.original_cwd = os.getcwd()
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run after tearDown(), so the directory is only removed
        # once the process has stepped back out of it.  Removing the current
        # working directory fails on some platforms (e.g. Windows).
        self.addCleanup(scratch.cleanup)
        os.chdir(scratch.name)

    def tearDown(self):
        """Restore original working directory"""
        os.chdir(self.original_cwd)

    def _make_converter(self, **overrides):
        """Return a dry-run converter built from the base config plus *overrides*."""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        config.update(overrides)
        return DocToSkillConverter(config, dry_run=True)

    def test_unlimited_with_none(self):
        """Test max_pages: None enables unlimited mode"""
        converter = self._make_converter(max_pages=None)
        self.assertIsNone(converter.config.get('max_pages'))

    def test_unlimited_with_minus_one(self):
        """Test max_pages: -1 enables unlimited mode"""
        converter = self._make_converter(max_pages=-1)
        self.assertEqual(converter.config.get('max_pages'), -1)

    def test_limited_mode_default(self):
        """Test default max_pages is limited"""
        converter = self._make_converter()
        # NOTE(review): the 500 fallback is supplied by this test, not by the
        # converter, so the assertions below hold whenever 'max_pages' is
        # simply absent from converter.config.  Confirm where the real
        # default lives and assert against that instead.
        max_pages = converter.config.get('max_pages', 500)
        self.assertIsNotNone(max_pages)
        self.assertGreater(max_pages, 0)
class TestRateLimiting(unittest.TestCase):
    """Test rate limiting configuration.

    These tests only check the ``rate_limit`` value that the converter
    exposes through ``converter.config``; no requests are made
    (``dry_run=True``).
    """

    def setUp(self):
        """Save the original working directory and enter a fresh temp dir."""
        self.original_cwd = os.getcwd()
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run after tearDown(), so the directory is only removed
        # once the process has stepped back out of it.  Removing the current
        # working directory fails on some platforms (e.g. Windows).
        self.addCleanup(scratch.cleanup)
        os.chdir(scratch.name)

    def tearDown(self):
        """Restore original working directory"""
        os.chdir(self.original_cwd)

    def _make_converter(self, **overrides):
        """Return a dry-run converter built from the base config plus *overrides*."""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        config.update(overrides)
        return DocToSkillConverter(config, dry_run=True)

    def test_rate_limit_from_config(self):
        """Test rate_limit is read from config"""
        converter = self._make_converter(rate_limit=0.1)
        self.assertEqual(converter.config.get('rate_limit'), 0.1)

    def test_rate_limit_default(self):
        """Test default rate_limit is 0.5"""
        converter = self._make_converter()
        # NOTE(review): the 0.5 fallback is supplied by this test, not by the
        # converter, so this passes whenever 'rate_limit' is simply absent
        # from converter.config.  Confirm where the real default lives.
        self.assertEqual(converter.config.get('rate_limit', 0.5), 0.5)

    def test_zero_rate_limit_disables(self):
        """Test rate_limit: 0 disables rate limiting"""
        converter = self._make_converter(rate_limit=0)
        self.assertEqual(converter.config.get('rate_limit'), 0)
class TestThreadSafety(unittest.TestCase):
    """Test thread-safety fixes.

    Only the presence and type of the converter's ``lock`` attribute are
    checked; no concurrent access is exercised.
    """

    def setUp(self):
        """Save the original working directory and enter a fresh temp dir."""
        self.original_cwd = os.getcwd()
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run after tearDown(), so the directory is only removed
        # once the process has stepped back out of it.  Removing the current
        # working directory fails on some platforms (e.g. Windows).
        self.addCleanup(scratch.cleanup)
        os.chdir(scratch.name)

    def tearDown(self):
        """Restore original working directory"""
        os.chdir(self.original_cwd)

    def _make_converter(self, **overrides):
        """Return a dry-run converter built from the base config plus *overrides*."""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        config.update(overrides)
        return DocToSkillConverter(config, dry_run=True)

    def test_lock_protects_visited_urls(self):
        """Test a multi-worker converter exposes a ``threading.Lock`` as ``lock``.

        This verifies the lock object exists and has the expected type; it
        does not check that ``visited_urls`` accesses actually acquire it.
        """
        import threading

        converter = self._make_converter(workers=4)
        # Verify lock exists
        self.assertTrue(hasattr(converter, 'lock'))
        # threading.Lock is a factory function, so compare against the type
        # of a freshly created lock.
        self.assertIsInstance(converter.lock, type(threading.Lock()))

    def test_single_worker_no_lock(self):
        """Test single worker doesn't create unnecessary lock"""
        converter = self._make_converter(workers=1)
        self.assertFalse(hasattr(converter, 'lock'))
class TestScrapingModes(unittest.TestCase):
    """Test different scraping mode combinations.

    Each test combines ``workers``, ``max_pages`` and/or ``rate_limit`` in
    the config and checks the values the dry-run converter reports back.
    """

    def setUp(self):
        """Save the original working directory and enter a fresh temp dir."""
        self.original_cwd = os.getcwd()
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run after tearDown(), so the directory is only removed
        # once the process has stepped back out of it.  Removing the current
        # working directory fails on some platforms (e.g. Windows).
        self.addCleanup(scratch.cleanup)
        os.chdir(scratch.name)

    def tearDown(self):
        """Restore original working directory"""
        os.chdir(self.original_cwd)

    def _make_converter(self, **overrides):
        """Return a dry-run converter built from the base config plus *overrides*."""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        config.update(overrides)
        return DocToSkillConverter(config, dry_run=True)

    def test_single_threaded_limited(self):
        """Test traditional single-threaded limited mode"""
        converter = self._make_converter(max_pages=10, workers=1)
        self.assertEqual(converter.workers, 1)
        self.assertEqual(converter.config.get('max_pages'), 10)

    def test_parallel_limited(self):
        """Test parallel scraping with page limit"""
        converter = self._make_converter(max_pages=100, workers=4)
        self.assertEqual(converter.workers, 4)
        self.assertEqual(converter.config.get('max_pages'), 100)
        self.assertTrue(hasattr(converter, 'lock'))

    def test_parallel_unlimited(self):
        """Test parallel scraping with unlimited pages"""
        converter = self._make_converter(max_pages=None, workers=8)
        self.assertEqual(converter.workers, 8)
        self.assertIsNone(converter.config.get('max_pages'))
        self.assertTrue(hasattr(converter, 'lock'))

    def test_fast_scraping_mode(self):
        """Test fast scraping with low rate limit and workers"""
        converter = self._make_converter(
            rate_limit=0.1,
            workers=8,
            max_pages=1000,
        )
        self.assertEqual(converter.workers, 8)
        self.assertEqual(converter.config.get('rate_limit'), 0.1)
class TestDryRunWithNewFeatures(unittest.TestCase):
    """Test dry-run mode works with new features.

    Each test constructs the converter with ``dry_run=True`` together with a
    parallel or unlimited config and checks that both settings are reported
    by the resulting object.
    """

    def setUp(self):
        """Save the original working directory and enter a fresh temp dir."""
        self.original_cwd = os.getcwd()
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run after tearDown(), so the directory is only removed
        # once the process has stepped back out of it.  Removing the current
        # working directory fails on some platforms (e.g. Windows).
        self.addCleanup(scratch.cleanup)
        os.chdir(scratch.name)

    def tearDown(self):
        """Restore original working directory"""
        os.chdir(self.original_cwd)

    def _make_converter(self, **overrides):
        """Return a dry-run converter built from the base config plus *overrides*."""
        config = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        config.update(overrides)
        return DocToSkillConverter(config, dry_run=True)

    def test_dry_run_with_parallel(self):
        """Test dry-run with parallel workers"""
        converter = self._make_converter(workers=4)
        self.assertTrue(converter.dry_run)
        self.assertEqual(converter.workers, 4)

    def test_dry_run_with_unlimited(self):
        """Test dry-run with unlimited mode"""
        converter = self._make_converter(max_pages=None)
        self.assertTrue(converter.dry_run)
        self.assertIsNone(converter.config.get('max_pages'))
# Script entry point: run every test case in this module through the
# unittest command-line runner when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
|