#!/usr/bin/env python3
"""
Tests for PDF Scraper (cli/pdf_scraper.py)

Tests cover:
- Config-based PDF extraction
- Direct PDF path conversion
- JSON-based workflow
- Skill structure generation
- Categorization
- Error handling
"""
  12. import unittest
  13. import sys
  14. import json
  15. import tempfile
  16. import shutil
  17. from pathlib import Path
  18. from unittest.mock import Mock, patch, MagicMock
  19. try:
  20. import fitz # PyMuPDF
  21. PYMUPDF_AVAILABLE = True
  22. except ImportError:
  23. PYMUPDF_AVAILABLE = False
  24. class TestPDFToSkillConverter(unittest.TestCase):
  25. """Test PDFToSkillConverter initialization and basic functionality"""
  26. def setUp(self):
  27. if not PYMUPDF_AVAILABLE:
  28. self.skipTest("PyMuPDF not installed")
  29. from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
  30. self.PDFToSkillConverter = PDFToSkillConverter
  31. # Create temporary directory for test output
  32. self.temp_dir = tempfile.mkdtemp()
  33. self.output_dir = Path(self.temp_dir)
  34. def tearDown(self):
  35. # Clean up temporary directory
  36. if hasattr(self, 'temp_dir'):
  37. shutil.rmtree(self.temp_dir, ignore_errors=True)
  38. def test_init_with_name_and_pdf_path(self):
  39. """Test initialization with name and PDF path"""
  40. config = {
  41. "name": "test_skill",
  42. "pdf_path": "test.pdf"
  43. }
  44. converter = self.PDFToSkillConverter(config)
  45. self.assertEqual(converter.name, "test_skill")
  46. self.assertEqual(converter.pdf_path, "test.pdf")
  47. def test_init_with_config(self):
  48. """Test initialization with config file"""
  49. # Create test config
  50. config = {
  51. "name": "config_skill",
  52. "description": "Test skill",
  53. "pdf_path": "docs/test.pdf",
  54. "extract_options": {
  55. "chunk_size": 10,
  56. "min_quality": 5.0
  57. }
  58. }
  59. converter = self.PDFToSkillConverter(config)
  60. self.assertEqual(converter.name, "config_skill")
  61. self.assertEqual(converter.config.get("description"), "Test skill")
  62. def test_init_requires_name_or_config(self):
  63. """Test that initialization requires config dict with 'name' field"""
  64. with self.assertRaises((ValueError, TypeError, KeyError)):
  65. self.PDFToSkillConverter({})
  66. class TestCategorization(unittest.TestCase):
  67. """Test content categorization functionality"""
  68. def setUp(self):
  69. if not PYMUPDF_AVAILABLE:
  70. self.skipTest("PyMuPDF not installed")
  71. from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
  72. self.PDFToSkillConverter = PDFToSkillConverter
  73. self.temp_dir = tempfile.mkdtemp()
  74. def tearDown(self):
  75. shutil.rmtree(self.temp_dir, ignore_errors=True)
  76. def test_categorize_by_keywords(self):
  77. """Test categorization using keyword matching"""
  78. config = {
  79. "name": "test",
  80. "pdf_path": "test.pdf",
  81. "categories": {
  82. "getting_started": ["introduction", "getting started"],
  83. "api": ["api", "reference", "function"]
  84. }
  85. }
  86. converter = self.PDFToSkillConverter(config)
  87. # Mock extracted data with different content
  88. converter.extracted_data = {
  89. "pages": [
  90. {
  91. "page_number": 1,
  92. "text": "Introduction to the API",
  93. "chapter": "Chapter 1: Getting Started"
  94. },
  95. {
  96. "page_number": 2,
  97. "text": "API reference for functions",
  98. "chapter": None
  99. }
  100. ]
  101. }
  102. categories = converter.categorize_content()
  103. # Should have both categories
  104. self.assertIn("getting_started", categories)
  105. self.assertIn("api", categories)
  106. def test_categorize_by_chapters(self):
  107. """Test categorization using chapter information"""
  108. config = {
  109. "name": "test",
  110. "pdf_path": "test.pdf"
  111. }
  112. converter = self.PDFToSkillConverter(config)
  113. # Mock data with chapters
  114. converter.extracted_data = {
  115. "pages": [
  116. {
  117. "page_number": 1,
  118. "text": "Content here",
  119. "chapter": "Chapter 1: Introduction"
  120. },
  121. {
  122. "page_number": 2,
  123. "text": "More content",
  124. "chapter": "Chapter 1: Introduction"
  125. },
  126. {
  127. "page_number": 3,
  128. "text": "New chapter",
  129. "chapter": "Chapter 2: Advanced Topics"
  130. }
  131. ]
  132. }
  133. categories = converter.categorize_content()
  134. # Should create categories based on chapters
  135. self.assertIsInstance(categories, dict)
  136. self.assertGreater(len(categories), 0)
  137. def test_categorize_handles_no_chapters(self):
  138. """Test categorization when no chapters are detected"""
  139. config = {
  140. "name": "test",
  141. "pdf_path": "test.pdf"
  142. }
  143. converter = self.PDFToSkillConverter(config)
  144. # Mock data without chapters
  145. converter.extracted_data = {
  146. "pages": [
  147. {
  148. "page_number": 1,
  149. "text": "Some content",
  150. "chapter": None
  151. }
  152. ]
  153. }
  154. categories = converter.categorize_content()
  155. # Should still create categories (fallback to "other")
  156. self.assertIsInstance(categories, dict)
  157. class TestSkillBuilding(unittest.TestCase):
  158. """Test skill structure generation"""
  159. def setUp(self):
  160. if not PYMUPDF_AVAILABLE:
  161. self.skipTest("PyMuPDF not installed")
  162. from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
  163. self.PDFToSkillConverter = PDFToSkillConverter
  164. self.temp_dir = tempfile.mkdtemp()
  165. def tearDown(self):
  166. shutil.rmtree(self.temp_dir, ignore_errors=True)
  167. def test_build_skill_creates_structure(self):
  168. """Test that build_skill creates required directory structure"""
  169. config = {
  170. "name": "test_skill",
  171. "pdf_path": "test.pdf"
  172. }
  173. converter = self.PDFToSkillConverter(config)
  174. # Override skill_dir to use temp directory
  175. converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
  176. # Mock extracted data
  177. converter.extracted_data = {
  178. "pages": [
  179. {
  180. "page_number": 1,
  181. "text": "Test content",
  182. "code_blocks": [],
  183. "images": []
  184. }
  185. ],
  186. "total_pages": 1
  187. }
  188. # Mock categorization
  189. converter.categories = {
  190. "getting_started": [converter.extracted_data["pages"][0]]
  191. }
  192. converter.build_skill()
  193. # Check directory structure
  194. skill_dir = Path(self.temp_dir) / "test_skill"
  195. self.assertTrue(skill_dir.exists())
  196. self.assertTrue((skill_dir / "references").exists())
  197. self.assertTrue((skill_dir / "scripts").exists())
  198. self.assertTrue((skill_dir / "assets").exists())
  199. def test_build_skill_creates_skill_md(self):
  200. """Test that SKILL.md is created"""
  201. config = {
  202. "name": "test_skill",
  203. "pdf_path": "test.pdf",
  204. "description": "Test description"
  205. }
  206. converter = self.PDFToSkillConverter(config)
  207. # Override skill_dir to use temp directory
  208. converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
  209. converter.extracted_data = {
  210. "pages": [{"page_number": 1, "text": "Test", "code_blocks": [], "images": []}],
  211. "total_pages": 1
  212. }
  213. converter.categories = {"test": [converter.extracted_data["pages"][0]]}
  214. converter.build_skill()
  215. skill_md = Path(self.temp_dir) / "test_skill" / "SKILL.md"
  216. self.assertTrue(skill_md.exists())
  217. # Check content
  218. content = skill_md.read_text()
  219. self.assertIn("test_skill", content)
  220. self.assertIn("Test description", content)
  221. def test_build_skill_creates_reference_files(self):
  222. """Test that reference files are created for categories"""
  223. config = {
  224. "name": "test_skill",
  225. "pdf_path": "test.pdf"
  226. }
  227. converter = self.PDFToSkillConverter(config)
  228. # Override skill_dir to use temp directory
  229. converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
  230. converter.extracted_data = {
  231. "pages": [
  232. {"page_number": 1, "text": "Getting started", "code_blocks": [], "images": []},
  233. {"page_number": 2, "text": "API reference", "code_blocks": [], "images": []}
  234. ],
  235. "total_pages": 2
  236. }
  237. converter.categories = {
  238. "getting_started": [converter.extracted_data["pages"][0]],
  239. "api": [converter.extracted_data["pages"][1]]
  240. }
  241. converter.build_skill()
  242. # Check reference files exist
  243. refs_dir = Path(self.temp_dir) / "test_skill" / "references"
  244. self.assertTrue((refs_dir / "getting_started.md").exists())
  245. self.assertTrue((refs_dir / "api.md").exists())
  246. self.assertTrue((refs_dir / "index.md").exists())
  247. class TestCodeBlockHandling(unittest.TestCase):
  248. """Test code block extraction and inclusion in references"""
  249. def setUp(self):
  250. if not PYMUPDF_AVAILABLE:
  251. self.skipTest("PyMuPDF not installed")
  252. from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
  253. self.PDFToSkillConverter = PDFToSkillConverter
  254. self.temp_dir = tempfile.mkdtemp()
  255. def tearDown(self):
  256. shutil.rmtree(self.temp_dir, ignore_errors=True)
  257. def test_code_blocks_included_in_references(self):
  258. """Test that code blocks are included in reference files"""
  259. config = {
  260. "name": "test_skill",
  261. "pdf_path": "test.pdf"
  262. }
  263. converter = self.PDFToSkillConverter(config)
  264. # Override skill_dir to use temp directory
  265. converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
  266. # Mock data with code blocks
  267. converter.extracted_data = {
  268. "pages": [
  269. {
  270. "page_number": 1,
  271. "text": "Example code",
  272. "code_blocks": [
  273. {
  274. "code": "def hello():\n print('world')",
  275. "language": "python",
  276. "quality": 8.0
  277. }
  278. ],
  279. "images": []
  280. }
  281. ],
  282. "total_pages": 1
  283. }
  284. converter.categories = {
  285. "examples": [converter.extracted_data["pages"][0]]
  286. }
  287. converter.build_skill()
  288. # Check code block in reference file
  289. ref_file = Path(self.temp_dir) / "test_skill" / "references" / "examples.md"
  290. content = ref_file.read_text()
  291. self.assertIn("```python", content)
  292. self.assertIn("def hello()", content)
  293. self.assertIn("print('world')", content)
  294. def test_high_quality_code_preferred(self):
  295. """Test that high-quality code blocks are prioritized"""
  296. config = {
  297. "name": "test_skill",
  298. "pdf_path": "test.pdf"
  299. }
  300. converter = self.PDFToSkillConverter(config)
  301. # Override skill_dir to use temp directory
  302. converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
  303. # Mock data with varying quality
  304. converter.extracted_data = {
  305. "pages": [
  306. {
  307. "page_number": 1,
  308. "text": "Code examples",
  309. "code_blocks": [
  310. {"code": "x = 1", "language": "python", "quality": 2.0},
  311. {"code": "def process():\n return result", "language": "python", "quality": 9.0}
  312. ],
  313. "images": []
  314. }
  315. ],
  316. "total_pages": 1
  317. }
  318. converter.categories = {"examples": [converter.extracted_data["pages"][0]]}
  319. converter.build_skill()
  320. ref_file = Path(self.temp_dir) / "test_skill" / "references" / "examples.md"
  321. content = ref_file.read_text()
  322. # High quality code should be included
  323. self.assertIn("def process()", content)
  324. class TestImageHandling(unittest.TestCase):
  325. """Test image extraction and handling"""
  326. def setUp(self):
  327. if not PYMUPDF_AVAILABLE:
  328. self.skipTest("PyMuPDF not installed")
  329. from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
  330. self.PDFToSkillConverter = PDFToSkillConverter
  331. self.temp_dir = tempfile.mkdtemp()
  332. def tearDown(self):
  333. shutil.rmtree(self.temp_dir, ignore_errors=True)
  334. def test_images_saved_to_assets(self):
  335. """Test that images are saved to assets directory"""
  336. config = {
  337. "name": "test_skill",
  338. "pdf_path": "test.pdf"
  339. }
  340. converter = self.PDFToSkillConverter(config)
  341. # Override skill_dir to use temp directory
  342. converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
  343. # Mock image data (1x1 white PNG)
  344. mock_image_bytes = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82'
  345. converter.extracted_data = {
  346. "pages": [
  347. {
  348. "page_number": 1,
  349. "text": "See diagram",
  350. "code_blocks": [],
  351. "images": [
  352. {
  353. "page": 1,
  354. "index": 0,
  355. "width": 100,
  356. "height": 100,
  357. "data": mock_image_bytes
  358. }
  359. ]
  360. }
  361. ],
  362. "total_pages": 1
  363. }
  364. converter.categories = {"diagrams": [converter.extracted_data["pages"][0]]}
  365. converter.build_skill()
  366. # Check assets directory has image
  367. assets_dir = Path(self.temp_dir) / "test_skill" / "assets"
  368. image_files = list(assets_dir.glob("*.png"))
  369. self.assertGreater(len(image_files), 0)
  370. def test_image_references_in_markdown(self):
  371. """Test that images are referenced in markdown files"""
  372. config = {
  373. "name": "test_skill",
  374. "pdf_path": "test.pdf"
  375. }
  376. converter = self.PDFToSkillConverter(config)
  377. # Override skill_dir to use temp directory
  378. converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
  379. mock_image_bytes = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82'
  380. converter.extracted_data = {
  381. "pages": [
  382. {
  383. "page_number": 1,
  384. "text": "Architecture diagram",
  385. "code_blocks": [],
  386. "images": [
  387. {
  388. "page": 1,
  389. "index": 0,
  390. "width": 200,
  391. "height": 150,
  392. "data": mock_image_bytes
  393. }
  394. ]
  395. }
  396. ],
  397. "total_pages": 1
  398. }
  399. converter.categories = {"architecture": [converter.extracted_data["pages"][0]]}
  400. converter.build_skill()
  401. # Check markdown has image reference
  402. ref_file = Path(self.temp_dir) / "test_skill" / "references" / "architecture.md"
  403. content = ref_file.read_text()
  404. self.assertIn("![", content) # Markdown image syntax
  405. self.assertIn("../assets/", content) # Relative path to assets
  406. class TestErrorHandling(unittest.TestCase):
  407. """Test error handling for invalid inputs"""
  408. def setUp(self):
  409. if not PYMUPDF_AVAILABLE:
  410. self.skipTest("PyMuPDF not installed")
  411. from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
  412. self.PDFToSkillConverter = PDFToSkillConverter
  413. self.temp_dir = tempfile.mkdtemp()
  414. def tearDown(self):
  415. shutil.rmtree(self.temp_dir, ignore_errors=True)
  416. def test_missing_pdf_file(self):
  417. """Test error when PDF file doesn't exist"""
  418. config = {
  419. "name": "test",
  420. "pdf_path": "nonexistent.pdf"
  421. }
  422. converter = self.PDFToSkillConverter(config)
  423. with self.assertRaises((FileNotFoundError, RuntimeError)):
  424. converter.extract_pdf()
  425. def test_invalid_config_file(self):
  426. """Test error when config dict is invalid"""
  427. invalid_config = "invalid string not a dict"
  428. with self.assertRaises((ValueError, TypeError, AttributeError)):
  429. self.PDFToSkillConverter(invalid_config)
  430. def test_missing_required_config_fields(self):
  431. """Test error when config is missing required fields"""
  432. config = {"description": "Missing name and pdf_path"}
  433. with self.assertRaises((ValueError, KeyError)):
  434. converter = self.PDFToSkillConverter(config)
  435. converter.extract_pdf()
  436. class TestJSONWorkflow(unittest.TestCase):
  437. """Test building skills from extracted JSON"""
  438. def setUp(self):
  439. if not PYMUPDF_AVAILABLE:
  440. self.skipTest("PyMuPDF not installed")
  441. from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
  442. self.PDFToSkillConverter = PDFToSkillConverter
  443. self.temp_dir = tempfile.mkdtemp()
  444. def tearDown(self):
  445. shutil.rmtree(self.temp_dir, ignore_errors=True)
  446. def test_load_from_json(self):
  447. """Test loading extracted data from JSON file"""
  448. # Create mock extracted JSON
  449. extracted_data = {
  450. "pages": [
  451. {
  452. "page_number": 1,
  453. "text": "Test content",
  454. "code_blocks": [],
  455. "images": []
  456. }
  457. ],
  458. "total_pages": 1,
  459. "metadata": {
  460. "title": "Test PDF"
  461. }
  462. }
  463. json_path = Path(self.temp_dir) / "extracted.json"
  464. json_path.write_text(json.dumps(extracted_data, indent=2))
  465. config = {
  466. "name": "test_skill",
  467. "pdf_path": "test.pdf"
  468. }
  469. converter = self.PDFToSkillConverter(config)
  470. converter.load_extracted_data(str(json_path))
  471. self.assertEqual(converter.extracted_data["total_pages"], 1)
  472. self.assertEqual(len(converter.extracted_data["pages"]), 1)
  473. def test_build_from_json_without_extraction(self):
  474. """Test that from_json workflow skips PDF extraction"""
  475. extracted_data = {
  476. "pages": [{"page_number": 1, "text": "Content", "code_blocks": [], "images": []}],
  477. "total_pages": 1
  478. }
  479. json_path = Path(self.temp_dir) / "extracted.json"
  480. json_path.write_text(json.dumps(extracted_data))
  481. config = {
  482. "name": "test_skill",
  483. "pdf_path": "test.pdf"
  484. }
  485. converter = self.PDFToSkillConverter(config)
  486. converter.load_extracted_data(str(json_path))
  487. # Should have data loaded without calling extract_pdf()
  488. self.assertIsNotNone(converter.extracted_data)
  489. self.assertEqual(converter.extracted_data["total_pages"], 1)
  490. if __name__ == '__main__':
  491. unittest.main()