test_pdf_extractor.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. #!/usr/bin/env python3
  2. """
  3. Tests for PDF Extractor (cli/pdf_extractor_poc.py)
  4. Tests cover:
  5. - Language detection with confidence scoring
  6. - Code block detection (font, indent, pattern)
  7. - Syntax validation
  8. - Quality scoring
  9. - Chapter detection
  10. - Page chunking
  11. - Code block merging
  12. """
  13. import unittest
  14. import sys
  15. from pathlib import Path
  16. # Add parent directory to path for imports
  17. sys.path.insert(0, str(Path(__file__).parent.parent / "cli"))
  # Probe for the optional PyMuPDF dependency; the test classes consult
  # PYMUPDF_AVAILABLE in setUp and skip themselves when it is False.
  try:
      import fitz  # PyMuPDF
  except ImportError:
      PYMUPDF_AVAILABLE = False
  else:
      PYMUPDF_AVAILABLE = True
  class TestLanguageDetection(unittest.TestCase):
      """Test language detection with confidence scoring.

      Every test builds the extractor with ``__new__`` (bypassing
      ``__init__``) and checks the ``(language, confidence)`` pair returned
      by ``detect_language_from_code``.
      """

      def setUp(self):
          """Skip when PyMuPDF is missing; otherwise load the extractor class."""
          if not PYMUPDF_AVAILABLE:
              self.skipTest("PyMuPDF not installed")
          # Imported lazily so the module can still be collected without PyMuPDF.
          from pdf_extractor_poc import PDFExtractor
          self.PDFExtractor = PDFExtractor

      def test_detect_python_with_confidence(self):
          """Test Python detection returns language and confidence"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          code = "def hello():\n print('world')\n return True"

          language, confidence = extractor.detect_language_from_code(code)

          self.assertEqual(language, "python")
          self.assertGreater(confidence, 0.4)  # Should have reasonable confidence
          self.assertLessEqual(confidence, 1.0)

      def test_detect_javascript_with_confidence(self):
          """Test JavaScript detection"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          code = "const handleClick = () => {\n console.log('clicked');\n};"

          language, confidence = extractor.detect_language_from_code(code)

          self.assertEqual(language, "javascript")
          self.assertGreater(confidence, 0.5)

      def test_detect_cpp_with_confidence(self):
          """Test C++ detection"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          code = "#include <iostream>\nint main() {\n std::cout << \"Hello\";\n}"

          language, confidence = extractor.detect_language_from_code(code)

          self.assertEqual(language, "cpp")
          self.assertGreater(confidence, 0.5)

      def test_detect_unknown_low_confidence(self):
          """Test unknown language returns low confidence"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          code = "this is not code at all just plain text"

          language, confidence = extractor.detect_language_from_code(code)

          self.assertEqual(language, "unknown")
          self.assertLess(confidence, 0.3)  # Should be low confidence

      def test_confidence_range(self):
          """Test confidence is always between 0 and 1"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          test_codes = [
              "def foo(): pass",
              "const x = 10;",
              "#include <stdio.h>",
              "random text here",
              "",
          ]

          for code in test_codes:
              # subTest reports the offending input and keeps checking the
              # remaining samples instead of stopping at the first failure.
              with self.subTest(code=code):
                  _, confidence = extractor.detect_language_from_code(code)
                  self.assertGreaterEqual(confidence, 0.0)
                  self.assertLessEqual(confidence, 1.0)
  class TestSyntaxValidation(unittest.TestCase):
      """Test syntax validation for different languages.

      ``validate_code_syntax`` is expected to return an ``(is_valid, issues)``
      pair; the tests below check both members.
      """

      def setUp(self):
          """Skip when PyMuPDF is missing; otherwise load the extractor class."""
          if not PYMUPDF_AVAILABLE:
              self.skipTest("PyMuPDF not installed")
          from pdf_extractor_poc import PDFExtractor
          self.PDFExtractor = PDFExtractor

      def _check(self, snippet, language):
          """Run ``validate_code_syntax`` on a fresh ``__new__``-built extractor."""
          checker = self.PDFExtractor.__new__(self.PDFExtractor)
          return checker.validate_code_syntax(snippet, language)

      def test_validate_python_valid(self):
          """Test valid Python syntax"""
          ok, problems = self._check(
              "def hello():\n print('world')\n return True", "python"
          )
          self.assertTrue(ok)
          self.assertEqual(len(problems), 0)

      def test_validate_python_invalid_indentation(self):
          """Test invalid Python indentation"""
          # Mixed tabs and spaces
          snippet = "def hello():\n print('world')\n\tprint('mixed')"
          ok, problems = self._check(snippet, "python")
          self.assertFalse(ok)
          self.assertGreater(len(problems), 0)

      def test_validate_python_unbalanced_brackets(self):
          """Test unbalanced brackets"""
          # Severely unbalanced brackets
          ok, problems = self._check("x = [[[1, 2, 3", "python")
          self.assertFalse(ok)
          self.assertGreater(len(problems), 0)

      def test_validate_javascript_valid(self):
          """Test valid JavaScript syntax"""
          ok, problems = self._check("const x = () => { return 42; };", "javascript")
          self.assertTrue(ok)
          self.assertEqual(len(problems), 0)

      def test_validate_natural_language_fails(self):
          """Test natural language fails validation"""
          sentence = "This is just a regular sentence with the and for and with and that and have and from words."
          ok, problems = self._check(sentence, "python")
          self.assertFalse(ok)
          combined = ' '.join(problems)
          self.assertIn('May be natural language', combined)
  class TestQualityScoring(unittest.TestCase):
      """Test code quality scoring (0-10 scale).

      ``score_code_quality`` is called with the code text, a language name
      and a detection confidence, and its numeric result is checked.
      """

      def setUp(self):
          """Skip when PyMuPDF is missing; otherwise load the extractor class."""
          if not PYMUPDF_AVAILABLE:
              self.skipTest("PyMuPDF not installed")
          from pdf_extractor_poc import PDFExtractor
          self.PDFExtractor = PDFExtractor

      def test_quality_score_range(self):
          """Test quality score is between 0 and 10"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          code = "def hello():\n print('world')"

          quality = extractor.score_code_quality(code, "python", 0.8)

          self.assertGreaterEqual(quality, 0.0)
          self.assertLessEqual(quality, 10.0)

      def test_high_quality_code(self):
          """Test high-quality code gets good score"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          # The sample is spelled out line by line with explicit indentation so
          # that it is valid, well-structured Python regardless of how this
          # test file itself is indented.
          code = (
              "def calculate_sum(numbers):\n"
              "    '''Calculate sum of numbers'''\n"
              "    total = 0\n"
              "    for num in numbers:\n"
              "        total += num\n"
              "    return total"
          )

          quality = extractor.score_code_quality(code, "python", 0.9)

          self.assertGreater(quality, 6.0)  # Should be good quality

      def test_low_quality_code(self):
          """Test low-quality code gets low score"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          code = "x"  # Too short, no structure

          quality = extractor.score_code_quality(code, "unknown", 0.1)

          self.assertLess(quality, 6.0)  # Should be low quality

      def test_quality_factors(self):
          """Test that quality considers multiple factors"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)

          # Good: proper structure, indentation, confidence
          good_code = "def foo():\n return bar()"
          good_quality = extractor.score_code_quality(good_code, "python", 0.9)

          # Bad: no structure, low confidence
          bad_code = "some text"
          bad_quality = extractor.score_code_quality(bad_code, "unknown", 0.1)

          self.assertGreater(good_quality, bad_quality)
  class TestChapterDetection(unittest.TestCase):
      """Test chapter/section detection.

      Each test passes a page dict with ``'text'`` and ``'headings'`` keys to
      ``detect_chapter_start`` and checks the returned
      ``(is_chapter, title)`` pair.
      """

      def setUp(self):
          """Skip when PyMuPDF is missing; otherwise load the extractor class."""
          if not PYMUPDF_AVAILABLE:
              self.skipTest("PyMuPDF not installed")
          from pdf_extractor_poc import PDFExtractor
          self.PDFExtractor = PDFExtractor

      def test_detect_chapter_with_number(self):
          """Test chapter detection with number"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          page_data = {
              'text': 'Chapter 1: Introduction to Python\nThis is the first chapter.',
              'headings': [],
          }

          is_chapter, title = extractor.detect_chapter_start(page_data)

          self.assertTrue(is_chapter)
          self.assertIsNotNone(title)

      def test_detect_chapter_uppercase(self):
          """Test chapter detection for a bare 'Chapter N' heading line.

          Despite the method name, the fixture is title-case rather than
          upper-case: the pattern requires Chapter + digit.
          """
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          page_data = {
              'text': 'Chapter 1\nThis is the introduction',
              'headings': [],
          }

          is_chapter, _ = extractor.detect_chapter_start(page_data)

          self.assertTrue(is_chapter)

      def test_detect_section_heading(self):
          """Test section heading detection"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          page_data = {
              'text': '2. Getting Started\nThis is a section.',
              'headings': [],
          }

          is_chapter, _ = extractor.detect_chapter_start(page_data)

          self.assertTrue(is_chapter)

      def test_not_chapter(self):
          """Test normal text is not detected as chapter"""
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          page_data = {
              'text': 'This is just normal paragraph text without any chapter markers.',
              'headings': [],
          }

          is_chapter, _ = extractor.detect_chapter_start(page_data)

          self.assertFalse(is_chapter)
  class TestCodeBlockMerging(unittest.TestCase):
      """Test code block merging across pages.

      Pages are plain dicts carrying ``'page_number'``, ``'code_samples'`` and
      ``'code_blocks_count'``; each code sample has ``'code'``, ``'language'``
      and ``'detection_method'`` keys.
      """

      def setUp(self):
          """Skip when PyMuPDF is missing; otherwise load the extractor class."""
          if not PYMUPDF_AVAILABLE:
              self.skipTest("PyMuPDF not installed")
          from pdf_extractor_poc import PDFExtractor
          self.PDFExtractor = PDFExtractor

      def _make_extractor(self):
          """Build an extractor via ``__new__`` with ``verbose`` initialised.

          ``__new__`` bypasses ``__init__``, so the ``verbose`` attribute has to
          be set by hand. Doing it here keeps both tests consistent.
          """
          extractor = self.PDFExtractor.__new__(self.PDFExtractor)
          extractor.verbose = False  # Initialize verbose attribute
          return extractor

      def test_merge_continued_blocks(self):
          """Test merging code blocks split across pages"""
          extractor = self._make_extractor()
          pages = [
              {
                  'page_number': 1,
                  'code_samples': [
                      {'code': 'def hello():', 'language': 'python', 'detection_method': 'pattern'}
                  ],
                  'code_blocks_count': 1,
              },
              {
                  'page_number': 2,
                  'code_samples': [
                      {'code': ' print("world")', 'language': 'python', 'detection_method': 'pattern'}
                  ],
                  'code_blocks_count': 1,
              },
          ]

          merged = extractor.merge_continued_code_blocks(pages)

          # Should have merged the two blocks: the first page's sample must
          # now contain the text from both pages.
          first_sample = merged[0]['code_samples'][0]['code']
          self.assertIn('def hello():', first_sample)
          self.assertIn('print("world")', first_sample)

      def test_no_merge_different_languages(self):
          """Test blocks with different languages are not merged"""
          extractor = self._make_extractor()
          pages = [
              {
                  'page_number': 1,
                  'code_samples': [
                      {'code': 'def foo():', 'language': 'python', 'detection_method': 'pattern'}
                  ],
                  'code_blocks_count': 1,
              },
              {
                  'page_number': 2,
                  'code_samples': [
                      {'code': 'const x = 10;', 'language': 'javascript', 'detection_method': 'pattern'}
                  ],
                  'code_blocks_count': 1,
              },
          ]

          merged = extractor.merge_continued_code_blocks(pages)

          # Should NOT merge different languages: each page keeps its sample.
          self.assertEqual(len(merged[0]['code_samples']), 1)
          self.assertEqual(len(merged[1]['code_samples']), 1)
  class TestCodeDetectionMethods(unittest.TestCase):
      """Test different code detection methods.

      NOTE(review): these tests currently only sanity-check their own
      fixtures; they do not call any detection method on the extractor.
      They are placeholders until the detection API is exercised directly.
      """

      def setUp(self):
          """Skip when PyMuPDF is missing; otherwise load the extractor class."""
          if not PYMUPDF_AVAILABLE:
              self.skipTest("PyMuPDF not installed")
          from pdf_extractor_poc import PDFExtractor
          self.PDFExtractor = PDFExtractor

      def test_pattern_based_detection(self):
          """Test pattern-based code detection"""
          # Should detect function definitions
          text = "Here is an example:\ndef calculate(x, y):\n return x + y"

          # Pattern-based detection should find this
          # (implementation details depend on pdf_extractor_poc.py)
          self.assertIn("def ", text)
          self.assertIn("return", text)

      def test_indent_based_detection(self):
          """Test indent-based code detection"""
          # Code with consistent indentation. The leading four spaces are
          # written explicitly because the assertion below depends on them.
          indented_text = "    def foo():\n        return bar()"

          # Should detect as code due to indentation
          self.assertTrue(indented_text.startswith(" " * 4))
  class TestQualityFiltering(unittest.TestCase):
      """Test quality-based filtering"""

      def setUp(self):
          """Skip when PyMuPDF is missing; otherwise load the extractor class."""
          if not PYMUPDF_AVAILABLE:
              self.skipTest("PyMuPDF not installed")
          from pdf_extractor_poc import PDFExtractor
          self.PDFExtractor = PDFExtractor

      def test_filter_by_min_quality(self):
          """Test filtering code blocks by minimum quality"""
          subject = self.PDFExtractor.__new__(self.PDFExtractor)
          subject.min_quality = 5.0

          # Block whose quality sits above the threshold
          strong_block = dict(
              code='def calculate():\n return 42',
              language='python',
              quality=8.0,
          )
          # Block whose quality sits below the threshold
          weak_block = dict(
              code='x',
              language='unknown',
              quality=2.0,
          )

          # Only the strong block clears the configured minimum.
          self.assertGreaterEqual(strong_block['quality'], subject.min_quality)
          self.assertLess(weak_block['quality'], subject.min_quality)
  # Run the whole suite when this file is executed directly as a script.
  if __name__ == "__main__":
      unittest.main()