pdf_scraper.py 15 KB


  1. #!/usr/bin/env python3
  2. """
PDF Documentation to Claude Skill Converter (Task B1.6)

Converts PDF documentation into Claude AI skills.
Uses pdf_extractor_poc.py for extraction, then builds the skill structure.

Usage:
    python3 pdf_scraper.py --config configs/manual_pdf.json
    python3 pdf_scraper.py --pdf manual.pdf --name myskill
    python3 pdf_scraper.py --from-json manual_extracted.json
  10. """
  11. import os
  12. import sys
  13. import json
  14. import re
  15. import argparse
  16. from pathlib import Path
  17. # Import the PDF extractor
  18. from .pdf_extractor_poc import PDFExtractor
  19. class PDFToSkillConverter:
  20. """Convert PDF documentation to Claude skill"""
  21. def __init__(self, config):
  22. self.config = config
  23. self.name = config['name']
  24. self.pdf_path = config.get('pdf_path', '')
  25. self.description = config.get('description', f'Documentation skill for {self.name}')
  26. # Paths
  27. self.skill_dir = f"output/{self.name}"
  28. self.data_file = f"output/{self.name}_extracted.json"
  29. # Extraction options
  30. self.extract_options = config.get('extract_options', {})
  31. # Categories
  32. self.categories = config.get('categories', {})
  33. # Extracted data
  34. self.extracted_data = None
  35. def extract_pdf(self):
  36. """Extract content from PDF using pdf_extractor_poc.py"""
  37. print(f"\n🔍 Extracting from PDF: {self.pdf_path}")
  38. # Create extractor with options
  39. extractor = PDFExtractor(
  40. self.pdf_path,
  41. verbose=True,
  42. chunk_size=self.extract_options.get('chunk_size', 10),
  43. min_quality=self.extract_options.get('min_quality', 5.0),
  44. extract_images=self.extract_options.get('extract_images', True),
  45. image_dir=f"{self.skill_dir}/assets/images",
  46. min_image_size=self.extract_options.get('min_image_size', 100)
  47. )
  48. # Extract
  49. result = extractor.extract_all()
  50. if not result:
  51. print("❌ Extraction failed")
  52. raise RuntimeError(f"Failed to extract PDF: {self.pdf_path}")
  53. # Save extracted data
  54. with open(self.data_file, 'w', encoding='utf-8') as f:
  55. json.dump(result, f, indent=2, ensure_ascii=False)
  56. print(f"\n💾 Saved extracted data to: {self.data_file}")
  57. self.extracted_data = result
  58. return True
  59. def load_extracted_data(self, json_path):
  60. """Load previously extracted data from JSON"""
  61. print(f"\n📂 Loading extracted data from: {json_path}")
  62. with open(json_path, 'r', encoding='utf-8') as f:
  63. self.extracted_data = json.load(f)
  64. print(f"✅ Loaded {self.extracted_data['total_pages']} pages")
  65. return True
  66. def categorize_content(self):
  67. """Categorize pages based on chapters or keywords"""
  68. print(f"\n📋 Categorizing content...")
  69. categorized = {}
  70. # Use chapters if available
  71. if self.extracted_data.get('chapters'):
  72. for chapter in self.extracted_data['chapters']:
  73. category_key = self._sanitize_filename(chapter['title'])
  74. categorized[category_key] = {
  75. 'title': chapter['title'],
  76. 'pages': []
  77. }
  78. # Assign pages to chapters
  79. for page in self.extracted_data['pages']:
  80. page_num = page['page_number']
  81. # Find which chapter this page belongs to
  82. for chapter in self.extracted_data['chapters']:
  83. if chapter['start_page'] <= page_num <= chapter['end_page']:
  84. category_key = self._sanitize_filename(chapter['title'])
  85. categorized[category_key]['pages'].append(page)
  86. break
  87. # Fall back to keyword-based categorization
  88. elif self.categories:
  89. # Check if categories is already in the right format (for tests)
  90. # If first value is a list of dicts (pages), use as-is
  91. first_value = next(iter(self.categories.values()))
  92. if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
  93. # Already categorized - convert to expected format
  94. for cat_key, pages in self.categories.items():
  95. categorized[cat_key] = {
  96. 'title': cat_key.replace('_', ' ').title(),
  97. 'pages': pages
  98. }
  99. else:
  100. # Keyword-based categorization
  101. # Initialize categories
  102. for cat_key, keywords in self.categories.items():
  103. categorized[cat_key] = {
  104. 'title': cat_key.replace('_', ' ').title(),
  105. 'pages': []
  106. }
  107. # Categorize by keywords
  108. for page in self.extracted_data['pages']:
  109. text = page.get('text', '').lower()
  110. headings_text = ' '.join([h['text'] for h in page.get('headings', [])]).lower()
  111. # Score against each category
  112. scores = {}
  113. for cat_key, keywords in self.categories.items():
  114. # Handle both string keywords and dict keywords (shouldn't happen, but be safe)
  115. if isinstance(keywords, list):
  116. score = sum(1 for kw in keywords
  117. if isinstance(kw, str) and (kw.lower() in text or kw.lower() in headings_text))
  118. else:
  119. score = 0
  120. if score > 0:
  121. scores[cat_key] = score
  122. # Assign to highest scoring category
  123. if scores:
  124. best_cat = max(scores, key=scores.get)
  125. categorized[best_cat]['pages'].append(page)
  126. else:
  127. # Default category
  128. if 'other' not in categorized:
  129. categorized['other'] = {'title': 'Other', 'pages': []}
  130. categorized['other']['pages'].append(page)
  131. else:
  132. # No categorization - use single category
  133. categorized['content'] = {
  134. 'title': 'Content',
  135. 'pages': self.extracted_data['pages']
  136. }
  137. print(f"✅ Created {len(categorized)} categories")
  138. for cat_key, cat_data in categorized.items():
  139. print(f" - {cat_data['title']}: {len(cat_data['pages'])} pages")
  140. return categorized
  141. def build_skill(self):
  142. """Build complete skill structure"""
  143. print(f"\n🏗️ Building skill: {self.name}")
  144. # Create directories
  145. os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
  146. os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
  147. os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)
  148. # Categorize content
  149. categorized = self.categorize_content()
  150. # Generate reference files
  151. print(f"\n📝 Generating reference files...")
  152. for cat_key, cat_data in categorized.items():
  153. self._generate_reference_file(cat_key, cat_data)
  154. # Generate index
  155. self._generate_index(categorized)
  156. # Generate SKILL.md
  157. self._generate_skill_md(categorized)
  158. print(f"\n✅ Skill built successfully: {self.skill_dir}/")
  159. print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
  160. def _generate_reference_file(self, cat_key, cat_data):
  161. """Generate a reference markdown file for a category"""
  162. filename = f"{self.skill_dir}/references/{cat_key}.md"
  163. with open(filename, 'w', encoding='utf-8') as f:
  164. f.write(f"# {cat_data['title']}\n\n")
  165. for page in cat_data['pages']:
  166. # Add headings as section markers
  167. if page.get('headings'):
  168. f.write(f"## {page['headings'][0]['text']}\n\n")
  169. # Add text content
  170. if page.get('text'):
  171. # Limit to first 1000 chars per page to avoid huge files
  172. text = page['text'][:1000]
  173. f.write(f"{text}\n\n")
  174. # Add code samples (check both 'code_samples' and 'code_blocks' for compatibility)
  175. code_list = page.get('code_samples') or page.get('code_blocks')
  176. if code_list:
  177. f.write("### Code Examples\n\n")
  178. for code in code_list[:3]: # Limit to top 3
  179. lang = code.get('language', '')
  180. f.write(f"```{lang}\n{code['code']}\n```\n\n")
  181. # Add images
  182. if page.get('images'):
  183. # Create assets directory if needed
  184. assets_dir = os.path.join(self.skill_dir, 'assets')
  185. os.makedirs(assets_dir, exist_ok=True)
  186. f.write("### Images\n\n")
  187. for img in page['images']:
  188. # Save image to assets
  189. img_filename = f"page_{page['page_number']}_img_{img['index']}.png"
  190. img_path = os.path.join(assets_dir, img_filename)
  191. with open(img_path, 'wb') as img_file:
  192. img_file.write(img['data'])
  193. # Add markdown image reference
  194. f.write(f"![Image {img['index']}](../assets/{img_filename})\n\n")
  195. f.write("---\n\n")
  196. print(f" Generated: {filename}")
  197. def _generate_index(self, categorized):
  198. """Generate reference index"""
  199. filename = f"{self.skill_dir}/references/index.md"
  200. with open(filename, 'w', encoding='utf-8') as f:
  201. f.write(f"# {self.name.title()} Documentation Reference\n\n")
  202. f.write("## Categories\n\n")
  203. for cat_key, cat_data in categorized.items():
  204. page_count = len(cat_data['pages'])
  205. f.write(f"- [{cat_data['title']}]({cat_key}.md) ({page_count} pages)\n")
  206. f.write("\n## Statistics\n\n")
  207. stats = self.extracted_data.get('quality_statistics', {})
  208. f.write(f"- Total pages: {self.extracted_data.get('total_pages', 0)}\n")
  209. f.write(f"- Code blocks: {self.extracted_data.get('total_code_blocks', 0)}\n")
  210. f.write(f"- Images: {self.extracted_data.get('total_images', 0)}\n")
  211. if stats:
  212. f.write(f"- Average code quality: {stats.get('average_quality', 0):.1f}/10\n")
  213. f.write(f"- Valid code blocks: {stats.get('valid_code_blocks', 0)}\n")
  214. print(f" Generated: {filename}")
  215. def _generate_skill_md(self, categorized):
  216. """Generate main SKILL.md file"""
  217. filename = f"{self.skill_dir}/SKILL.md"
  218. # Generate skill name (lowercase, hyphens only, max 64 chars)
  219. skill_name = self.name.lower().replace('_', '-').replace(' ', '-')[:64]
  220. # Truncate description to 1024 chars if needed
  221. desc = self.description[:1024] if len(self.description) > 1024 else self.description
  222. with open(filename, 'w', encoding='utf-8') as f:
  223. # Write YAML frontmatter
  224. f.write(f"---\n")
  225. f.write(f"name: {skill_name}\n")
  226. f.write(f"description: {desc}\n")
  227. f.write(f"---\n\n")
  228. f.write(f"# {self.name.title()} Documentation Skill\n\n")
  229. f.write(f"{self.description}\n\n")
  230. f.write("## When to use this skill\n\n")
  231. f.write(f"Use this skill when the user asks about {self.name} documentation, ")
  232. f.write("including API references, tutorials, examples, and best practices.\n\n")
  233. f.write("## What's included\n\n")
  234. f.write("This skill contains:\n\n")
  235. for cat_key, cat_data in categorized.items():
  236. f.write(f"- **{cat_data['title']}**: {len(cat_data['pages'])} pages\n")
  237. f.write("\n## Quick Reference\n\n")
  238. # Get high-quality code samples
  239. all_code = []
  240. for page in self.extracted_data['pages']:
  241. all_code.extend(page.get('code_samples', []))
  242. # Sort by quality and get top 5
  243. all_code.sort(key=lambda x: x.get('quality_score', 0), reverse=True)
  244. top_code = all_code[:5]
  245. if top_code:
  246. f.write("### Top Code Examples\n\n")
  247. for i, code in enumerate(top_code, 1):
  248. lang = code['language']
  249. quality = code.get('quality_score', 0)
  250. f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
  251. f.write(f"```{lang}\n{code['code'][:300]}...\n```\n\n")
  252. f.write("## Navigation\n\n")
  253. f.write("See `references/index.md` for complete documentation structure.\n\n")
  254. # Add language statistics
  255. langs = self.extracted_data.get('languages_detected', {})
  256. if langs:
  257. f.write("## Languages Covered\n\n")
  258. for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
  259. f.write(f"- {lang}: {count} examples\n")
  260. print(f" Generated: {filename}")
  261. def _sanitize_filename(self, name):
  262. """Convert string to safe filename"""
  263. # Remove special chars, replace spaces with underscores
  264. safe = re.sub(r'[^\w\s-]', '', name.lower())
  265. safe = re.sub(r'[-\s]+', '_', safe)
  266. return safe
  267. def main():
  268. parser = argparse.ArgumentParser(
  269. description='Convert PDF documentation to Claude skill',
  270. formatter_class=argparse.RawDescriptionHelpFormatter
  271. )
  272. parser.add_argument('--config', help='PDF config JSON file')
  273. parser.add_argument('--pdf', help='Direct PDF file path')
  274. parser.add_argument('--name', help='Skill name (with --pdf)')
  275. parser.add_argument('--from-json', help='Build skill from extracted JSON')
  276. parser.add_argument('--description', help='Skill description')
  277. args = parser.parse_args()
  278. # Validate inputs
  279. if not (args.config or args.pdf or args.from_json):
  280. parser.error("Must specify --config, --pdf, or --from-json")
  281. # Load or create config
  282. if args.config:
  283. with open(args.config, 'r') as f:
  284. config = json.load(f)
  285. elif args.from_json:
  286. # Build from extracted JSON
  287. name = Path(args.from_json).stem.replace('_extracted', '')
  288. config = {
  289. 'name': name,
  290. 'description': args.description or f'Documentation skill for {name}'
  291. }
  292. converter = PDFToSkillConverter(config)
  293. converter.load_extracted_data(args.from_json)
  294. converter.build_skill()
  295. return
  296. else:
  297. # Direct PDF mode
  298. if not args.name:
  299. parser.error("Must specify --name with --pdf")
  300. config = {
  301. 'name': args.name,
  302. 'pdf_path': args.pdf,
  303. 'description': args.description or f'Documentation skill for {args.name}',
  304. 'extract_options': {
  305. 'chunk_size': 10,
  306. 'min_quality': 5.0,
  307. 'extract_images': True,
  308. 'min_image_size': 100
  309. }
  310. }
  311. # Create converter
  312. converter = PDFToSkillConverter(config)
  313. # Extract if needed
  314. if config.get('pdf_path'):
  315. if not converter.extract_pdf():
  316. sys.exit(1)
  317. # Build skill
  318. converter.build_skill()
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()