#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
docs_to_excel.py

Documents → Excel converter: rebuild a workbook from the prompts folders.

Rules (per STRUCTURE_AND_CONVERSION_SPEC.md):
- Every folder under prompt-library/prompts becomes a sheet; a leading "(N)_"
  ordering prefix, if present, is dropped from the sheet name
  (the legacy "prompt-category" folder is skipped)
- For each file matching "(r,c)_*.md", write its full text to Excel cell (r,c), 1-based
- The title part of the filename is ignored for the cell value
- Non-matching files are ignored
- The workbook is always generated from scratch and overwrites any existing
  file at --out (the optional clear/merge modes are not implemented by this
  script, and --clear is not an accepted flag)

Usage:
    python prompt-library/scripts/docs_to_excel.py --out "rebuilt.xlsx"
    # optional: --prompts-dir prompt-library/prompts
"""
from __future__ import annotations

import argparse
import re
from pathlib import Path
from typing import Dict, Tuple

import pandas as pd
from openpyxl import Workbook
  23. FOLDER_PREFIX_RE = re.compile(r"^\((\d+)\)_")
  24. FILE_NAME_RE = re.compile(r"^\((\d+),(\d+)\)_.*\.md$")
  25. def parse_args() -> argparse.Namespace:
  26. p = argparse.ArgumentParser(description="Rebuild Excel workbook from prompt folders")
  27. p.add_argument("--prompts-dir", type=str, default="prompt-library/prompts", help="Prompts root directory")
  28. p.add_argument("--out", type=str, required=True, help="Output Excel file path")
  29. return p.parse_args()
  30. def list_sheet_folders(prompts_root: Path) -> Dict[str, Path]:
  31. sheets: Dict[str, Path] = {}
  32. for child in sorted(prompts_root.iterdir()):
  33. if not child.is_dir():
  34. continue
  35. if child.name == "prompt-category":
  36. # legacy; skip auto-generated category
  37. continue
  38. sheets[child.name] = child
  39. return sheets
  40. def extract_rc(name: str) -> Tuple[int, int] | None:
  41. m = FILE_NAME_RE.match(name)
  42. if not m:
  43. return None
  44. r = int(m.group(1))
  45. c = int(m.group(2))
  46. return r, c
  47. def main() -> None:
  48. args = parse_args()
  49. prompts_root = Path(args.prompts_dir).resolve()
  50. out_path = Path(args.out).resolve()
  51. if not prompts_root.exists():
  52. raise FileNotFoundError(f"Prompts directory not found: {prompts_root}")
  53. sheet_folders = list_sheet_folders(prompts_root)
  54. if not sheet_folders:
  55. raise RuntimeError("No sheet folders found under prompts root")
  56. wb = Workbook()
  57. # remove default sheet
  58. default = wb.active
  59. wb.remove(default)
  60. for folder_name, folder_path in sheet_folders.items():
  61. # Recover original sheet name (try to drop ordering prefix "(N)_")
  62. m = FOLDER_PREFIX_RE.match(folder_name)
  63. sheet_name = folder_name[m.end():] if m else folder_name
  64. if not sheet_name:
  65. sheet_name = folder_name
  66. ws = wb.create_sheet(title=sheet_name)
  67. # Aggregate cells
  68. max_row = 0
  69. max_col = 0
  70. cells: Dict[Tuple[int, int], str] = {}
  71. for file in folder_path.iterdir():
  72. if not file.is_file() or not file.name.endswith('.md'):
  73. continue
  74. rc = extract_rc(file.name)
  75. if not rc:
  76. continue
  77. r, c = rc
  78. text = file.read_text(encoding='utf-8')
  79. # Trim a single trailing newline for cell value aesthetics
  80. if text.endswith("\n"):
  81. text = text[:-1]
  82. cells[(r, c)] = text
  83. if r > max_row:
  84. max_row = r
  85. if c > max_col:
  86. max_col = c
  87. # Write into sheet
  88. for (r, c), val in cells.items():
  89. ws.cell(row=r, column=c, value=val)
  90. # Save workbook
  91. out_path.parent.mkdir(parents=True, exist_ok=True)
  92. wb.save(str(out_path))
  93. print(f"✅ Rebuilt Excel saved to: {out_path}")
  94. if __name__ == "__main__":
  95. main()