| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- r"""
- main.py
- Unified controller for prompt-library conversions.
- 支持的转换模式
- ==============
- 1. Excel → Docs : 将 Excel 工作簿转换为 Markdown 文档目录
- 2. Docs → Excel : 将 Markdown 文档目录还原为 Excel 工作簿
- 3. Docs → JSONL : 将 Markdown 文档转换为 JSONL 格式(保留完整元信息)
- 4. JSONL → Excel : 将 JSONL 转换为 Excel(单元格存储 JSON 对象)
- 数据格式规范
- ============
- Excel 结构:
- - 每个工作表(sheet) = 一个分类(category)
- - 行(row) = 不同提示词
- - 列(col) = 版本迭代
- Docs 结构:
- - prompts/(N)_分类名/ # N = category_id
- - prompts/(N)_分类名/(r,c)_标题.md # r=row, c=col
- JSONL 格式 (每行一个 JSON 对象):
- {
- "category_id": 2, # 分类编号
- "category": "元提示词", # 分类名称
- "row": 1, # 原 Excel 行号
- "col": 1, # 原 Excel 列号(版本号)
- "title": "...", # 标题(截断80字符)
- "content": "..." # 完整内容
- }
- JSONL → Excel 单元格格式:
- {"title": "...", "content": "..."} # 只保留 title 和 content
- 目录约定
- ========
- - Excel 源文件: ./prompt_excel/
- - Docs 源目录: ./prompt_docs/
- - JSONL 文件: ./prompt_jsonl/
- - 输出:
- - Excel→Docs: ./prompt_docs/prompt_docs_YYYY_MMDD_HHMMSS/
- - Docs→Excel: ./prompt_excel/prompt_excel_YYYY_MMDD_HHMMSS/rebuilt.xlsx
- - Docs→JSONL: ./prompt_jsonl/{docs_name}.jsonl
- - JSONL→Excel: ./prompt_excel/{jsonl_name}.xlsx
- 使用示例
- ========
- # 交互式选择
- python3 main.py
- # Excel → Docs
- python3 main.py --select "prompt_excel/prompt.xlsx"
- # Docs → Excel
- python3 main.py --select "prompt_docs/prompt_docs_2025_1222"
- # Docs → JSONL
- python3 main.py --select "prompt_docs/prompt_docs_2025_1222" --mode docs2jsonl
- # JSONL → Excel
- python3 main.py --select "prompt_jsonl/prompt_docs.jsonl"
- """
- from __future__ import annotations
- import argparse
- import os
- import subprocess
- import sys
- from dataclasses import dataclass
- from pathlib import Path
- from typing import List, Optional, Sequence, Tuple
- # Optional Rich UI imports (fallback to plain if unavailable)
- try:
- from rich.console import Console
- from rich.layout import Layout
- from rich.panel import Panel
- from rich.table import Table
- from rich.text import Text
- from rich import box
- from rich.prompt import IntPrompt
- _RICH_AVAILABLE = True
- except Exception: # pragma: no cover
- _RICH_AVAILABLE = False
- # Optional InquirerPy for arrow-key selection
- try:
- from InquirerPy import inquirer as _inq
- _INQUIRER_AVAILABLE = True
- except Exception: # pragma: no cover
- _INQUIRER_AVAILABLE = False
- @dataclass
- class Candidate:
- index: int
- kind: str # "excel" | "docs" | "docs2jsonl" | "jsonl"
- path: Path
- label: str
- def get_repo_root() -> Path:
- return Path(__file__).resolve().parent
- def list_excel_files(excel_dir: Path) -> List[Path]:
- if not excel_dir.exists():
- return []
- return sorted([p for p in excel_dir.iterdir() if p.is_file() and p.suffix.lower() == ".xlsx"], key=lambda p: p.stat().st_mtime)
- def has_prompt_files(directory: Path) -> bool:
- if not directory.exists():
- return False
- # Detect files like "(r,c)_*.md" anywhere under the directory
- for file_path in directory.rglob("*.md"):
- name = file_path.name
- if name.startswith("(") and ")_" in name:
- return True
- return False
- def list_doc_sets(docs_dir: Path) -> List[Path]:
- results: List[Path] = []
- if not docs_dir.exists():
- return results
- # If the docs_dir itself looks like a set, include it
- if has_prompt_files(docs_dir):
- results.append(docs_dir)
- # Also include any immediate children that look like a docs set
- for child in sorted(docs_dir.iterdir()):
- if child.is_dir() and has_prompt_files(child):
- results.append(child)
- return results
- def run_start_convert(start_convert: Path, mode: str, project_root: Path, select_path: Optional[Path] = None, excel_dir: Optional[Path] = None, docs_dir: Optional[Path] = None) -> int:
- """Delegate to scripts/start_convert.py with appropriate flags."""
- python_exe = sys.executable
- cmd: List[str] = [python_exe, str(start_convert), "--mode", mode]
- if select_path is not None:
- # Always pass as repo-root-relative or absolute string
- cmd.extend(["--select", str(select_path)])
- if excel_dir is not None:
- cmd.extend(["--excel-dir", str(excel_dir)])
- if docs_dir is not None:
- cmd.extend(["--docs-dir", str(docs_dir)])
- # Execute in repo root to ensure relative defaults resolve correctly
- proc = subprocess.run(cmd, cwd=str(project_root))
- return proc.returncode
- def run_docs_to_jsonl(docs_path: Path, project_root: Path) -> int:
- """Convert docs folder to JSONL format."""
- import json
- import re
-
- prompts_dir = docs_path / "prompts"
- if not prompts_dir.exists():
- print(f"❌ 找不到 prompts 目录: {prompts_dir}")
- return 1
-
- output_dir = project_root / "prompt_jsonl"
- output_dir.mkdir(parents=True, exist_ok=True)
- output_file = output_dir / f"{docs_path.name}.jsonl"
-
- records = []
- for category_dir in sorted(prompts_dir.iterdir()):
- if not category_dir.is_dir():
- continue
-
- m = re.match(r'\((\d+)\)_(.+)', category_dir.name)
- cat_id, cat_name = (m.groups() if m else (0, category_dir.name))
-
- for md_file in sorted(category_dir.glob("*.md")):
- if md_file.name == "index.md":
- continue
-
- fm = re.match(r'\((\d+),(\d+)\)_(.+)\.md', md_file.name)
- if not fm:
- continue
-
- row, col, title = fm.groups()
- content = md_file.read_text(encoding='utf-8')
-
- records.append({
- "category_id": int(cat_id),
- "category": cat_name,
- "row": int(row),
- "col": int(col),
- "title": title[:80],
- "content": content
- })
-
- with open(output_file, 'w', encoding='utf-8') as f:
- for r in records:
- f.write(json.dumps(r, ensure_ascii=False) + '\n')
-
- print(f"✅ Docs→JSONL OK: {docs_path.name} → {output_file.relative_to(project_root)}")
- return 0
- def list_jsonl_files(jsonl_dir: Path) -> List[Path]:
- if not jsonl_dir.exists():
- return []
- return sorted([p for p in jsonl_dir.iterdir() if p.is_file() and p.suffix.lower() == ".jsonl"], key=lambda p: p.stat().st_mtime)
- def run_jsonl_to_excel(jsonl_path: Path, project_root: Path) -> int:
- """Convert JSONL to Excel, each cell contains the full JSON object as string."""
- import json
- from collections import defaultdict
- try:
- import pandas as pd
- except ImportError:
- print("❌ 需要 pandas: pip install pandas openpyxl")
- return 1
-
- records = []
- with open(jsonl_path, 'r', encoding='utf-8') as f:
- for line in f:
- if line.strip():
- records.append(json.loads(line))
-
- if not records:
- print(f"❌ JSONL 文件为空: {jsonl_path}")
- return 1
-
- # category -> {row -> {col -> json_string}}
- sheets_data: dict = defaultdict(lambda: defaultdict(dict))
- cat_id_map = {}
-
- for r in records:
- cat_name = r["category"]
- cat_id_map[r["category_id"]] = cat_name
- # 单元格内容只保留 title 和 content
- cell_data = {"title": r["title"], "content": r["content"]}
- sheets_data[cat_name][r["row"]][r["col"]] = json.dumps(cell_data, ensure_ascii=False)
-
- output_dir = project_root / "prompt_excel"
- output_dir.mkdir(parents=True, exist_ok=True)
- output_file = output_dir / f"{jsonl_path.stem}.xlsx"
-
- sorted_cats = sorted(cat_id_map.items(), key=lambda x: x[0])
-
- with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
- for cat_id, cat_name in sorted_cats:
- row_data = sheets_data[cat_name]
- if not row_data:
- continue
-
- max_row = max(row_data.keys())
- max_col = max(c for cols in row_data.values() for c in cols.keys())
-
- data = []
- for row_idx in range(1, max_row + 1):
- row_list = []
- for col_idx in range(1, max_col + 1):
- row_list.append(row_data.get(row_idx, {}).get(col_idx, ""))
- data.append(row_list)
-
- df = pd.DataFrame(data)
- sheet_name = cat_name[:31]
- df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
-
- print(f"✅ JSONL→Excel OK: {jsonl_path.name} → {output_file.relative_to(project_root)} ({len(sorted_cats)} 个工作表)")
- return 0
- def build_candidates(project_root: Path, excel_dir: Path, docs_dir: Path) -> List[Candidate]:
- candidates: List[Candidate] = []
- idx = 1
- jsonl_dir = project_root / "prompt_jsonl"
-
- for path in list_excel_files(excel_dir):
- label = f"{path.name}"
- candidates.append(Candidate(index=idx, kind="excel", path=path, label=label))
- idx += 1
- for path in list_doc_sets(docs_dir):
- display = path.relative_to(project_root) if path.is_absolute() else path
- # Docs → Excel
- candidates.append(Candidate(index=idx, kind="docs", path=path, label=f"{display}"))
- idx += 1
- # Docs → JSONL
- candidates.append(Candidate(index=idx, kind="docs2jsonl", path=path, label=f"{display}"))
- idx += 1
- for path in list_jsonl_files(jsonl_dir):
- label = f"{path.name}"
- candidates.append(Candidate(index=idx, kind="jsonl", path=path, label=label))
- idx += 1
- return candidates
- def select_interactively(candidates: Sequence[Candidate]) -> Optional[Candidate]:
- if not candidates:
- print("没有可用的 Excel 或 Docs 源。请将 .xlsx 放到 prompt_excel/ 或将文档放到 prompt_docs/ 下。")
- return None
- # Prefer arrow-key selection if available
- if _INQUIRER_AVAILABLE:
- try:
- choices = [
- {"name": f"[{c.kind.upper()}] {c.label}", "value": c.index}
- for c in candidates
- ]
- selection = _inq.select(
- message="选择要转换的源(上下箭头,回车确认,Ctrl+C 取消):",
- choices=choices,
- default=choices[0]["value"],
- ).execute()
- match = next((c for c in candidates if c.index == selection), None)
- return match
- except KeyboardInterrupt:
- return None
- if _RICH_AVAILABLE:
- console = Console()
- layout = Layout()
- layout.split_column(
- Layout(name="header", size=3),
- Layout(name="list"),
- Layout(name="footer", size=3),
- )
- header = Panel(Text("提示词库转换器", style="bold cyan"), subtitle="选择一个源开始转换", box=box.ROUNDED)
- table = Table(box=box.SIMPLE_HEAVY)
- table.add_column("编号", style="bold yellow", justify="right", width=4)
- table.add_column("类型", style="magenta", width=12)
- table.add_column("路径/名称", style="white")
- kind_labels = {"excel": "Excel→Docs", "docs": "Docs→Excel", "docs2jsonl": "Docs→JSONL", "jsonl": "JSONL→Excel"}
- for c in candidates:
- table.add_row(str(c.index), kind_labels.get(c.kind, c.kind), c.label)
- layout["header"].update(header)
- layout["list"].update(Panel(table, title="可选源", border_style="cyan"))
- layout["footer"].update(Panel(Text("输入编号并回车(0 退出)", style="bold"), box=box.ROUNDED))
- console.print(layout)
- while True:
- try:
- choice = IntPrompt.ask("编号", default=0)
- except Exception:
- return None
- if choice == 0:
- return None
- match = next((c for c in candidates if c.index == choice), None)
- if match is not None:
- return match
- console.print("[red]编号不存在,请重试[/red]")
- # Plain fallback
- kind_labels = {"excel": "Excel→Docs", "docs": "Docs→Excel", "docs2jsonl": "Docs→JSONL", "jsonl": "JSONL→Excel"}
- print("请选择一个源进行转换:")
- for c in candidates:
- print(f" {c.index:2d}. [{kind_labels.get(c.kind, c.kind)}] {c.label}")
- print(" 0. 退出")
- while True:
- try:
- raw = input("输入编号后回车:").strip()
- except EOFError:
- return None
- if not raw:
- continue
- if raw == "0":
- return None
- if not raw.isdigit():
- print("请输入有效数字。")
- continue
- choice = int(raw)
- match = next((c for c in candidates if c.index == choice), None)
- if match is None:
- print("编号不存在,请重试。")
- continue
- return match
- def parse_args() -> argparse.Namespace:
- p = argparse.ArgumentParser(description="prompt-library conversion controller")
- p.add_argument("--excel-dir", type=str, default="prompt_excel", help="Excel sources directory (default: prompt_excel)")
- p.add_argument("--docs-dir", type=str, default="prompt_docs", help="Docs sources directory (default: prompt_docs)")
- p.add_argument("--select", type=str, default=None, help="Path to a specific .xlsx file or a docs folder")
- p.add_argument("--mode", type=str, choices=["excel2docs", "docs2excel", "docs2jsonl", "jsonl2excel"], default=None, help="Conversion mode (auto-detect if not specified)")
- p.add_argument("--non-interactive", action="store_true", help="Do not prompt; require --select or exit")
- return p.parse_args()
- def main() -> int:
- repo_root = get_repo_root()
- start_convert = repo_root / "scripts" / "start_convert.py"
- if not start_convert.exists():
- print("找不到 scripts/start_convert.py。")
- return 1
- args = parse_args()
- excel_dir = (repo_root / args.excel_dir).resolve() if not Path(args.excel_dir).is_absolute() else Path(args.excel_dir).resolve()
- docs_dir = (repo_root / args.docs_dir).resolve() if not Path(args.docs_dir).is_absolute() else Path(args.docs_dir).resolve()
- # Non-interactive path with explicit selection
- if args.non_interactive or args.select:
- if not args.select:
- print("--non-interactive 需要配合 --select 使用。")
- return 2
- selected = Path(args.select)
- if not selected.is_absolute():
- selected = (repo_root / selected).resolve()
- if not selected.exists():
- print(f"选择的路径不存在: {selected}")
- return 2
- if selected.is_file() and selected.suffix.lower() == ".xlsx":
- return run_start_convert(start_convert, mode="excel2docs", project_root=repo_root, select_path=selected, excel_dir=excel_dir)
- if selected.is_file() and selected.suffix.lower() == ".jsonl":
- return run_jsonl_to_excel(selected, repo_root)
- if selected.is_dir():
- # Check mode or default to docs2excel
- if args.mode == "docs2jsonl":
- return run_docs_to_jsonl(selected, repo_root)
- return run_start_convert(start_convert, mode="docs2excel", project_root=repo_root, select_path=selected, docs_dir=docs_dir)
- print("无法识别的选择类型。")
- return 2
- # Interactive selection
- candidates = build_candidates(repo_root, excel_dir, docs_dir)
- chosen = select_interactively(candidates)
- if chosen is None:
- return 0
- if chosen.kind == "excel":
- return run_start_convert(start_convert, mode="excel2docs", project_root=repo_root, select_path=chosen.path, excel_dir=excel_dir)
- elif chosen.kind == "docs2jsonl":
- return run_docs_to_jsonl(chosen.path, repo_root)
- elif chosen.kind == "jsonl":
- return run_jsonl_to_excel(chosen.path, repo_root)
- else:
- return run_start_convert(start_convert, mode="docs2excel", project_root=repo_root, select_path=chosen.path, docs_dir=docs_dir)
- if __name__ == "__main__":
- sys.exit(main())
|