main.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. r"""
  4. main.py
  5. Unified controller for prompt-library conversions.
  6. 支持的转换模式
  7. ==============
  8. 1. Excel → Docs : 将 Excel 工作簿转换为 Markdown 文档目录
  9. 2. Docs → Excel : 将 Markdown 文档目录还原为 Excel 工作簿
  10. 3. Docs → JSONL : 将 Markdown 文档转换为 JSONL 格式(保留完整元信息)
  11. 4. JSONL → Excel : 将 JSONL 转换为 Excel(单元格存储 JSON 对象)
  12. 5. Excel(JSONL) → JSONL : 将内部 JSONL 格式的 Excel 转换为 JSONL 文件(自动忽略"说明"工作表)
  13. 数据格式规范
  14. ============
  15. Excel 结构:
  16. - 每个工作表(sheet) = 一个分类(category)
  17. - 行(row) = 不同提示词
  18. - 列(col) = 版本迭代
  19. Excel(JSONL) 结构(内部 JSONL 格式):
  20. - 每个工作表(sheet) = 一个分类(category),"说明"工作表会被忽略
  21. - 每个单元格存储 JSON 对象: {"title": "...", "content": "..."}
  22. Docs 结构:
  23. - prompts/(N)_分类名/ # N = category_id
  24. - prompts/(N)_分类名/(r,c)_标题.md # r=row, c=col
  25. JSONL 格式 (每行一个 JSON 对象):
  26. {
  27. "category_id": 2, # 分类编号
  28. "category": "元提示词", # 分类名称
  29. "row": 1, # 原 Excel 行号
  30. "col": 1, # 原 Excel 列号(版本号)
  31. "title": "...", # 标题(截断80字符)
  32. "content": "..." # 完整内容
  33. }
  34. JSONL → Excel 单元格格式:
  35. {"title": "...", "content": "..."} # 只保留 title 和 content
  36. 目录约定
  37. ========
  38. - Excel 源文件: ./prompt_excel/
  39. - Docs 源目录: ./prompt_docs/
  40. - JSONL 文件: ./prompt_jsonl/
  41. - 输出:
  42. - Excel→Docs: ./prompt_docs/prompt_docs_YYYY_MMDD_HHMMSS/
  43. - Docs→Excel: ./prompt_excel/prompt_excel_YYYY_MMDD_HHMMSS/rebuilt.xlsx
  44. - Docs→JSONL: ./prompt_jsonl/{docs_name}.jsonl
  45. - JSONL→Excel: ./prompt_excel/{jsonl_name}.xlsx
  46. - Excel(JSONL)→JSONL: ./prompt_jsonl/{excel_name}_YYYY_MMDD_HHMMSS.jsonl
  47. 使用示例
  48. ========
  49. # 交互式选择
  50. python3 main.py
  51. # Excel → Docs
  52. python3 main.py --select "prompt_excel/prompt.xlsx"
  53. # Docs → Excel
  54. python3 main.py --select "prompt_docs/prompt_docs_2025_1222"
  55. # Docs → JSONL
  56. python3 main.py --select "prompt_docs/prompt_docs_2025_1222" --mode docs2jsonl
  57. # JSONL → Excel
  58. python3 main.py --select "prompt_jsonl/prompt_docs.jsonl"
  59. # Excel(JSONL) → JSONL(自动检测或显式指定)
  60. python3 main.py --select "prompt_excel/prompt_jsonl.xlsx"
  61. python3 main.py --select "prompt_excel/prompt_jsonl.xlsx" --mode jsonl_excel2jsonl
  62. """
  63. from __future__ import annotations
  64. import argparse
  65. import os
  66. import subprocess
  67. import sys
  68. from dataclasses import dataclass
  69. from pathlib import Path
  70. from typing import List, Optional, Sequence, Tuple
  71. # Optional Rich UI imports (fallback to plain if unavailable)
  72. try:
  73. from rich.console import Console
  74. from rich.layout import Layout
  75. from rich.panel import Panel
  76. from rich.table import Table
  77. from rich.text import Text
  78. from rich import box
  79. from rich.prompt import IntPrompt
  80. _RICH_AVAILABLE = True
  81. except Exception: # pragma: no cover
  82. _RICH_AVAILABLE = False
  83. # Optional InquirerPy for arrow-key selection
  84. try:
  85. from InquirerPy import inquirer as _inq
  86. _INQUIRER_AVAILABLE = True
  87. except Exception: # pragma: no cover
  88. _INQUIRER_AVAILABLE = False
@dataclass
class Candidate:
    """One selectable conversion source offered in the interactive menu."""

    index: int  # menu number the user types/selects to pick this entry
    kind: str  # "excel" | "jsonl_excel" | "docs" | "docs2jsonl" | "jsonl"
    path: Path  # file or directory that will be converted
    label: str  # text displayed next to the entry
  95. def get_repo_root() -> Path:
  96. return Path(__file__).resolve().parent
  97. def list_excel_files(excel_dir: Path) -> List[Path]:
  98. if not excel_dir.exists():
  99. return []
  100. return sorted([p for p in excel_dir.iterdir() if p.is_file() and p.suffix.lower() == ".xlsx"], key=lambda p: p.stat().st_mtime)
  101. def has_prompt_files(directory: Path) -> bool:
  102. if not directory.exists():
  103. return False
  104. # Detect files like "(r,c)_*.md" anywhere under the directory
  105. for file_path in directory.rglob("*.md"):
  106. name = file_path.name
  107. if name.startswith("(") and ")_" in name:
  108. return True
  109. return False
  110. def list_doc_sets(docs_dir: Path) -> List[Path]:
  111. results: List[Path] = []
  112. if not docs_dir.exists():
  113. return results
  114. # If the docs_dir itself looks like a set, include it
  115. if has_prompt_files(docs_dir):
  116. results.append(docs_dir)
  117. # Also include any immediate children that look like a docs set
  118. for child in sorted(docs_dir.iterdir()):
  119. if child.is_dir() and has_prompt_files(child):
  120. results.append(child)
  121. return results
  122. def run_start_convert(start_convert: Path, mode: str, project_root: Path, select_path: Optional[Path] = None, excel_dir: Optional[Path] = None, docs_dir: Optional[Path] = None) -> int:
  123. """Delegate to scripts/start_convert.py with appropriate flags."""
  124. python_exe = sys.executable
  125. cmd: List[str] = [python_exe, str(start_convert), "--mode", mode]
  126. if select_path is not None:
  127. # Always pass as repo-root-relative or absolute string
  128. cmd.extend(["--select", str(select_path)])
  129. if excel_dir is not None:
  130. cmd.extend(["--excel-dir", str(excel_dir)])
  131. if docs_dir is not None:
  132. cmd.extend(["--docs-dir", str(docs_dir)])
  133. # Execute in repo root to ensure relative defaults resolve correctly
  134. proc = subprocess.run(cmd, cwd=str(project_root))
  135. return proc.returncode
  136. def run_docs_to_jsonl(docs_path: Path, project_root: Path) -> int:
  137. """Convert docs folder to JSONL format."""
  138. import json
  139. import re
  140. prompts_dir = docs_path / "prompts"
  141. if not prompts_dir.exists():
  142. print(f"❌ 找不到 prompts 目录: {prompts_dir}")
  143. return 1
  144. output_dir = project_root / "prompt_jsonl"
  145. output_dir.mkdir(parents=True, exist_ok=True)
  146. output_file = output_dir / f"{docs_path.name}.jsonl"
  147. records = []
  148. for category_dir in sorted(prompts_dir.iterdir()):
  149. if not category_dir.is_dir():
  150. continue
  151. m = re.match(r'\((\d+)\)_(.+)', category_dir.name)
  152. cat_id, cat_name = (m.groups() if m else (0, category_dir.name))
  153. for md_file in sorted(category_dir.glob("*.md")):
  154. if md_file.name == "index.md":
  155. continue
  156. fm = re.match(r'\((\d+),(\d+)\)_(.+)\.md', md_file.name)
  157. if not fm:
  158. continue
  159. row, col, title = fm.groups()
  160. content = md_file.read_text(encoding='utf-8')
  161. records.append({
  162. "category_id": int(cat_id),
  163. "category": cat_name,
  164. "row": int(row),
  165. "col": int(col),
  166. "title": title[:80],
  167. "content": content
  168. })
  169. with open(output_file, 'w', encoding='utf-8') as f:
  170. for r in records:
  171. f.write(json.dumps(r, ensure_ascii=False) + '\n')
  172. print(f"✅ Docs→JSONL OK: {docs_path.name} → {output_file.relative_to(project_root)}")
  173. return 0
  174. def list_jsonl_files(jsonl_dir: Path) -> List[Path]:
  175. if not jsonl_dir.exists():
  176. return []
  177. return sorted([p for p in jsonl_dir.iterdir() if p.is_file() and p.suffix.lower() == ".jsonl"], key=lambda p: p.stat().st_mtime)
  178. def is_jsonl_excel(excel_path: Path) -> bool:
  179. """检测 Excel 是否为内部 JSONL 格式(单元格存储 JSON 对象)"""
  180. import json
  181. try:
  182. import pandas as pd
  183. except ImportError:
  184. return False
  185. try:
  186. xlsx = pd.ExcelFile(excel_path)
  187. for sheet in xlsx.sheet_names[:2]: # 检查前两个工作表
  188. if sheet == '说明':
  189. continue
  190. df = pd.read_excel(xlsx, sheet_name=sheet, header=None, nrows=1)
  191. if df.empty:
  192. continue
  193. first_val = str(df.iloc[0, 0]).strip() if not pd.isna(df.iloc[0, 0]) else ""
  194. # 检查列名或第一个单元格是否为 JSON
  195. first_col = str(df.columns[0]).strip() if len(df.columns) > 0 else ""
  196. for val in [first_col, first_val]:
  197. if val.startswith('{') and val.endswith('}'):
  198. try:
  199. obj = json.loads(val)
  200. if 'title' in obj and 'content' in obj:
  201. return True
  202. except:
  203. pass
  204. return False
  205. except:
  206. return False
  207. def run_jsonl_excel_to_jsonl(excel_path: Path, project_root: Path) -> int:
  208. """将内部 JSONL 格式的 Excel 转换为 JSONL 文件(忽略"说明"工作表)"""
  209. import json
  210. try:
  211. import pandas as pd
  212. except ImportError:
  213. print("❌ 需要 pandas: pip install pandas openpyxl")
  214. return 1
  215. xlsx = pd.ExcelFile(excel_path)
  216. output_lines = []
  217. cat_id = 0
  218. for sheet in xlsx.sheet_names:
  219. if sheet == '说明':
  220. continue
  221. cat_id += 1
  222. cat_name = sheet
  223. df = pd.read_excel(xlsx, sheet_name=sheet, header=None)
  224. # 检查列名是否是 JSON 数据
  225. for col_idx, col_name in enumerate(df.columns):
  226. col_str = str(col_name).strip()
  227. if col_str.startswith('{') and col_str.endswith('}'):
  228. try:
  229. obj = json.loads(col_str)
  230. if 'title' in obj and 'content' in obj:
  231. output_lines.append(json.dumps({
  232. "category_id": cat_id,
  233. "category": cat_name,
  234. "row": 1,
  235. "col": col_idx + 1,
  236. "title": obj["title"][:80],
  237. "content": obj["content"]
  238. }, ensure_ascii=False))
  239. except:
  240. pass
  241. # 处理数据行
  242. for row_idx, row in df.iterrows():
  243. for col_idx, val in enumerate(row):
  244. if pd.isna(val):
  245. continue
  246. val_str = str(val).strip()
  247. if val_str.startswith('{') and val_str.endswith('}'):
  248. try:
  249. obj = json.loads(val_str)
  250. if 'title' in obj and 'content' in obj:
  251. output_lines.append(json.dumps({
  252. "category_id": cat_id,
  253. "category": cat_name,
  254. "row": row_idx + 2,
  255. "col": col_idx + 1,
  256. "title": obj["title"][:80],
  257. "content": obj["content"]
  258. }, ensure_ascii=False))
  259. except:
  260. pass
  261. if not output_lines:
  262. print(f"❌ 未找到有效的 JSONL 数据: {excel_path}")
  263. return 1
  264. from datetime import datetime
  265. timestamp = datetime.now().strftime("%Y_%m%d_%H%M%S")
  266. output_dir = project_root / "prompt_jsonl"
  267. output_dir.mkdir(parents=True, exist_ok=True)
  268. output_file = output_dir / f"{excel_path.stem}_{timestamp}.jsonl"
  269. with open(output_file, 'w', encoding='utf-8') as f:
  270. f.write('\n'.join(output_lines))
  271. print(f"✅ Excel(JSONL)→JSONL OK: {excel_path.name} → {output_file.relative_to(project_root)} ({len(output_lines)} 条记录)")
  272. return 0
  273. def run_jsonl_to_excel(jsonl_path: Path, project_root: Path) -> int:
  274. """Convert JSONL to Excel, each cell contains the full JSON object as string."""
  275. import json
  276. from collections import defaultdict
  277. try:
  278. import pandas as pd
  279. except ImportError:
  280. print("❌ 需要 pandas: pip install pandas openpyxl")
  281. return 1
  282. records = []
  283. with open(jsonl_path, 'r', encoding='utf-8') as f:
  284. for line in f:
  285. if line.strip():
  286. records.append(json.loads(line))
  287. if not records:
  288. print(f"❌ JSONL 文件为空: {jsonl_path}")
  289. return 1
  290. # category -> {row -> {col -> json_string}}
  291. sheets_data: dict = defaultdict(lambda: defaultdict(dict))
  292. cat_id_map = {}
  293. for r in records:
  294. cat_name = r["category"]
  295. cat_id_map[r["category_id"]] = cat_name
  296. # 单元格内容只保留 title 和 content
  297. cell_data = {"title": r["title"], "content": r["content"]}
  298. sheets_data[cat_name][r["row"]][r["col"]] = json.dumps(cell_data, ensure_ascii=False)
  299. output_dir = project_root / "prompt_excel"
  300. output_dir.mkdir(parents=True, exist_ok=True)
  301. output_file = output_dir / f"{jsonl_path.stem}.xlsx"
  302. sorted_cats = sorted(cat_id_map.items(), key=lambda x: x[0])
  303. with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
  304. for cat_id, cat_name in sorted_cats:
  305. row_data = sheets_data[cat_name]
  306. if not row_data:
  307. continue
  308. max_row = max(row_data.keys())
  309. max_col = max(c for cols in row_data.values() for c in cols.keys())
  310. data = []
  311. for row_idx in range(1, max_row + 1):
  312. row_list = []
  313. for col_idx in range(1, max_col + 1):
  314. row_list.append(row_data.get(row_idx, {}).get(col_idx, ""))
  315. data.append(row_list)
  316. df = pd.DataFrame(data)
  317. sheet_name = cat_name[:31]
  318. df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
  319. print(f"✅ JSONL→Excel OK: {jsonl_path.name} → {output_file.relative_to(project_root)} ({len(sorted_cats)} 个工作表)")
  320. return 0
  321. def build_candidates(project_root: Path, excel_dir: Path, docs_dir: Path) -> List[Candidate]:
  322. candidates: List[Candidate] = []
  323. idx = 1
  324. jsonl_dir = project_root / "prompt_jsonl"
  325. for path in list_excel_files(excel_dir):
  326. label = f"{path.name}"
  327. # 检测是否为内部 JSONL 格式的 Excel
  328. if is_jsonl_excel(path):
  329. candidates.append(Candidate(index=idx, kind="jsonl_excel", path=path, label=label))
  330. else:
  331. candidates.append(Candidate(index=idx, kind="excel", path=path, label=label))
  332. idx += 1
  333. for path in list_doc_sets(docs_dir):
  334. display = path.relative_to(project_root) if path.is_absolute() else path
  335. # Docs → Excel
  336. candidates.append(Candidate(index=idx, kind="docs", path=path, label=f"{display}"))
  337. idx += 1
  338. # Docs → JSONL
  339. candidates.append(Candidate(index=idx, kind="docs2jsonl", path=path, label=f"{display}"))
  340. idx += 1
  341. for path in list_jsonl_files(jsonl_dir):
  342. label = f"{path.name}"
  343. candidates.append(Candidate(index=idx, kind="jsonl", path=path, label=label))
  344. idx += 1
  345. return candidates
  346. def select_interactively(candidates: Sequence[Candidate]) -> Optional[Candidate]:
  347. if not candidates:
  348. print("没有可用的 Excel 或 Docs 源。请将 .xlsx 放到 prompt_excel/ 或将文档放到 prompt_docs/ 下。")
  349. return None
  350. # Prefer arrow-key selection if available
  351. if _INQUIRER_AVAILABLE:
  352. try:
  353. choices = [
  354. {"name": f"[{c.kind.upper()}] {c.label}", "value": c.index}
  355. for c in candidates
  356. ]
  357. selection = _inq.select(
  358. message="选择要转换的源(上下箭头,回车确认,Ctrl+C 取消):",
  359. choices=choices,
  360. default=choices[0]["value"],
  361. ).execute()
  362. match = next((c for c in candidates if c.index == selection), None)
  363. return match
  364. except KeyboardInterrupt:
  365. return None
  366. if _RICH_AVAILABLE:
  367. console = Console()
  368. layout = Layout()
  369. layout.split_column(
  370. Layout(name="header", size=3),
  371. Layout(name="list"),
  372. Layout(name="footer", size=3),
  373. )
  374. header = Panel(Text("提示词库转换器", style="bold cyan"), subtitle="选择一个源开始转换", box=box.ROUNDED)
  375. table = Table(box=box.SIMPLE_HEAVY)
  376. table.add_column("编号", style="bold yellow", justify="right", width=4)
  377. table.add_column("类型", style="magenta", width=16)
  378. table.add_column("路径/名称", style="white")
  379. kind_labels = {"excel": "Excel→Docs", "docs": "Docs→Excel", "docs2jsonl": "Docs→JSONL", "jsonl": "JSONL→Excel", "jsonl_excel": "Excel(JSONL)→JSONL"}
  380. for c in candidates:
  381. table.add_row(str(c.index), kind_labels.get(c.kind, c.kind), c.label)
  382. layout["header"].update(header)
  383. layout["list"].update(Panel(table, title="可选源", border_style="cyan"))
  384. layout["footer"].update(Panel(Text("输入编号并回车(0 退出)", style="bold"), box=box.ROUNDED))
  385. console.print(layout)
  386. while True:
  387. try:
  388. choice = IntPrompt.ask("编号", default=0)
  389. except Exception:
  390. return None
  391. if choice == 0:
  392. return None
  393. match = next((c for c in candidates if c.index == choice), None)
  394. if match is not None:
  395. return match
  396. console.print("[red]编号不存在,请重试[/red]")
  397. # Plain fallback
  398. kind_labels = {"excel": "Excel→Docs", "docs": "Docs→Excel", "docs2jsonl": "Docs→JSONL", "jsonl": "JSONL→Excel", "jsonl_excel": "Excel(JSONL)→JSONL"}
  399. print("请选择一个源进行转换:")
  400. for c in candidates:
  401. print(f" {c.index:2d}. [{kind_labels.get(c.kind, c.kind)}] {c.label}")
  402. print(" 0. 退出")
  403. while True:
  404. try:
  405. raw = input("输入编号后回车:").strip()
  406. except EOFError:
  407. return None
  408. if not raw:
  409. continue
  410. if raw == "0":
  411. return None
  412. if not raw.isdigit():
  413. print("请输入有效数字。")
  414. continue
  415. choice = int(raw)
  416. match = next((c for c in candidates if c.index == choice), None)
  417. if match is None:
  418. print("编号不存在,请重试。")
  419. continue
  420. return match
  421. def parse_args() -> argparse.Namespace:
  422. p = argparse.ArgumentParser(description="prompt-library conversion controller")
  423. p.add_argument("--excel-dir", type=str, default="prompt_excel", help="Excel sources directory (default: prompt_excel)")
  424. p.add_argument("--docs-dir", type=str, default="prompt_docs", help="Docs sources directory (default: prompt_docs)")
  425. p.add_argument("--select", type=str, default=None, help="Path to a specific .xlsx file or a docs folder")
  426. p.add_argument("--mode", type=str, choices=["excel2docs", "docs2excel", "docs2jsonl", "jsonl2excel", "jsonl_excel2jsonl"], default=None, help="Conversion mode (auto-detect if not specified)")
  427. p.add_argument("--non-interactive", action="store_true", help="Do not prompt; require --select or exit")
  428. return p.parse_args()
  429. def main() -> int:
  430. repo_root = get_repo_root()
  431. start_convert = repo_root / "scripts" / "start_convert.py"
  432. if not start_convert.exists():
  433. print("找不到 scripts/start_convert.py。")
  434. return 1
  435. args = parse_args()
  436. excel_dir = (repo_root / args.excel_dir).resolve() if not Path(args.excel_dir).is_absolute() else Path(args.excel_dir).resolve()
  437. docs_dir = (repo_root / args.docs_dir).resolve() if not Path(args.docs_dir).is_absolute() else Path(args.docs_dir).resolve()
  438. # Non-interactive path with explicit selection
  439. if args.non_interactive or args.select:
  440. if not args.select:
  441. print("--non-interactive 需要配合 --select 使用。")
  442. return 2
  443. selected = Path(args.select)
  444. if not selected.is_absolute():
  445. selected = (repo_root / selected).resolve()
  446. if not selected.exists():
  447. print(f"选择的路径不存在: {selected}")
  448. return 2
  449. if selected.is_file() and selected.suffix.lower() == ".xlsx":
  450. # 检测是否为内部 JSONL 格式或显式指定模式
  451. if args.mode == "jsonl_excel2jsonl" or is_jsonl_excel(selected):
  452. return run_jsonl_excel_to_jsonl(selected, repo_root)
  453. return run_start_convert(start_convert, mode="excel2docs", project_root=repo_root, select_path=selected, excel_dir=excel_dir)
  454. if selected.is_file() and selected.suffix.lower() == ".jsonl":
  455. return run_jsonl_to_excel(selected, repo_root)
  456. if selected.is_dir():
  457. # Check mode or default to docs2excel
  458. if args.mode == "docs2jsonl":
  459. return run_docs_to_jsonl(selected, repo_root)
  460. return run_start_convert(start_convert, mode="docs2excel", project_root=repo_root, select_path=selected, docs_dir=docs_dir)
  461. print("无法识别的选择类型。")
  462. return 2
  463. # Interactive selection
  464. candidates = build_candidates(repo_root, excel_dir, docs_dir)
  465. chosen = select_interactively(candidates)
  466. if chosen is None:
  467. return 0
  468. if chosen.kind == "excel":
  469. return run_start_convert(start_convert, mode="excel2docs", project_root=repo_root, select_path=chosen.path, excel_dir=excel_dir)
  470. elif chosen.kind == "jsonl_excel":
  471. return run_jsonl_excel_to_jsonl(chosen.path, repo_root)
  472. elif chosen.kind == "docs2jsonl":
  473. return run_docs_to_jsonl(chosen.path, repo_root)
  474. elif chosen.kind == "jsonl":
  475. return run_jsonl_to_excel(chosen.path, repo_root)
  476. else:
  477. return run_start_convert(start_convert, mode="docs2excel", project_root=repo_root, select_path=chosen.path, docs_dir=docs_dir)
  478. if __name__ == "__main__":
  479. sys.exit(main())