| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- #!/usr/bin/env python3
- """
- 将 prompt_docs 目录下的 md 文件转换为 JSONL 格式
- 用法:
- python md_to_jsonl.py <prompt_docs目录>
- python md_to_jsonl.py prompt_docs/prompt_docs_2025_1222_004537
- """
- import json
- import re
- import sys
- from pathlib import Path
- REPO_ROOT = Path(__file__).resolve().parent.parent
- OUTPUT_DIR = REPO_ROOT / "prompt_jsonl"
- def convert(docs_dir: Path):
- prompts_dir = docs_dir / "prompts"
- if not prompts_dir.exists():
- print(f"❌ 找不到 prompts 目录: {prompts_dir}")
- return
-
- # 输出文件名基于输入目录名
- output_file = OUTPUT_DIR / f"{docs_dir.name}.jsonl"
- OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-
- records = []
- for category_dir in sorted(prompts_dir.iterdir()):
- if not category_dir.is_dir():
- continue
-
- m = re.match(r'\((\d+)\)_(.+)', category_dir.name)
- cat_id, cat_name = (m.groups() if m else (0, category_dir.name))
-
- for md_file in sorted(category_dir.glob("*.md")):
- if md_file.name == "index.md":
- continue
-
- fm = re.match(r'\((\d+),(\d+)\)_(.+)\.md', md_file.name)
- if not fm:
- continue
-
- row, col, title = fm.groups()
- content = md_file.read_text(encoding='utf-8')
-
- records.append({
- "category_id": int(cat_id),
- "category": cat_name,
- "row": int(row),
- "col": int(col),
- "title": title[:80],
- "content": content
- })
-
- with open(output_file, 'w', encoding='utf-8') as f:
- for r in records:
- f.write(json.dumps(r, ensure_ascii=False) + '\n')
-
- print(f"✅ 转换完成: {len(records)} 条 → {output_file}")
- def main():
- if len(sys.argv) < 2:
- print(__doc__)
- sys.exit(1)
-
- docs_dir = Path(sys.argv[1])
- if not docs_dir.is_absolute():
- docs_dir = REPO_ROOT / docs_dir
-
- convert(docs_dir)
- if __name__ == "__main__":
- main()
|