md_to_jsonl.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. #!/usr/bin/env python3
  2. """
  3. 将 prompt_docs 目录下的 md 文件转换为 JSONL 格式
  4. 用法:
  5. python md_to_jsonl.py <prompt_docs目录>
  6. python md_to_jsonl.py prompt_docs/prompt_docs_2025_1222_004537
  7. """
  8. import json
  9. import re
  10. import sys
  11. from pathlib import Path
  12. REPO_ROOT = Path(__file__).resolve().parent.parent
  13. OUTPUT_DIR = REPO_ROOT / "prompt_jsonl"
  14. def convert(docs_dir: Path):
  15. prompts_dir = docs_dir / "prompts"
  16. if not prompts_dir.exists():
  17. print(f"❌ 找不到 prompts 目录: {prompts_dir}")
  18. return
  19. # 输出文件名基于输入目录名
  20. output_file = OUTPUT_DIR / f"{docs_dir.name}.jsonl"
  21. OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
  22. records = []
  23. for category_dir in sorted(prompts_dir.iterdir()):
  24. if not category_dir.is_dir():
  25. continue
  26. m = re.match(r'\((\d+)\)_(.+)', category_dir.name)
  27. cat_id, cat_name = (m.groups() if m else (0, category_dir.name))
  28. for md_file in sorted(category_dir.glob("*.md")):
  29. if md_file.name == "index.md":
  30. continue
  31. fm = re.match(r'\((\d+),(\d+)\)_(.+)\.md', md_file.name)
  32. if not fm:
  33. continue
  34. row, col, title = fm.groups()
  35. content = md_file.read_text(encoding='utf-8')
  36. records.append({
  37. "category_id": int(cat_id),
  38. "category": cat_name,
  39. "row": int(row),
  40. "col": int(col),
  41. "title": title[:80],
  42. "content": content
  43. })
  44. with open(output_file, 'w', encoding='utf-8') as f:
  45. for r in records:
  46. f.write(json.dumps(r, ensure_ascii=False) + '\n')
  47. print(f"✅ 转换完成: {len(records)} 条 → {output_file}")
  48. def main():
  49. if len(sys.argv) < 2:
  50. print(__doc__)
  51. sys.exit(1)
  52. docs_dir = Path(sys.argv[1])
  53. if not docs_dir.is_absolute():
  54. docs_dir = REPO_ROOT / docs_dir
  55. convert(docs_dir)
  56. if __name__ == "__main__":
  57. main()