# refactor_jsonl.py
  1. import json
  2. import os
  3. mapping = {
  4. # 编程技术
  5. "软件工程,glue_coding_用提示词": "编程技术",
  6. "前端复刻流程": "编程技术",
  7. "输入转单行JSON": "编程技术",
  8. "序列图生成": "编程技术",
  9. "流程图": "编程技术",
  10. "函数化万物": "编程技术",
  11. "编程知识库": "编程技术",
  12. "网页UI逆向分析提示词": "编程技术",
  13. "用户优化前端设计": "编程技术",
  14. "图像特征提取": "编程技术",
  15. "前端通用设计": "编程技术",
  16. # 逻辑工具箱
  17. "哲学工具箱": "逻辑工具箱",
  18. "逻辑工具箱": "逻辑工具箱",
  19. "批判性思维分析": "逻辑工具箱",
  20. "思维模型": "逻辑工具箱",
  21. "政治批判工具箱": "逻辑工具箱",
  22. "未来视角": "逻辑工具箱",
  23. "层级结构分析": "逻辑工具箱",
  24. "问题分类识别": "逻辑工具箱",
  25. "分析": "逻辑工具箱",
  26. "终极本质分析": "逻辑工具箱",
  27. "事实核查": "逻辑工具箱",
  28. "关键词图谱": "逻辑工具箱",
  29. "语言分析元prompt": "逻辑工具箱",
  30. "逻辑分析": "逻辑工具箱",
  31. "黄金圈解释": "逻辑工具箱",
  32. "谋士": "逻辑工具箱",
  33. "经验": "逻辑工具箱",
  34. "道": "逻辑工具箱",
  35. "法": "逻辑工具箱",
  36. "术": "逻辑工具箱",
  37. "器": "逻辑工具箱",
  38. "心经口诀创作提示词": "逻辑工具箱",
  39. "临界知识": "逻辑工具箱",
  40. "项目分析": "逻辑工具箱",
  41. "对话提问": "逻辑工具箱",
  42. "思维导图": "逻辑工具箱",
  43. # 内容创作
  44. "文案逆向": "内容创作",
  45. "x_prompt收集": "内容创作",
  46. "x提示词收集": "内容创作",
  47. "x爆款文案生成器": "内容创作",
  48. "推文制作提示词": "内容创作",
  49. "李继刚文选": "内容创作",
  50. "解释提示词": "内容创作",
  51. "一句话描述任何内容": "内容创作",
  52. "子弹总结": "内容创作",
  53. "文本转md语法电子书处理": "内容创作",
  54. "排版和图片,视频转文本": "内容创作",
  55. "艺术风格描述": "内容创作",
  56. "视频生成提示词": "内容创作",
  57. "图片逆向": "内容创作",
  58. "排版": "内容创作",
  59. "内容提炼": "内容创作",
  60. "简讯提示词": "内容创作",
  61. "艺术": "内容创作",
  62. "人话写作": "内容创作",
  63. "小红书": "内容创作",
  64. "组织语言": "内容创作",
  65. "正向人物生平报告官方文案": "内容创作",
  66. "gemini字幕处理": "内容创作",
  67. # 学习教育
  68. "学习提示词": "学习教育",
  69. "学习用提示词": "学习教育",
  70. "ai学习用提示词": "学习教育",
  71. "书籍结构化分析": "学习教育",
  72. "典籍句子学习": "学习教育",
  73. "anki卡片格式输出": "学习教育",
  74. "notebookllm用提示词": "学习教育",
  75. "英文学习": "学习教育",
  76. "速成学习": "学习教育",
  77. "论文解读": "学习教育",
  78. "真传一句话": "学习教育",
  79. "学习音频": "学习教育",
  80. "豆包听书": "学习教育",
  81. "最小知识框架": "学习教育",
  82. # 商业分析
  83. "grok商业金融分析提示词": "商业分析",
  84. "投资调研": "商业分析",
  85. "行业分析": "商业分析",
  86. "需求对齐": "商业分析",
  87. "需求结构化描述": "商业分析",
  88. "麦肯锡行业分析": "商业分析",
  89. "产品策略": "商业分析",
  90. "行业咨询": "商业分析",
  91. "需求解析": "商业分析",
  92. "SOP制作": "商业分析",
  93. # 提示词工程
  94. "元提示词": "提示词工程",
  95. "提示词模块": "提示词工程",
  96. "根据内容逆向提示词": "提示词工程",
  97. "系统提示词": "提示词工程",
  98. "AI使用思维": "提示词工程",
  99. "使用ai的思维": "提示词工程",
  100. "最小字数系统提示词": "提示词工程",
  101. "ChatGPT": "提示词工程",
  102. "Reddit提示词": "提示词工程",
  103. "好prompt生成器": "提示词工程",
  104. "思维协议": "提示词工程",
  105. "grok抓取提示词": "提示词工程",
  106. # 其他
  107. "AI_交易系统提示词": "综合杂项",
  108. "面向CZ": "综合杂项",
  109. }
  110. id_map = {
  111. "编程技术": 1,
  112. "逻辑工具箱": 2,
  113. "内容创作": 3,
  114. "学习教育": 4,
  115. "商业分析": 5,
  116. "提示词工程": 6,
  117. "综合杂项": 7
  118. }
  119. input_file = "prompt_jsonl/prompt_docs_2025_1222_004537.jsonl"
  120. output_file = "prompt_jsonl/prompt_docs_refactored.jsonl"
  121. def process():
  122. stats = {}
  123. with open(input_file, 'r', encoding='utf-8') as fin, \
  124. open(output_file, 'w', encoding='utf-8') as fout:
  125. for line in fin:
  126. if not line.strip(): continue
  127. data = json.loads(line)
  128. old_cat = data.get('category', '')
  129. new_cat = mapping.get(old_cat, "综合杂项")
  130. # Keep original category in tags if it doesn't exist?
  131. # Or just replace. The user said "只调整 'category'"
  132. data['category'] = new_cat
  133. data['category_id'] = id_map.get(new_cat, 7)
  134. fout.write(json.dumps(data, ensure_ascii=False) + '\n')
  135. stats[new_cat] = stats.get(new_cat, 0) + 1
  136. print("Refactor complete.")
  137. for cat, count in stats.items():
  138. print(f"{cat}: {count}")
  139. if __name__ == "__main__":
  140. process()