# verify_integrity.py
  1. import json
  2. import os
  3. from collections import defaultdict
  4. jsonl_path = "prompt_jsonl/prompt_docs_refactored.jsonl"
  5. docs_root = "prompt_docs/prompt_docs_refactored/prompts"
  6. def verify():
  7. print("=== 开始全面完整性检查 ===\n")
  8. # 1. JSONL 数据加载与基础检查
  9. if not os.path.exists(jsonl_path):
  10. print(f"❌ 错误: JSONL 文件不存在: {jsonl_path}")
  11. return
  12. data = []
  13. with open(jsonl_path, 'r', encoding='utf-8') as f:
  14. for line in f:
  15. if line.strip():
  16. try:
  17. data.append(json.loads(line))
  18. except json.JSONDecodeError:
  19. print(f"❌ 错误: 发现无效的 JSON 行: {line[:50]}...")
  20. total_items = len(data)
  21. print(f"✅ JSONL 读取成功,共 {total_items} 条数据。")
  22. # 2. 规则验证
  23. errors = []
  24. categories = defaultdict(list)
  25. expected_categories = {
  26. "内容创作", "商业分析", "学习教育", "提示词工程", "综合杂项", "编程技术", "逻辑工具箱"
  27. }
  28. for item in data:
  29. cat = item.get('category')
  30. row = item.get('row')
  31. col = item.get('col')
  32. title = item.get('title')
  33. content = item.get('content')
  34. # 收集分类数据用于后续分析
  35. categories[cat].append(row)
  36. # 检查 1: 分类合法性
  37. if cat not in expected_categories:
  38. errors.append(f"❌ 未知分类: '{cat}' (Title: {title[:20]}...)")
  39. # 检查 2: 列归位 (col == 1)
  40. if col != 1:
  41. errors.append(f"❌ 列未归位: Category '{cat}', Row {row}, Col {col} (应为 1)")
  42. # 检查 3: 内容完整性 (简单检查)
  43. if not title:
  44. errors.append(f"⚠️ 警告: 标题为空 (Category '{cat}', Row {row})")
  45. if not content or len(content) < 5:
  46. errors.append(f"⚠️ 警告: 内容过短或为空 (Category '{cat}', Row {row}, Content len: {len(content) if content else 0})")
  47. # 检查 4: 行连续性
  48. print("\n--- 分类与行号连续性检查 ---")
  49. for cat, rows in categories.items():
  50. rows.sort()
  51. count = len(rows)
  52. if count == 0:
  53. print(f"⚠️ 分类 '{cat}' 为空")
  54. continue
  55. max_row = rows[-1]
  56. expected_rows = list(range(1, count + 1))
  57. status = "✅ 正常"
  58. if rows != expected_rows:
  59. status = "❌ 异常 (行号不连续或重复)"
  60. errors.append(f"行号错误: {cat} (Expect 1-{count}, Got max {max_row})")
  61. print(f"{cat.ljust(10)}: {count} 条 | Max Row: {max_row} | {status}")
  62. # 3. 文件系统同步检查
  63. print("\n--- 文档文件同步检查 ---")
  64. files_found = 0
  65. if os.path.exists(docs_root):
  66. for root, dirs, files in os.walk(docs_root):
  67. for file in files:
  68. if file.endswith(".md") and not file.startswith("index"):
  69. files_found += 1
  70. else:
  71. print(f"❌ 文档目录不存在: {docs_root}")
  72. print(f"JSONL 条目数: {total_items}")
  73. print(f"Markdown 文件数: {files_found}")
  74. if total_items == files_found:
  75. print("✅ 文件数量一致")
  76. else:
  77. print(f"❌ 文件数量不匹配! (差值: {files_found - total_items})")
  78. errors.append("文件系统数量与 JSONL 不一致")
  79. # 4. 总结
  80. print("\n=== 检查总结 ===")
  81. if not errors:
  82. print("🎉 完美!所有检查通过。数据结构完整、规范。")
  83. else:
  84. print(f"发现 {len(errors)} 个问题,请检视:")
  85. for err in errors:
  86. print(err)
  87. if __name__ == "__main__":
  88. verify()