# Integrity check for the refactored prompt dataset (JSONL records vs. Markdown docs).
import json
import os
from collections import defaultdict
# Default JSONL dataset checked by verify(); each non-blank line is parsed as JSON.
jsonl_path = "prompt_jsonl/prompt_docs_refactored.jsonl"
# Default directory tree that verify() walks to count Markdown documents.
docs_root = "prompt_docs/prompt_docs_refactored/prompts"
# Categories a record is allowed to declare; anything else is reported.
_EXPECTED_CATEGORIES = frozenset({
    "内容创作", "商业分析", "学习教育", "提示词工程", "综合杂项", "编程技术", "逻辑工具箱",
})


def _load_jsonl(path, errors):
    """Parse *path* as JSON Lines and return the decoded records.

    Blank lines are skipped. A line that is not valid JSON is printed
    immediately and also appended to *errors*, so that the final summary
    cannot claim success for a file containing corrupt lines.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"❌ 错误: 发现无效的 JSON 行: {line[:50]}...")
                errors.append(f"❌ 无效的 JSON 行 (第 {line_no} 行)")
    return records


def _check_records(records, errors):
    """Validate each record and return its row numbers grouped by category.

    Problems are appended to *errors*. Missing or wrongly typed fields are
    reported rather than allowed to raise, because malformed records are
    exactly what this script exists to find.
    """
    rows_by_category = defaultdict(list)
    for index, item in enumerate(records, start=1):
        if not isinstance(item, dict):
            # json.loads may legally return a list, string, number, ...
            errors.append(
                f"❌ 数据不是 JSON 对象 (第 {index} 条: {type(item).__name__})"
            )
            continue

        cat = item.get("category")
        row = item.get("row")
        col = item.get("col")
        title = item.get("title")
        content = item.get("content")

        # Group under a string label so that a missing or unhashable
        # category can still be used as a dict key and printed later.
        label = cat if isinstance(cat, str) else repr(cat)
        rows_by_category[label].append(row)

        # Check 1: the category must be one of the known ones.
        if label not in _EXPECTED_CATEGORIES:
            # Only slice real strings; a missing title used to raise here.
            preview = title[:20] if isinstance(title, str) else title
            errors.append(f"❌ 未知分类: '{cat}' (Title: {preview}...)")

        # Check 2: every record must sit in column 1.
        if col != 1:
            errors.append(
                f"❌ 列未归位: Category '{cat}', Row {row}, Col {col} (应为 1)"
            )

        # Check 3: basic completeness of title and content.
        if not title:
            errors.append(f"⚠️ 警告: 标题为空 (Category '{cat}', Row {row})")
        try:
            content_len = len(content) if content else 0
        except TypeError:
            # Truthy value without a length (e.g. a number): treat as empty.
            content_len = 0
        if content_len < 5:
            errors.append(
                f"⚠️ 警告: 内容过短或为空 (Category '{cat}', Row {row}, "
                f"Content len: {content_len})"
            )
    return rows_by_category


def _check_row_continuity(rows_by_category, errors):
    """Print one status line per category; rows must be exactly 1..count."""
    for label, rows in rows_by_category.items():
        count = len(rows)
        try:
            ordered = sorted(rows)
        except TypeError:
            # Missing or mixed-type row numbers cannot be ordered, so the
            # category is necessarily not a clean 1..count sequence.
            ordered = None
        max_row = ordered[-1] if ordered else None

        status = "✅ 正常"
        if ordered != list(range(1, count + 1)):
            status = "❌ 异常 (行号不连续或重复)"
            errors.append(
                f"行号错误: {label} (Expect 1-{count}, Got max {max_row})"
            )

        print(f"{label.ljust(10)}: {count} 条 | Max Row: {max_row} | {status}")


def _count_markdown_files(root):
    """Count ``*.md`` files under *root* whose name does not start with ``index``."""
    return sum(
        1
        for _dirpath, _dirnames, filenames in os.walk(root)
        for name in filenames
        if name.endswith(".md") and not name.startswith("index")
    )


def verify(jsonl_file=None, docs_dir=None):
    """Run the integrity checks and print a report to stdout.

    The checks cover JSONL parsing, per-record rules (known category,
    ``col == 1``, non-empty title, content of at least 5 characters), row
    numbering per category, and whether the number of records equals the
    number of Markdown files found under the docs directory.

    Args:
        jsonl_file: Path of the JSONL file to check. Defaults to the
            module-level ``jsonl_path``.
        docs_dir: Directory searched recursively for Markdown files.
            Defaults to the module-level ``docs_root``.

    Returns:
        None. All findings are printed. If the JSONL file does not exist,
        the function reports that and returns immediately.
    """
    if jsonl_file is None:
        jsonl_file = jsonl_path
    if docs_dir is None:
        docs_dir = docs_root

    print("=== 开始全面完整性检查 ===\n")

    # 1. Load the JSONL data.
    if not os.path.exists(jsonl_file):
        print(f"❌ 错误: JSONL 文件不存在: {jsonl_file}")
        return

    errors = []
    data = _load_jsonl(jsonl_file, errors)
    total_items = len(data)
    print(f"✅ JSONL 读取成功,共 {total_items} 条数据。")

    # 2. Per-record rules, then row continuity per category.
    rows_by_category = _check_records(data, errors)
    print("\n--- 分类与行号连续性检查 ---")
    _check_row_continuity(rows_by_category, errors)

    # 3. Compare the record count with the Markdown files on disk.
    print("\n--- 文档文件同步检查 ---")
    files_found = 0
    if os.path.exists(docs_dir):
        files_found = _count_markdown_files(docs_dir)
    else:
        print(f"❌ 文档目录不存在: {docs_dir}")
    print(f"JSONL 条目数: {total_items}")
    print(f"Markdown 文件数: {files_found}")

    if total_items == files_found:
        print("✅ 文件数量一致")
    else:
        print(f"❌ 文件数量不匹配! (差值: {files_found - total_items})")
        errors.append("文件系统数量与 JSONL 不一致")

    # 4. Summary.
    print("\n=== 检查总结 ===")
    if not errors:
        print("🎉 完美!所有检查通过。数据结构完整、规范。")
    else:
        print(f"发现 {len(errors)} 个问题,请检视:")
        for err in errors:
            print(err)
# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    verify()