ka-cn
/
vibe-coding-cn


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
							import json
import os
from collections import defaultdict

# Path of the JSONL file that verify() loads and validates. It is a relative
# path, so it is resolved against the current working directory.
jsonl_path = "prompt_jsonl/prompt_docs_refactored.jsonl"
# Root directory that verify() walks recursively to count Markdown files.
docs_root = "prompt_docs/prompt_docs_refactored/prompts"

# Categories a record may declare; anything else is reported as unknown.
_EXPECTED_CATEGORIES = frozenset({
    "内容创作", "商业分析", "学习教育", "提示词工程", "综合杂项", "编程技术", "逻辑工具箱",
})


def _load_jsonl(path, errors):
    """Parse every non-blank line of *path* as JSON and return the values.

    Lines that fail to parse are printed immediately and also appended to
    *errors*, so the final summary cannot report success for a corrupt file.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"❌ 错误: 发现无效的 JSON 行: {line[:50]}...")
                errors.append(f"❌ 无效的 JSON 行 (第 {line_no} 行): {line[:50].rstrip()}...")
    return records


def _check_record(item, position, errors):
    """Validate one record and return ``(category_label, row)``.

    Returns ``None`` (after recording an error) when *item* is not a JSON
    object. All field checks tolerate missing or oddly typed values so that a
    malformed record is reported instead of aborting the whole run.
    """
    if not isinstance(item, dict):
        errors.append(
            f"❌ 记录不是 JSON 对象 (第 {position} 条, 类型: {type(item).__name__})"
        )
        return None

    cat = item.get('category')
    row = item.get('row')
    col = item.get('col')
    title = item.get('title')
    content = item.get('content')

    # A string label keeps grouping, set membership and ljust() safe even when
    # the category is missing (None) or an unhashable JSON value.
    label = cat if isinstance(cat, str) else repr(cat)

    # Check 1: the category must be one of the known ones.
    if label not in _EXPECTED_CATEGORIES:
        preview = "" if title is None else str(title)[:20]
        errors.append(f"❌ 未知分类: '{label}' (Title: {preview}...)")

    # Check 2: every record is expected to sit in column 1.
    if col != 1:
        errors.append(f"❌ 列未归位: Category '{label}', Row {row}, Col {col} (应为 1)")

    # Check 3: basic content completeness.
    if not title:
        errors.append(f"⚠️ 警告: 标题为空 (Category '{label}', Row {row})")
    try:
        content_len = len(content) if content else 0
    except TypeError:
        # Content without a length (e.g. a number) is treated as empty.
        content_len = 0
    if content_len < 5:
        errors.append(
            f"⚠️ 警告: 内容过短或为空 (Category '{label}', Row {row}, Content len: {content_len})"
        )

    return label, row


def _check_row_continuity(categories, errors):
    """Print one status line per category and record non-contiguous rows.

    A category passes when its sorted rows equal ``1..count`` exactly.
    """
    for label, rows in categories.items():
        count = len(rows)
        try:
            ordered = sorted(rows)
        except TypeError:
            # Missing or mixed-type rows cannot be ordered; that is itself a
            # continuity failure, so report it rather than crash.
            ordered = None
            numeric = [r for r in rows if isinstance(r, (int, float))]
            max_row = max(numeric, default=None)
        else:
            max_row = ordered[-1]

        status = "✅ 正常"
        if ordered != list(range(1, count + 1)):
            status = "❌ 异常 (行号不连续或重复)"
            errors.append(f"行号错误: {label} (Expect 1-{count}, Got max {max_row})")

        print(f"{label.ljust(10)}: {count} 条 | Max Row: {max_row} | {status}")


def _count_markdown_files(root):
    """Count ``.md`` files under *root* whose names do not start with ``index``."""
    return sum(
        1
        for _dirpath, _dirnames, filenames in os.walk(root)
        for name in filenames
        if name.endswith(".md") and not name.startswith("index")
    )


def verify():
    """Run the integrity checks and print a report to stdout.

    Steps:
      1. Load ``jsonl_path`` line by line (returns early if the file is missing).
      2. Validate each record's category, column, title and content.
      3. Check that row numbers within each category are exactly ``1..count``.
      4. Compare the record count with the number of Markdown files found
         under ``docs_root`` (files starting with ``index`` are excluded).
      5. Print a summary listing every problem that was recorded.

    Malformed input (invalid JSON lines, non-object records, missing fields)
    is reported in the summary instead of raising. Returns ``None``.
    """
    print("=== 开始全面完整性检查 ===\n")

    # 1. Load the JSONL data.
    if not os.path.exists(jsonl_path):
        print(f"❌ 错误: JSONL 文件不存在: {jsonl_path}")
        return

    errors = []
    data = _load_jsonl(jsonl_path, errors)
    total_items = len(data)
    print(f"✅ JSONL 读取成功，共 {total_items} 条数据。")

    # 2. Per-record rule validation, grouping rows by category on the way.
    categories = defaultdict(list)
    for position, item in enumerate(data, start=1):
        checked = _check_record(item, position, errors)
        if checked is not None:
            label, row = checked
            categories[label].append(row)

    print("\n--- 分类与行号连续性检查 ---")
    _check_row_continuity(categories, errors)

    # 3. Compare against the Markdown files on disk.
    print("\n--- 文档文件同步检查 ---")
    if os.path.exists(docs_root):
        files_found = _count_markdown_files(docs_root)
    else:
        files_found = 0
        print(f"❌ 文档目录不存在: {docs_root}")
        # Record it too: otherwise an empty JSONL plus a missing directory
        # would be reported as a clean run.
        errors.append(f"❌ 文档目录不存在: {docs_root}")

    print(f"JSONL 条目数: {total_items}")
    print(f"Markdown 文件数: {files_found}")

    if total_items == files_found:
        print("✅ 文件数量一致")
    else:
        print(f"❌ 文件数量不匹配! (差值: {files_found - total_items})")
        errors.append("文件系统数量与 JSONL 不一致")

    # 4. Summary.
    print("\n=== 检查总结 ===")
    if not errors:
        print("🎉 完美！所有检查通过。数据结构完整、规范。")
    else:
        print(f"发现 {len(errors)} 个问题，请检视：")
        for err in errors:
            print(err)

# Run the full check only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    verify()