# reindex_rows.py - renumber the `row` field of JSONL records within each category.
  1. import json
  2. import shutil
  3. from collections import defaultdict
  4. input_file = "prompt_jsonl/prompt_docs_refactored.jsonl"
  5. output_file = "prompt_jsonl/prompt_docs_refactored_reindexed.jsonl"
  6. backup_file = "prompt_jsonl/prompt_docs_refactored_before_reindex.jsonl.bak"
  7. def reindex_rows():
  8. # 1. Backup
  9. shutil.copy(input_file, backup_file)
  10. print(f"Backup created: {backup_file}")
  11. # 2. Load and Group
  12. items_by_cat = defaultdict(list)
  13. with open(input_file, 'r', encoding='utf-8') as f:
  14. for line in f:
  15. if not line.strip(): continue
  16. item = json.loads(line)
  17. cat = item.get('category', 'Uncategorized')
  18. items_by_cat[cat].append(item)
  19. # 3. Sort and Reindex
  20. total_items = 0
  21. with open(output_file, 'w', encoding='utf-8') as f:
  22. # Sort categories for consistent file order
  23. for cat in sorted(items_by_cat.keys()):
  24. items = items_by_cat[cat]
  25. # Sort items by their OLD row to preserve relative order
  26. items.sort(key=lambda x: x.get('row', 0))
  27. # Reassign row numbers starting from 1
  28. for i, item in enumerate(items):
  29. item['row'] = i + 1
  30. f.write(json.dumps(item, ensure_ascii=False) + '\n')
  31. total_items += 1
  32. print(f"Category '{cat}': re-indexed {len(items)} items.")
  33. print(f"Re-indexed file written: {output_file}")
  34. print(f"Total items: {total_items}")
  35. # Overwrite original
  36. shutil.move(output_file, input_file)
  37. print(f"Overwritten original file: {input_file}")
  38. if __name__ == "__main__":
  39. reindex_rows()