| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
- import json
- import shutil
- from collections import defaultdict
# Paths used by reindex_rows(); all three live under prompt_jsonl/.
# JSONL file that is re-indexed and finally overwritten with the result.
input_file = "prompt_jsonl/prompt_docs_refactored.jsonl"
# Scratch file the re-indexed records are written to before it is moved
# over input_file.
output_file = "prompt_jsonl/prompt_docs_refactored_reindexed.jsonl"
# Copy of the untouched input, taken before anything is rewritten.
backup_file = "prompt_jsonl/prompt_docs_refactored_before_reindex.jsonl.bak"
def reindex_rows(input_path=None, output_path=None, backup_path=None):
    """Renumber the ``row`` field of every record in a JSONL file, per category.

    The input is first copied to a backup. Records are then grouped by their
    ``category`` value, categories are written in sorted order, and within
    each category the records keep the relative order of their old ``row``
    values while being renumbered 1, 2, 3, ... Finally the re-indexed file
    is moved over the input file.

    Args:
        input_path: JSONL file to re-index in place. Defaults to the
            module-level ``input_file``.
        output_path: Scratch file the result is written to before it is
            moved over ``input_path``. Defaults to the module-level
            ``output_file``.
        backup_path: Where the untouched input is copied first. Defaults to
            the module-level ``backup_file``.

    Raises:
        OSError: If a file cannot be read, written, copied, or moved.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Defaults are resolved at call time, so explicit paths never consult
    # the module-level constants.
    if input_path is None:
        input_path = input_file
    if output_path is None:
        output_path = output_file
    if backup_path is None:
        backup_path = backup_file

    def row_key(item):
        # A missing row already sorted as 0; treat a null or non-numeric row
        # the same way instead of letting the comparison raise TypeError.
        row = item.get('row', 0)
        return row if isinstance(row, (int, float)) else 0

    # 1. Backup
    shutil.copy(input_path, backup_path)
    print(f"Backup created: {backup_path}")

    # 2. Load and group
    items_by_cat = defaultdict(list)
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            # An explicit null category is grouped with records that have no
            # category at all; a None key could not be sorted against strings.
            cat = item.get('category')
            if cat is None:
                cat = 'Uncategorized'
            items_by_cat[cat].append(item)

    # 3. Sort and reindex
    total_items = 0
    with open(output_path, 'w', encoding='utf-8') as f:
        # Sort categories for a consistent file order.
        for cat in sorted(items_by_cat):
            # The sort is stable, so records with equal old rows keep their
            # original file order.
            items = sorted(items_by_cat[cat], key=row_key)

            # Reassign row numbers starting from 1.
            for new_row, item in enumerate(items, start=1):
                item['row'] = new_row
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
            total_items += len(items)

            print(f"Category '{cat}': re-indexed {len(items)} items.")

    print(f"Re-indexed file written: {output_path}")
    print(f"Total items: {total_items}")

    # Overwrite original
    shutil.move(output_path, input_path)
    print(f"Overwritten original file: {input_path}")
# Run the re-index with the default module-level paths when executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    reindex_rows()
|