forked from resemble-ai/chatterbox
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfix_tokenizer_corrupted.py
More file actions
107 lines (93 loc) · 3.16 KB
/
fix_tokenizer_corrupted.py
File metadata and controls
107 lines (93 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
"""
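Fix a corrupted tokenizer.json file.

Inspects checkpoints_lora/merged_model/tokenizer.json (relative to the current
working directory). If the file is missing or fails the script's checks, the
old file is moved to tokenizer.json.backup and a fresh copy is downloaded from
the ResembleAI/chatterbox repository on Hugging Face.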
"""
import json
import os
import shutil
import sys
from pathlib import Path

from huggingface_hub import hf_hub_download
# Location of the tokenizer this script inspects and, if necessary, replaces.
# Both paths are relative to the current working directory.
model_dir = Path("checkpoints_lora/merged_model")
tokenizer_path = model_dir / "tokenizer.json"

# Hugging Face repository the replacement tokenizer.json is fetched from.
HF_REPO_ID = "ResembleAI/chatterbox"

# Files smaller than this many bytes only trigger a warning; the JSON parse
# below is what actually decides whether the file is usable.
SUSPICIOUS_SIZE_BYTES = 1000


def print_banner(title):
    """Print *title* framed by two 60-character '=' rules."""
    print("=" * 60)
    print(title)
    print("=" * 60)


def read_json_preview(path, preview_chars=50):
    """Return the first *preview_chars* characters of a valid JSON file.

    The whole file is decoded as UTF-8 and parsed, so a truncated or otherwise
    malformed file is rejected instead of merely being "readable".

    Raises:
        OSError: the file cannot be opened or read.
        ValueError: the content is not valid UTF-8 or not valid JSON
            (``UnicodeDecodeError`` and ``json.JSONDecodeError`` are both
            ``ValueError`` subclasses). An empty file falls in this category.
    """
    text = Path(path).read_text(encoding="utf-8")
    json.loads(text)  # parsed only for validation; the result is discarded
    return text[:preview_chars]


def existing_tokenizer_is_usable(path):
    """Print a diagnosis of *path* and return True if it needs no repair.

    A file is considered usable only when it exists, is non-empty and parses
    as JSON. A small-but-valid file produces a warning, not a failure.
    """
    path = Path(path)
    print(f"Tokenizer path: {path.absolute()}")
    print(f"Exists: {path.exists()}")

    if not path.exists():
        print("❌ File doesn't exist!")
        return False

    size = path.stat().st_size
    print(f"Size: {size} bytes ({size / 1024:.2f} KB)")
    if size == 0:
        # An empty file can be opened and read without error, so it must be
        # rejected explicitly rather than left to the read check.
        print("❌ File is EMPTY (0 bytes)!")
        return False
    if size < SUSPICIOUS_SIZE_BYTES:
        print("⚠️ File is suspiciously small!")
    else:
        print("✅ File size looks OK")

    try:
        preview = read_json_preview(path)
    except OSError as e:
        print(f"❌ Cannot read file: {e}")
        return False
    except ValueError as e:
        print(f"❌ File is not valid JSON: {e}")
        return False

    print(f"Content preview: {preview}...")
    print("✅ File appears to be readable")
    return True


def backup_existing(path):
    """Move *path* to ``<name>.backup`` in the same directory, if it exists.

    Raises:
        OSError: the file could not be moved.
    """
    path = Path(path)
    if not path.exists():
        return
    backup_path = path.with_name(path.name + ".backup")
    print(f"Backing up old file to: {backup_path}")
    shutil.move(str(path), str(backup_path))
    print("✅ Backup created")
    print()


def download_fresh_tokenizer(destination):
    """Download tokenizer.json from HF_REPO_ID and install it at *destination*.

    Returns True only if the installed file exists and parses as JSON. On
    failure a diagnostic (and, for download/copy errors, manual instructions)
    is printed and False is returned.
    """
    destination = Path(destination)
    try:
        print(f"📥 Downloading from {HF_REPO_ID}...")
        downloaded_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            filename="tokenizer.json",
            force_download=True,  # Force fresh download
        )
        print(f"✅ Downloaded to: {downloaded_path}")

        print(f"📋 Copying to: {destination}")
        shutil.copy(downloaded_path, destination)
    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary and both
        # network and filesystem errors should end in the manual instructions.
        print(f"❌ Download failed: {e}")
        print()
        print("Manual fix:")
        print("1. Visit: https://huggingface.co/ResembleAI/chatterbox/blob/main/tokenizer.json")
        print("2. Click 'Download'")
        print(f"3. Save to: {destination}")
        print()
        return False

    if not destination.exists():
        print("❌ Copy failed!")
        return False

    size = destination.stat().st_size
    print("✅ Copied successfully!")
    print(f" Size: {size} bytes ({size / 1024:.2f} KB)")

    # Apply the same full JSON check to the new copy, so a bad download is
    # not announced as a successful fix.
    try:
        preview = read_json_preview(destination)
    except (OSError, ValueError) as e:
        print(f"❌ Still cannot read file: {e}")
        return False

    print(f" Content preview: {preview}...")
    print("✅ File is readable!")
    return True


def main():
    """Diagnose the tokenizer and repair it if needed.

    Returns the process exit status: 0 when the tokenizer is usable (either
    already or after the repair), 1 when the repair failed.
    """
    print_banner("Fix Corrupted tokenizer.json")
    print()

    if existing_tokenizer_is_usable(tokenizer_path):
        print()
        print("File seems OK. The issue might be elsewhere.")
        print("Try checking the Chatterbox source code for tokenizer loading.")
        return 0

    print()
    print_banner("Downloading fresh tokenizer.json from HuggingFace...")
    print()

    try:
        backup_existing(tokenizer_path)
    except OSError as e:
        # Stop here rather than overwrite a file that could not be preserved.
        print(f"❌ Could not back up old file: {e}")
        return 1

    if not download_fresh_tokenizer(tokenizer_path):
        return 1

    print()
    print_banner("🎉 tokenizer.json fixed!")
    print()
    print("Now run: python diagnose_and_fix.py")
    print()
    return 0


if __name__ == "__main__":
    sys.exit(main())