mistral_devstral_compact_loading_marktechpost.py
# -*- coding: utf-8 -*-
"""Mistral_Devstral_Compact_Loading_Marktechpost.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1oCv2jEGO3lz41H8jBqF8eWw4TyiFzURL
"""
!pip install -q kagglehub mistral-common bitsandbytes transformers --no-cache-dir
!pip install -q accelerate torch --no-cache-dir
import shutil
import os
import gc
def cleanup_cache():
    """Clean up unnecessary files to save disk space."""
    cache_dirs = ['/root/.cache', '/tmp/kagglehub']
    for cache_dir in cache_dirs:
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir, ignore_errors=True)
    gc.collect()

cleanup_cache()
print("🧹 Disk space optimized!")
import warnings
warnings.filterwarnings("ignore")
import torch
import kagglehub
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
class LightweightDevstral:
    def __init__(self):
        print("📦 Downloading model (streaming mode)...")

        # Pull the Devstral Small 2505 weights from Kaggle Hub (reuses the local copy if present)
        self.model_path = kagglehub.model_download(
            'mistral-ai/devstral-small-2505/Transformers/devstral-small-2505/1',
            force_download=False
        )

        # 4-bit NF4 quantization with double quantization keeps the memory footprint small
        quantization_config = BitsAndBytesConfig(
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_storage=torch.uint8,
            load_in_4bit=True
        )

        print("⚡ Loading ultra-compressed model...")
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=torch.float16,
            device_map="auto",
            quantization_config=quantization_config,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )

        # Load Mistral's Tekken tokenizer that ships alongside the model files
        self.tokenizer = MistralTokenizer.from_file(f'{self.model_path}/tekken.json')

        cleanup_cache()
        print("✅ Lightweight assistant ready! (~2GB disk usage)")

    def generate(self, prompt, max_tokens=400):
        """Memory-efficient generation."""
        # Build the chat request with mistral-common and tokenize it
        tokenized = self.tokenizer.encode_chat_completion(
            ChatCompletionRequest(messages=[UserMessage(content=prompt)])
        )
        input_ids = torch.tensor([tokenized.tokens])
        if torch.cuda.is_available():
            input_ids = input_ids.to(self.model.device)

        with torch.inference_mode():
            output = self.model.generate(
                input_ids=input_ids,
                max_new_tokens=max_tokens,
                temperature=0.6,
                top_p=0.85,
                do_sample=True,
                # Note: if your mistral_common version does not expose eos_token_id on
                # MistralTokenizer, the same id is available at
                # self.tokenizer.instruct_tokenizer.tokenizer.eos_id
                pad_token_id=self.tokenizer.eos_token_id,
                use_cache=True
            )[0]

        # Free the prompt tensor and cached CUDA memory before decoding
        del input_ids
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Decode only the newly generated tokens (skip the prompt portion)
        return self.tokenizer.decode(output[len(tokenized.tokens):])
print("🚀 Initializing lightweight AI assistant...")
assistant = LightweightDevstral()
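# Optional checks (not in the original tutorial) — uncomment to verify the 4-bit load:
# get_memory_footprint() is a standard transformers model method, and a tiny
# generation confirms the tokenizer/model round trip end to end.
# print(f"Model memory footprint: {assistant.model.get_memory_footprint() / 1e9:.2f} GB")
# print(assistant.generate("Write a one-line Python hello world.", max_tokens=40))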
def run_demo(title, prompt, emoji="🎯"):
    """Run a single demo with cleanup."""
    print(f"\n{emoji} {title}")
    print("-" * 50)

    result = assistant.generate(prompt, max_tokens=350)
    print(result)

    # Release leftover memory between demos
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
run_demo(
    "Quick Prime Finder",
    "Write a fast prime checker function `is_prime(n)` with explanation and test cases.",
    "🔢"
)

run_demo(
    "Debug This Code",
    """Fix this buggy function and explain the issues:

```python
def avg_positive(numbers):
    total = sum([n for n in numbers if n > 0])
    return total / len([n for n in numbers if n > 0])
```""",
    "🐛"
)

run_demo(
    "Text Tool Creator",
    "Create a simple `TextAnalyzer` class with word count, char count, and palindrome check methods.",
    "🛠️"
)
def quick_coding():
    """Lightweight interactive session."""
    print("\n🎮 QUICK CODING MODE")
    print("=" * 40)
    print("Enter short coding prompts (type 'exit' to quit)")

    session_count = 0
    max_sessions = 5

    while session_count < max_sessions:
        prompt = input(f"\n[{session_count+1}/{max_sessions}] Your prompt: ")

        if prompt.lower() in ['exit', 'quit', '']:
            break

        try:
            result = assistant.generate(prompt, max_tokens=300)
            print("💡 Solution:")
            # Truncate long answers to keep the notebook output compact
            print(result[:500])

            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"❌ Error: {str(e)[:100]}...")

        session_count += 1

    print("\n✅ Session complete! Memory cleaned.")
def check_disk_usage():
    """Monitor disk usage."""
    import subprocess
    try:
        # Parse `df -h /` to report used vs. available space on the root filesystem
        result = subprocess.run(['df', '-h', '/'], capture_output=True, text=True)
        lines = result.stdout.split('\n')
        if len(lines) > 1:
            usage_line = lines[1].split()
            used = usage_line[2]
            available = usage_line[3]
            print(f"💾 Disk: {used} used, {available} available")
    except Exception:
        print("💾 Disk usage check unavailable")
print("\n🎉 Tutorial Complete!")
cleanup_cache()
check_disk_usage()
print("\n💡 Space-Saving Tips:")
print("• Model uses ~2GB vs original ~7GB+")
print("• Automatic cache cleanup after each use")
print("• Limited token generation to save memory")
print("• Use 'del assistant' when done to free ~2GB")
print("• Restart runtime if memory issues persist")