diff --git a/.gitignore b/.gitignore
index f3c28571bdf..de63e6c2538 100644
--- a/.gitignore
+++ b/.gitignore
@@ -150,3 +150,5 @@ venv/*
 # resource optimization
 scripts/resource/output
 *.pem
+scripts/nn/examples/mnist_data/mnist_test.csv
+scripts/nn/examples/mnist_data/mnist_train.csv
\ No newline at end of file
diff --git a/scripts/data_prep/create_binary_chunks.py b/scripts/data_prep/create_binary_chunks.py
new file mode 100644
index 00000000000..6a2d273410a
--- /dev/null
+++ b/scripts/data_prep/create_binary_chunks.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+"""
+Create pre-split binary chunks from ImageNet data for SystemDS LARS training.
+
+This script reads existing CSV or binary data and splits it into manageable chunks
+for memory-efficient training with large datasets.
+"""
+
+import os
+import sys
+import numpy as np
+import pandas as pd
+from pathlib import Path
+
+def create_binary_chunks(data_dir="imagenet_data", chunk_size=10000):
+    """
+    Create binary chunk files from existing ImageNet data.
+    
+    Args:
+        data_dir: Directory containing the ImageNet data
+        chunk_size: Number of samples per chunk
+    """
+    data_path = Path(data_dir)
+    
+    print(f"Creating binary chunks from data in: {data_path}")
+    print(f"Chunk size: {chunk_size}")
+    
+    # Check what data we have available
+    csv_train = data_path / "imagenet_train.csv"
+    csv_val = data_path / "imagenet_val.csv"
+    
+    if csv_train.exists() and csv_val.exists():
+        print("Found CSV files, converting to binary chunks...")
+        create_chunks_from_csv(data_path, chunk_size)
+    else:
+        print("CSV files not found, creating dummy chunks for testing...")
+        create_dummy_chunks(data_path, chunk_size)
+
+def create_chunks_from_csv(data_path, chunk_size):
+    """Create chunks from CSV files."""
+    
+    # Read training data
+    print("Reading training CSV...")
+    train_df = pd.read_csv(data_path / "imagenet_train.csv", header=None)
+    print(f"Training data shape: {train_df.shape}")
+    
+    # Read validation data  
+    print("Reading validation CSV...")
+    val_df = pd.read_csv(data_path / "imagenet_val.csv", header=None)
+    print(f"Validation data shape: {val_df.shape}")
+    
+    # Split training data into chunks
+    train_labels = train_df.iloc[:, 0].values
+    train_data = train_df.iloc[:, 1:].values
+    
+    # Convert to float and normalize
+    train_data = train_data.astype(np.float64) / 255.0
+    
+    num_train_chunks = (len(train_data) + chunk_size - 1) // chunk_size
+    print(f"Creating {num_train_chunks} training chunks...")
+    
+    for i in range(num_train_chunks):
+        start_idx = i * chunk_size
+        end_idx = min((i + 1) * chunk_size, len(train_data))
+        
+        chunk_data = train_data[start_idx:end_idx]
+        chunk_labels = train_labels[start_idx:end_idx]
+        
+        # Convert labels to one-hot (assuming 10 classes for now)
+        num_classes = 10
+        chunk_labels_onehot = np.eye(num_classes)[chunk_labels]
+        
+        # Save as binary files that SystemDS can read
+        chunk_num = f"{i+1:03d}"
+        
+        # Save data chunk as CSV
+        data_file = data_path / f"train_chunk_{chunk_num}.csv"
+        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)
+        
+        # Save labels chunk as CSV
+        labels_file = data_path / f"train_labels_{chunk_num}.csv"
+        pd.DataFrame(chunk_labels_onehot).to_csv(labels_file, header=False, index=False)
+        
+        print(f"  Chunk {chunk_num}: {chunk_data.shape[0]} samples")
+    
+    # Process validation data (typically smaller, so fewer chunks)
+    val_labels = val_df.iloc[:, 0].values
+    val_data = val_df.iloc[:, 1:].values
+    val_data = val_data.astype(np.float64) / 255.0
+    
+    val_chunk_size = min(chunk_size, len(val_data))
+    num_val_chunks = (len(val_data) + val_chunk_size - 1) // val_chunk_size
+    print(f"Creating {num_val_chunks} validation chunks...")
+    
+    for i in range(num_val_chunks):
+        start_idx = i * val_chunk_size
+        end_idx = min((i + 1) * val_chunk_size, len(val_data))
+        
+        chunk_data = val_data[start_idx:end_idx]
+        chunk_labels = val_labels[start_idx:end_idx]
+        
+        # Convert labels to one-hot
+        chunk_labels_onehot = np.eye(num_classes)[chunk_labels]
+        
+        chunk_num = f"{i+1:03d}"
+        
+        # Save data chunk as CSV
+        data_file = data_path / f"val_chunk_{chunk_num}.csv"
+        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)
+        
+        # Save labels chunk as CSV
+        labels_file = data_path / f"val_labels_{chunk_num}.csv"
+        pd.DataFrame(chunk_labels_onehot).to_csv(labels_file, header=False, index=False)
+        
+        print(f"  Val chunk {chunk_num}: {chunk_data.shape[0]} samples")
+
+def create_dummy_chunks(data_path, chunk_size):
+    """Create dummy chunks for testing when real data isn't available."""
+    print("Creating dummy data chunks for testing...")
+    
+    # ImageNet-like dimensions
+    img_height, img_width, channels = 224, 224, 3
+    num_features = img_height * img_width * channels
+    num_classes = 10
+    
+    # Create training chunks
+    num_train_samples = chunk_size * 2  # Create 2 chunks for demo
+    
+    print(f"Generating {num_train_samples} dummy training samples...")
+    train_data = np.random.rand(num_train_samples, num_features).astype(np.float64)
+    train_labels = np.random.randint(0, num_classes, num_train_samples)
+    train_labels_onehot = np.eye(num_classes)[train_labels]
+    
+    # Split into chunks
+    for i in range(2):  # 2 training chunks
+        start_idx = i * chunk_size
+        end_idx = (i + 1) * chunk_size
+        
+        chunk_data = train_data[start_idx:end_idx]
+        chunk_labels_onehot_chunk = train_labels_onehot[start_idx:end_idx]
+        
+        chunk_num = f"{i+1:03d}"
+        
+        # Save chunks as CSV
+        data_file = data_path / f"train_chunk_{chunk_num}.csv"
+        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)
+        
+        labels_file = data_path / f"train_labels_{chunk_num}.csv"
+        pd.DataFrame(chunk_labels_onehot_chunk).to_csv(labels_file, header=False, index=False)
+        
+        print(f"  Created train chunk {chunk_num}: {chunk_data.shape}")
+    
+    # Create validation chunk
+    num_val_samples = min(chunk_size, 5000)  # Smaller validation set
+    print(f"Generating {num_val_samples} dummy validation samples...")
+    
+    val_data = np.random.rand(num_val_samples, num_features).astype(np.float64)
+    val_labels = np.random.randint(0, num_classes, num_val_samples)
+    val_labels_onehot = np.eye(num_classes)[val_labels]
+    
+    # Save validation chunk as CSV
+    data_file = data_path / "val_chunk_001.csv"
+    pd.DataFrame(val_data).to_csv(data_file, header=False, index=False)
+    
+    labels_file = data_path / "val_labels_001.csv"
+    pd.DataFrame(val_labels_onehot).to_csv(labels_file, header=False, index=False)
+    
+    print(f"  Created val chunk 001: {val_data.shape}")
+
+def main():
+    """Main execution."""
+    data_dir = "imagenet_data"
+    chunk_size = 10000
+    
+    if len(sys.argv) > 1:
+        data_dir = sys.argv[1]
+    if len(sys.argv) > 2:
+        chunk_size = int(sys.argv[2])
+    
+    # Create data directory if it doesn't exist
+    os.makedirs(data_dir, exist_ok=True)
+    
+    create_binary_chunks(data_dir, chunk_size)
+    
+    print("\n✅ Binary chunk creation completed!")
+    print(f"Chunks saved in: {data_dir}/")
+    print("Files created:")
+    
+    data_path = Path(data_dir)
+    for file in sorted(data_path.glob("*_chunk_*.bin")):
+        size_mb = file.stat().st_size / (1024 * 1024)
+        print(f"  {file.name} ({size_mb:.1f} MB)")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/data_prep/prepare_raw_imagenet.py b/scripts/data_prep/prepare_raw_imagenet.py
new file mode 100644
index 00000000000..63b51374876
--- /dev/null
+++ b/scripts/data_prep/prepare_raw_imagenet.py
@@ -0,0 +1,433 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+"""
+Raw ImageNet Data Preprocessing Pipeline
+=========================================
+
+This script processes raw ImageNet JPG images with metadata CSV files and prepares them
+for SystemDS AlexNet training. It handles:
+
+1. Reading metadata CSV files with file_path,label format
+2. Loading JPG images (typically 256x256) and resizing to specified target size (default: 224x224)
+3. Converting to normalized feature vectors
+4. Creating one-hot encoded labels
+5. Saving in SystemDS-compatible CSV format with resolution-based naming
+
+Usage:
+    python prepare_raw_imagenet.py --input_dir "C:/Users/romer/Desktop/Big_Data/imagenet/256x256" --output_dir "imagenet_data"
+    python prepare_raw_imagenet.py --input_dir "C:/Users/romer/Desktop/Big_Data/imagenet/256x256" --target_size 299
+    python prepare_raw_imagenet.py --input_dir "path/to/imagenet" --dry_run
+
+Output files will be saved as:
+    imagenet_data/<target_size>x<target_size>/imagenet_<target_size>x<target_size>_train.csv
+    imagenet_data/<target_size>x<target_size>/imagenet_<target_size>x<target_size>_train_labels.csv
+    imagenet_data/<target_size>x<target_size>/imagenet_<target_size>x<target_size>_test.csv
+    imagenet_data/<target_size>x<target_size>/imagenet_<target_size>x<target_size>_test_labels.csv
+"""
+
+import os
+import sys
+import argparse
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import time
+import gc
+from PIL import Image
+import csv
+
+class RawImageNetProcessor:
+    """Raw ImageNet JPG image processor for SystemDS."""
+    
+    def __init__(self, input_dir: str, output_dir: str = "imagenet_data/224x224", target_size: int = 224):
+        self.input_dir = Path(input_dir)
+        self.target_size = target_size
+        
+        # Create output directory based on resolution
+        base_output = Path(output_dir).parent if "x" in Path(output_dir).name else Path(output_dir)
+        self.output_dir = base_output / f"{target_size}x{target_size}"
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        
+        # Target specifications for SystemDS AlexNet
+        self.channels = 3
+        self.features = self.target_size * self.target_size * self.channels
+        self.num_classes = 1000
+        
+        print(f"Raw ImageNet Processor initialized")
+        print(f"Input directory: {self.input_dir}")
+        print(f"Output directory: {self.output_dir}")
+        print(f"Target format: {self.target_size}x{self.target_size}x{self.channels} images ({self.features} features), {self.num_classes} classes")
+        print(f"Note: Source images will be resized from their original size to {self.target_size}x{self.target_size}")
+    
+    def inspect_raw_data(self) -> Dict:
+        """Inspect the raw data structure and return metadata."""
+        print("\n=== Raw Data Inspection ===")
+        
+        # Look for metadata files
+        train_metadata_file = self.input_dir / "imagenet_train_metadata.csv"
+        test_metadata_file = self.input_dir / "imagenet_test_metadata.csv"
+        
+        if not train_metadata_file.exists():
+            raise FileNotFoundError(f"Training metadata file not found: {train_metadata_file}")
+        if not test_metadata_file.exists():
+            raise FileNotFoundError(f"Test metadata file not found: {test_metadata_file}")
+        
+        # Read metadata
+        print(f"Reading training metadata from: {train_metadata_file}")
+        train_df = pd.read_csv(train_metadata_file)
+        print(f"Reading test metadata from: {test_metadata_file}")
+        test_df = pd.read_csv(test_metadata_file)
+        
+        # Inspect structure
+        print(f"\nTraining metadata shape: {train_df.shape}")
+        print(f"Training columns: {list(train_df.columns)}")
+        print(f"Training label range: {train_df['label'].min()} to {train_df['label'].max()}")
+        print(f"Training unique labels: {train_df['label'].nunique()}")
+        
+        print(f"\nTest metadata shape: {test_df.shape}")
+        print(f"Test columns: {list(test_df.columns)}")
+        print(f"Test label range: {test_df['label'].min()} to {test_df['label'].max()}")
+        print(f"Test unique labels: {test_df['label'].nunique()}")
+        
+        # Check if images actually exist
+        print(f"\nChecking image availability...")
+        train_available = self._count_available_images(train_df)
+        test_available = self._count_available_images(test_df)
+        
+        # Sample a few images to check dimensions
+        sample_dims = self._check_sample_image_dimensions(train_df.head(5))
+        
+        metadata = {
+            'train_total': len(train_df),
+            'train_available': train_available,
+            'test_total': len(test_df),
+            'test_available': test_available,
+            'train_labels': sorted(train_df['label'].unique()),
+            'test_labels': sorted(test_df['label'].unique()),
+            'sample_dimensions': sample_dims
+        }
+        
+        print(f"\n=== Summary ===")
+        print(f"Training: {train_available}/{len(train_df)} images available")
+        print(f"Test: {test_available}/{len(test_df)} images available")
+        print(f"Sample image dimensions: {sample_dims}")
+        
+        return metadata
+    
+    def _count_available_images(self, df: pd.DataFrame) -> int:
+        """Count how many images actually exist on disk."""
+        available = 0
+        total = len(df)
+        
+        print(f"  Checking {total} image files...")
+        for i, row in df.iterrows():
+            image_path = self.input_dir / row['file_path']
+            if image_path.exists():
+                available += 1
+            
+            # Progress update every 1000 images
+            if (i + 1) % 1000 == 0:
+                print(f"    Checked {i + 1}/{total} images, {available} available")
+        
+        print(f"  Final: {available}/{total} images available")
+        return available
+    
+    def _check_sample_image_dimensions(self, sample_df: pd.DataFrame) -> List[Tuple]:
+        """Check dimensions of a few sample images."""
+        dimensions = []
+        
+        for _, row in sample_df.iterrows():
+            image_path = self.input_dir / row['file_path']
+            if image_path.exists():
+                try:
+                    with Image.open(image_path) as img:
+                        dimensions.append((img.width, img.height, len(img.getbands())))
+                except Exception as e:
+                    print(f"    Error reading {image_path}: {e}")
+            
+            if len(dimensions) >= 3:  # Just check a few
+                break
+        
+        return dimensions
+    
+    def process_dataset(self, max_samples: Optional[int] = None, dry_run: bool = False, skip_check: bool = False, split_from_train: bool = False) -> Dict:
+        """Process the complete dataset."""
+        print(f"\n=== Processing Dataset (dry_run={dry_run}) ===")
+        
+        # Read metadata
+        train_df = pd.read_csv(self.input_dir / "imagenet_train_metadata.csv")
+        
+        if split_from_train:
+            print("Creating validation set from training data...")
+            # Skip test metadata entirely
+            test_df = None
+        else:
+            test_df = pd.read_csv(self.input_dir / "imagenet_test_metadata.csv")
+        
+        # Filter to only available images (unless skipping)
+        if not skip_check:
+            print("Filtering to available images...")
+            train_df = self._filter_available_images(train_df)
+            if test_df is not None:
+                test_df = self._filter_available_images(test_df)
+        else:
+            print("Skipping image availability check...")
+        
+        # Handle data splitting
+        if split_from_train:
+            # Use training data for both train and validation
+            if max_samples:
+                # Take first max_samples for training
+                train_samples = max_samples
+                # Use 20% of training samples for validation (or 400, whichever is smaller)
+                val_samples = min(400, int(train_samples * 0.2), len(train_df) - train_samples)
+                
+                print(f"Splitting from training data:")
+                print(f"  - Training: first {train_samples} samples")
+                print(f"  - Validation: next {val_samples} samples")
+                
+                val_df = train_df.iloc[train_samples:train_samples + val_samples].copy()
+                train_df = train_df.head(train_samples)
+            else:
+                # Default split: 90% train, 10% validation
+                split_idx = int(len(train_df) * 0.9)
+                val_df = train_df.iloc[split_idx:].copy()
+                train_df = train_df.iloc[:split_idx].copy()
+                print(f"Splitting training data: {len(train_df)} train, {len(val_df)} validation")
+            
+            test_df = val_df  # Use validation split as "test" for consistency
+        else:
+            # Limit samples if requested
+            if max_samples:
+                print(f"Limiting to {max_samples} samples per split...")
+                train_df = train_df.head(max_samples)
+                if test_df is not None:
+                    test_df = test_df.head(max_samples)
+        
+        print(f"Processing {len(train_df)} training samples...")
+        print(f"Processing {len(test_df)} test samples...")
+        
+        if dry_run:
+            print("DRY RUN: Would process the above samples")
+            return {'dry_run': True, 'train_samples': len(train_df), 'test_samples': len(test_df)}
+        
+        # Process training data
+        train_results = self._process_split(train_df, "train")
+        
+        # Process test data (as validation)
+        test_results = self._process_split(test_df, "val")
+        
+        return {
+            'train': train_results,
+            'validation': test_results
+        }
+    
+    def _filter_available_images(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Filter dataframe to only include images that exist on disk."""
+        available_mask = []
+        
+        for _, row in df.iterrows():
+            image_path = self.input_dir / row['file_path']
+            available_mask.append(image_path.exists())
+        
+        filtered_df = df[available_mask].copy()
+        print(f"  Filtered {len(df)} -> {len(filtered_df)} available images")
+        return filtered_df
+    
+    def _process_split(self, df: pd.DataFrame, split_name: str) -> Dict:
+        """Process a data split (train or val)."""
+        print(f"\nProcessing {split_name} split...")
+        
+        # Prepare output files with resolution in name
+        # For val split, use 'test' in filename for consistency
+        file_split_name = 'test' if split_name == 'val' else split_name
+        features_file = self.output_dir / f"imagenet_{self.target_size}x{self.target_size}_{file_split_name}.csv"
+        labels_file = self.output_dir / f"imagenet_{self.target_size}x{self.target_size}_{file_split_name}_labels.csv"
+        
+        # Process images in batches to manage memory
+        batch_size = 1000
+        total_samples = len(df)
+        num_batches = (total_samples + batch_size - 1) // batch_size
+        
+        print(f"Processing {total_samples} samples in {num_batches} batches of {batch_size}")
+        
+        # Initialize CSV files
+        features_written = 0
+        labels_written = 0
+        
+        with open(features_file, 'w', newline='') as f_feat, \
+             open(labels_file, 'w', newline='') as f_label:
+            
+            feat_writer = csv.writer(f_feat)
+            label_writer = csv.writer(f_label)
+            
+            for batch_idx in range(num_batches):
+                start_idx = batch_idx * batch_size
+                end_idx = min(start_idx + batch_size, total_samples)
+                batch_df = df.iloc[start_idx:end_idx]
+                
+                print(f"  Batch {batch_idx + 1}/{num_batches}: Processing samples {start_idx}-{end_idx-1}")
+                
+                # Process batch
+                batch_features, batch_labels = self._process_image_batch(batch_df)
+                
+                # Write to CSV
+                for features_row in batch_features:
+                    feat_writer.writerow(features_row)
+                    features_written += 1
+                
+                for labels_row in batch_labels:
+                    label_writer.writerow(labels_row)
+                    labels_written += 1
+                
+                # Memory cleanup
+                del batch_features, batch_labels
+                gc.collect()
+                
+                print(f"    Wrote {len(batch_df)} samples to CSV")
+        
+        result = {
+            'samples_processed': features_written,
+            'features_file': str(features_file),
+            'labels_file': str(labels_file),
+            'features_shape': (features_written, self.features),
+            'labels_shape': (labels_written, self.num_classes)
+        }
+        
+        print(f"  {split_name} processing complete: {features_written} samples")
+        return result
+    
+    def _process_image_batch(self, batch_df: pd.DataFrame) -> Tuple[List, List]:
+        """Process a batch of images."""
+        batch_features = []
+        batch_labels = []
+        
+        for _, row in batch_df.iterrows():
+            try:
+                # Load and process image
+                image_path = self.input_dir / row['file_path']
+                features = self._process_single_image(image_path)
+                
+                # Process label
+                label = int(row['label'])
+                # Convert to 0-indexed if needed (ImageNet labels are usually 1-indexed)
+                if label > 0:
+                    label = label - 1
+                
+                # Create one-hot encoding
+                one_hot = [0.0] * self.num_classes
+                if 0 <= label < self.num_classes:
+                    one_hot[label] = 1.0
+                
+                batch_features.append(features)
+                batch_labels.append(one_hot)
+                
+            except Exception as e:
+                print(f"    Error processing {row['file_path']}: {e}")
+                # Skip this sample
+                continue
+        
+        return batch_features, batch_labels
+    
+    def _process_single_image(self, image_path: Path) -> List[float]:
+        """Process a single image: load, resize, normalize, flatten."""
+        # Fix path if it points to wrong directory
+        image_path_str = str(image_path)
+        if "224x224" in image_path_str and "256x256" in str(self.input_dir):
+            # Replace 224x224 with 256x256 in the path
+            image_path_str = image_path_str.replace("224x224", "256x256")
+            image_path = Path(image_path_str)
+        
+        # Load image
+        with Image.open(image_path) as img:
+            # Convert to RGB if needed
+            if img.mode != 'RGB':
+                img = img.convert('RGB')
+            
+            # Resize to target size (e.g., from 256x256 to 224x224)
+            if img.size != (self.target_size, self.target_size):
+                img = img.resize((self.target_size, self.target_size), Image.LANCZOS)
+            
+            # Convert to numpy array and normalize to [0,1]
+            img_array = np.array(img, dtype=np.float32) / 255.0
+            
+            # Flatten to feature vector
+            features = img_array.flatten().tolist()
+            
+            return features
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Process raw ImageNet JPG data for SystemDS')
+    parser.add_argument('--input_dir', type=str, required=True,
+                        help='Directory containing raw ImageNet data')
+    parser.add_argument('--output_dir', type=str, default='imagenet_data',
+                        help='Base output directory for processed data (resolution subdirs will be created)')
+    parser.add_argument('--target_size', type=int, default=224,
+                        help='Target image size (default: 224 for 224x224)')
+    parser.add_argument('--max_samples', type=int, default=None,
+                        help='Maximum number of samples per split (for testing)')
+    parser.add_argument('--dry_run', action='store_true',
+                        help='Just inspect data without processing')
+    parser.add_argument('--skip_check', action='store_true',
+                        help='Skip image availability checking')
+    parser.add_argument('--split_from_train', action='store_true',
+                        help='Create validation set from training data instead of using test set')
+    
+    args = parser.parse_args()
+    
+    # Initialize processor
+    processor = RawImageNetProcessor(args.input_dir, args.output_dir, args.target_size)
+    
+    # Inspect data first (unless skipping check)
+    if not args.skip_check:
+        try:
+            metadata = processor.inspect_raw_data()
+        except Exception as e:
+            print(f"Error during inspection: {e}")
+            return 1
+    else:
+        print("Skipping data inspection...")
+    
+    # Process if not dry run
+    if not args.dry_run:
+        try:
+            results = processor.process_dataset(
+                max_samples=args.max_samples, 
+                dry_run=False,
+                skip_check=args.skip_check,
+                split_from_train=args.split_from_train
+            )
+            print(f"\n=== Processing Complete ===")
+            print(f"Results: {results}")
+        except Exception as e:
+            print(f"Error during processing: {e}")
+            return 1
+    else:
+        processor.process_dataset(dry_run=True)
+    
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main()) 
\ No newline at end of file
diff --git a/scripts/data_prep/run_raw_imagenet_preprocessing.py b/scripts/data_prep/run_raw_imagenet_preprocessing.py
new file mode 100644
index 00000000000..085a89db866
--- /dev/null
+++ b/scripts/data_prep/run_raw_imagenet_preprocessing.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+"""
+Simple runner for raw ImageNet preprocessing
+"""
+
+import sys
+import subprocess
+from pathlib import Path
+
+def main():
+    # Default paths
+    input_dir = r"C:\Users\romer\Desktop\Big_Data\imagenet\256x256"  # Source images are 256x256
+    output_dir = "imagenet_data"
+    
+    print("Raw ImageNet Preprocessing Runner")
+    print("=" * 50)
+    print(f"Input directory: {input_dir} (256x256 source images)")
+    print(f"Output directory: {output_dir}")
+    print(f"Default target size: 224x224 (for AlexNet)")
+    print()
+    
+    # Ask user what they want to do
+    print("Choose an option:")
+    print("1. Inspect data only (dry run)")
+    print("2. Process small sample (2000 train + 400 val from training set)")
+    print("3. Process full dataset (256x256 -> 224x224)")
+    print("4. Process full dataset (256x256 -> custom size)")
+    print("5. Custom processing")
+    print()
+    
+    choice = input("Enter choice (1-5): ").strip()
+    
+    if choice == "1":
+        # Dry run
+        cmd = [
+            sys.executable, "scripts/data_prep/prepare_raw_imagenet.py",
+            "--input_dir", input_dir,
+            "--output_dir", output_dir,
+            "--dry_run"
+        ]
+    elif choice == "2":
+        # Small sample with train/val split from training data
+        print("Processing 2000 training + 400 validation samples from training set...")
+        cmd = [
+            sys.executable, "scripts/data_prep/prepare_raw_imagenet.py",
+            "--input_dir", input_dir,
+            "--output_dir", output_dir,
+            "--max_samples", "2000",
+            "--skip_check",
+            "--split_from_train"
+        ]
+    elif choice == "3":
+        # Full dataset 256x256 -> 224x224
+        print("Processing 256x256 images -> 224x224 for AlexNet...")
+        cmd = [
+            sys.executable, "scripts/data_prep/prepare_raw_imagenet.py",
+            "--input_dir", input_dir,
+            "--output_dir", output_dir,
+            "--target_size", "224",
+            "--skip_check"
+        ]
+    elif choice == "4":
+        # Full dataset custom resolution
+        target_size = input("Enter target size (e.g., 256, 299): ").strip()
+        if not target_size.isdigit():
+            print("Invalid target size!")
+            return 1
+        
+        print(f"Processing 256x256 images -> {target_size}x{target_size}...")
+        
+        cmd = [
+            sys.executable, "scripts/data_prep/prepare_raw_imagenet.py",
+            "--input_dir", input_dir,
+            "--output_dir", output_dir,
+            "--target_size", target_size,
+            "--skip_check"
+        ]
+    elif choice == "5":
+        # Custom
+        custom_input = input(f"Input directory [{input_dir}]: ").strip()
+        if custom_input:
+            input_dir = custom_input
+        
+        custom_output = input(f"Output directory [{output_dir}]: ").strip()
+        if custom_output:
+            output_dir = custom_output
+        
+        target_size = input("Target size [224]: ").strip() or "224"
+        max_samples = input("Max samples per split (leave empty for all): ").strip()
+        
+        cmd = [
+            sys.executable, "scripts/data_prep/prepare_raw_imagenet.py",
+            "--input_dir", input_dir,
+            "--output_dir", output_dir,
+            "--target_size", target_size
+        ]
+        
+        if max_samples:
+            cmd.extend(["--max_samples", max_samples])
+        
+        skip_check = input("Skip image availability check? [Y/n]: ").strip().lower()
+        if skip_check != 'n':
+            cmd.append("--skip_check")
+            
+        split_from_train = input("Create validation from training data? [y/N]: ").strip().lower()
+        if split_from_train == 'y':
+            cmd.append("--split_from_train")
+    else:
+        print("Invalid choice!")
+        return 1
+    
+    print(f"\nRunning command: {' '.join(cmd)}")
+    print()
+    
+    # Run the command
+    try:
+        result = subprocess.run(cmd, check=True)
+        print("\nProcessing completed successfully!")
+        return 0
+    except subprocess.CalledProcessError as e:
+        print(f"\nError during processing: {e}")
+        return 1
+    except KeyboardInterrupt:
+        print("\nProcessing interrupted by user")
+        return 1
+
+if __name__ == "__main__":
+    sys.exit(main()) 
\ No newline at end of file
diff --git a/scripts/nn/examples/Example-ImageNet_AlexNet_Optimizers.dml b/scripts/nn/examples/Example-ImageNet_AlexNet_Optimizers.dml
new file mode 100644
index 00000000000..22555e3d040
--- /dev/null
+++ b/scripts/nn/examples/Example-ImageNet_AlexNet_Optimizers.dml
@@ -0,0 +1,192 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Example script to test different optimizers with AlexNet on ImageNet
+ * 
+ * This script demonstrates how different optimizers perform on ImageNet,
+ * particularly focusing on large batch training scenarios.
+ */
+
+source("imagenet_alexnet.dml") as imagenet_alexnet
+
+# ImageNet parameters
+C = 3      # RGB channels
+Hin = 224  # Height
+Win = 224  # Width
+K = 1000   # Number of classes
+
+print("\n=======================================================")
+print("Optimizer Comparison on ImageNet AlexNet")
+print("=======================================================\n")
+
+# For demonstration, we'll use a smaller subset of ImageNet
+# In practice, you would load the full ImageNet dataset
+print("Loading ImageNet subset for demonstration...")
+
+# Simulate loading training data (5K samples for faster demo)
+n_train = 5000
+X = rand(rows=n_train, cols=C*Hin*Win, min=0, max=1, seed=42)
+y = rand(rows=n_train, cols=K, min=0, max=0, seed=42)
+# Create one-hot encoded labels
+for(i in 1:n_train) {
+  class = as.scalar(round(rand(rows=1, cols=1, min=1, max=K, seed=42+i)))
+  y[i, class] = 1
+}
+
+# Simulate validation data (500 samples)
+n_val = 500
+X_val = rand(rows=n_val, cols=C*Hin*Win, min=0, max=1, seed=43)
+y_val = rand(rows=n_val, cols=K, min=0, max=0, seed=43)
+for(i in 1:n_val) {
+  class = as.scalar(round(rand(rows=1, cols=1, min=1, max=K, seed=43+i)))
+  y_val[i, class] = 1
+}
+
+# Training parameters
+epochs = 1  # Reduced for demonstration
+batch_size = 512  # Medium batch size for fair comparison
+
+# Test different optimizers
+optimizers = list("sgd", "sgd_momentum", "adam", "lars")
+learning_rates = list(0.01, 0.01, 0.001, 0.1)  # Tuned for each optimizer
+
+# Store results
+results = matrix(0, rows=length(optimizers), cols=5)
+# Columns: optimizer_id, top1_acc, top5_acc, final_loss, train_time
+
+print("Configuration:")
+print("- Dataset: ImageNet subset (demonstration)")
+print("- Model: AlexNet with Batch Normalization")
+print("- Training samples: " + n_train)
+print("- Validation samples: " + n_val)
+print("- Epochs: " + epochs)
+print("- Batch size: " + batch_size)
+print("\n")
+
+# Test each optimizer
+for (i in 1:length(optimizers)) {
+  optimizer = as.scalar(optimizers[i])
+  lr = as.scalar(learning_rates[i])
+  
+  print("\n=========================================")
+  print("Testing optimizer: " + optimizer)
+  print("Learning rate: " + lr)
+  print("-----------------------------------------")
+  
+  # Train model
+  start_time = time()
+  model = imagenet_alexnet::train(X, y, X_val, y_val, C, Hin, Win, 
+                                 epochs, optimizer, lr, batch_size)
+  train_time = (time() - start_time) / 1000.0  # Convert to seconds
+  
+  # Extract all model parameters
+  W1 = as.matrix(model["W1"]); b1 = as.matrix(model["b1"])
+  W2 = as.matrix(model["W2"]); b2 = as.matrix(model["b2"])
+  W3 = as.matrix(model["W3"]); b3 = as.matrix(model["b3"])
+  W4 = as.matrix(model["W4"]); b4 = as.matrix(model["b4"])
+  W5 = as.matrix(model["W5"]); b5 = as.matrix(model["b5"])
+  W6 = as.matrix(model["W6"]); b6 = as.matrix(model["b6"])
+  W7 = as.matrix(model["W7"]); b7 = as.matrix(model["b7"])
+  W8 = as.matrix(model["W8"]); b8 = as.matrix(model["b8"])
+  
+  # Extract BN parameters
+  gamma1 = as.matrix(model["gamma1"]); beta1 = as.matrix(model["beta1"])
+  ema_mean1 = as.matrix(model["ema_mean1"]); ema_var1 = as.matrix(model["ema_var1"])
+  gamma2 = as.matrix(model["gamma2"]); beta2 = as.matrix(model["beta2"])
+  ema_mean2 = as.matrix(model["ema_mean2"]); ema_var2 = as.matrix(model["ema_var2"])
+  gamma3 = as.matrix(model["gamma3"]); beta3 = as.matrix(model["beta3"])
+  ema_mean3 = as.matrix(model["ema_mean3"]); ema_var3 = as.matrix(model["ema_var3"])
+  gamma4 = as.matrix(model["gamma4"]); beta4 = as.matrix(model["beta4"])
+  ema_mean4 = as.matrix(model["ema_mean4"]); ema_var4 = as.matrix(model["ema_var4"])
+  gamma5 = as.matrix(model["gamma5"]); beta5 = as.matrix(model["beta5"])
+  ema_mean5 = as.matrix(model["ema_mean5"]); ema_var5 = as.matrix(model["ema_var5"])
+  
+  # Evaluate on validation set
+  probs_val = imagenet_alexnet::predict(X_val, C, Hin, Win, 
+                                       W1, b1, W2, b2, W3, b3, W4, b4, 
+                                       W5, b5, W6, b6, W7, b7, W8, b8,
+                                       gamma1, beta1, ema_mean1, ema_var1,
+                                       gamma2, beta2, ema_mean2, ema_var2,
+                                       gamma3, beta3, ema_mean3, ema_var3,
+                                       gamma4, beta4, ema_mean4, ema_var4,
+                                       gamma5, beta5, ema_mean5, ema_var5)
+  [loss_val, top1_acc, top5_acc] = imagenet_alexnet::eval(probs_val, y_val)
+  
+  print("\nFinal Results:")
+  print("Validation Loss: " + loss_val)
+  print("Top-1 Accuracy: " + top1_acc + " (" + (top1_acc * 100) + "%)")
+  print("Top-5 Accuracy: " + top5_acc + " (" + (top5_acc * 100) + "%)")
+  print("Training Time: " + train_time + " seconds")
+  
+  # Store results
+  results[i, 1] = i  # optimizer id
+  results[i, 2] = top1_acc
+  results[i, 3] = top5_acc
+  results[i, 4] = loss_val
+  results[i, 5] = train_time
+}
+
+# Print summary comparison
+print("\n\n=======================================================")
+print("OPTIMIZER COMPARISON SUMMARY")
+print("=======================================================")
+print("\nOptimizer      | Top-1 Acc | Top-5 Acc | Val Loss | Time (s)")
+print("---------------|-----------|-----------|----------|----------")
+
+optimizer_names = list("SGD", "SGD+Momentum", "Adam", "LARS")
+for(i in 1:nrow(results)) {
+  opt_name = as.scalar(optimizer_names[i])
+  print(sprintf("%-14s | %9.4f | %9.4f | %8.4f | %8.2f",
+                opt_name, 
+                as.scalar(results[i,2]), 
+                as.scalar(results[i,3]), 
+                as.scalar(results[i,4]), 
+                as.scalar(results[i,5])))
+}
+
+# Find best performers
+best_top1_idx = as.scalar(rowIndexMax(results[,2]))
+best_top5_idx = as.scalar(rowIndexMax(results[,3]))
+fastest_idx = as.scalar(rowIndexMin(results[,5]))
+
+print("\nBest Performers:")
+print("- Highest Top-1 Accuracy: " + as.scalar(optimizer_names[best_top1_idx]) + 
+      " (" + as.scalar(results[best_top1_idx,2]) + ")")
+print("- Highest Top-5 Accuracy: " + as.scalar(optimizer_names[best_top5_idx]) + 
+      " (" + as.scalar(results[best_top5_idx,3]) + ")")
+print("- Fastest Training: " + as.scalar(optimizer_names[fastest_idx]) + 
+      " (" + as.scalar(results[fastest_idx,5]) + "s)")
+
+print("\nKey Observations:")
+print("1. SGD with momentum typically provides good baseline performance")
+print("2. Adam converges quickly but may not achieve best final accuracy")
+print("3. LARS excels with large batch sizes (not fully demonstrated here)")
+print("4. Proper learning rate tuning is crucial for each optimizer")
+print("5. Batch normalization helps stabilize training across optimizers")
+
+print("\nNote: This is a demonstration with limited data and epochs.")
+print("Full ImageNet training would require:")
+print("- 1.2M+ training images")
+print("- 90+ epochs")
+print("- Proper data augmentation")
+print("- Learning rate scheduling")
+print("=======================================================\n") 
\ No newline at end of file
diff --git a/scripts/nn/examples/Example-MNIST_Softmax.dml b/scripts/nn/examples/Example-MNIST_Softmax.dml
index 6a666698ff8..011278bf775 100644
--- a/scripts/nn/examples/Example-MNIST_Softmax.dml
+++ b/scripts/nn/examples/Example-MNIST_Softmax.dml
@@ -23,7 +23,7 @@
 source("nn/examples/mnist_softmax.dml") as mnist_softmax
 
 # Read training data
-data = read("mnist_data/mnist_train.csv", format="csv")
+data = read("mnist_data/mnist_train.csv", format="csv", header=TRUE)
 n = nrow(data)
 
 # Extract images and labels
@@ -45,7 +45,7 @@ epochs = 1
 [W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)
 
 # Read test data
-data = read("mnist_data/mnist_test.csv", format="csv")
+data = read("mnist_data/mnist_test.csv", format="csv", header=TRUE)
 n = nrow(data)
 
 # Extract images and labels
diff --git a/scripts/nn/examples/Example-ResNet.dml b/scripts/nn/examples/Example-ResNet.dml
index 97b7781573c..81d965df760 100644
--- a/scripts/nn/examples/Example-ResNet.dml
+++ b/scripts/nn/examples/Example-ResNet.dml
@@ -48,7 +48,7 @@ classes = 1000
 # *** adagrad
 # optimizer_params = resnet18::init_adagrad_optim_params(classes)
 # *** adam
-optimizer_params = resnet18::init_adam_optim_params(classes)
+# optimizer_params = resnet18::init_adam_optim_params(classes)
 # *** rmsprop
 # optimizer_params = resnet18::init_rmsprop_optim_params(classes)
 # *** sgd
@@ -57,6 +57,8 @@ optimizer_params = resnet18::init_adam_optim_params(classes)
 # optimizer_params = resnet18::init_sgd_momentumg_optim_params(classes)
 # *** sgd nesterov
 # optimizer_params = resnet18::init_sgd_nesterov_optim_params(classes)
+# *** lars
+optimizer_params = resnet18::init_lars_optim_params(classes)
 
 # create random data
 N = 100
@@ -90,6 +92,11 @@ train = function(matrix[double] X, matrix[double] Y, list[unknown] model, list[u
     decay_rate = 0.99
     # sgd momentum & nesterov
     momentum = 0.8
+    # lars
+    trust_coeff = 0.001
+    momentum = 0.9
+    weight_decay = 0.0001
+    decay_power = 2
 
     learned_model = list()
     learned_emas = list()
@@ -127,9 +134,9 @@ train = function(matrix[double] X, matrix[double] Y, list[unknown] model, list[u
             # *** adagrad
             # [model, optim_params] = resnet18::update_params_with_adagrad(model, gradients, lr, epsilon, optim_params)
             # *** adam
-            [model, optim_params] = resnet18::update_params_with_adam(model, gradients, lr, beta1, beta2, epsilon,
-                t, optim_params)
-            t = t + 1
+            # [model, optim_params] = resnet18::update_params_with_adam(model, gradients, lr, beta1, beta2, epsilon,
+            #    t, optim_params)
+            # t = t + 1
             # *** rmsprop
             # [model, optim_params] = resnet18::update_params_with_rmsprop(model, gradients, lr, decay_rate, epsilon,
             #     optim_params)
@@ -141,6 +148,9 @@ train = function(matrix[double] X, matrix[double] Y, list[unknown] model, list[u
             # *** sgd nesterov
             # [model, optim_params] = resnet18::update_params_with_sgd_nesterov(model, gradients, lr, momentum,
             #     optim_params)
+            # *** lars
+            [model, optim_params] = resnet18::update_params_with_lars(model, gradients, lr, momentum, weight_decay, trust_coeff,
+                  optim_params)
         }
 
         # reshuffle mini batches
diff --git a/scripts/nn/examples/imagenet_alexnet.dml b/scripts/nn/examples/imagenet_alexnet.dml
new file mode 100644
index 00000000000..d26d7a0d6c1
--- /dev/null
+++ b/scripts/nn/examples/imagenet_alexnet.dml
@@ -0,0 +1,334 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# ImageNet AlexNet - Train
+#
+# This script trains a convolutional net using the "AlexNet" architecture
+# on 224x224 ImageNet images using LARS optimizer.
+#
+# Inputs:
+#  - train_data: File containing ImageNet training images (features)
+#  - train_labels: File containing ImageNet training labels (one-hot)
+#  - val_data: File containing ImageNet validation images (features)
+#  - val_labels: File containing ImageNet validation labels (one-hot)
+#  - epochs: [DEFAULT: 30] Total number of full training loops
+#  - batch_size: [DEFAULT: 256] Mini-batch size for training
+#  - out_dir: [DEFAULT: "scripts/nn/examples/model/imagenet_alexnet"] Directory to store results
+#
+# Outputs:
+#  - accuracy: File containing validation accuracy over epochs
+#  - loss: File containing training loss over epochs
+#
+# Sample Invocation:
+#   ```
+#   java -Xmx8g -Xms8g -cp "target/systemds-3.4.0-SNAPSHOT.jar:target/lib/*" \
+#   org.apache.sysds.api.DMLScript -f scripts/nn/examples/imagenet_alexnet.dml \
+#   -exec singlenode -gpu
+#   java -Xmx8g -Xms8g -cp "target/systemds-3.4.0-SNAPSHOT.jar:target/lib/*" org.apache.sysds.api.DMLScript -f scripts/nn/examples/imagenet_alexnet.dml -exec singlenode -gpu
+#   ```
+
+
+
+source("nn/networks/alexnet.dml") as alexnet
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+
+# Read the ImageNet data
+fmt = "csv"
+target_size = 224  # For display purposes
+
+print("Loading ImageNet data (224x224)...")
+print("Data directory: imagenet_data/224x224")
+print("")
+
+# Read the data files with constant string paths
+print("Reading training data...")
+train_data = read("imagenet_data/224x224/imagenet_224x224_train.csv", format=fmt)
+train_labels = read("imagenet_data/224x224/imagenet_224x224_train_labels.csv", format=fmt)
+print("Reading validation data...")
+val_data = read("imagenet_data/224x224/imagenet_224x224_test.csv", format=fmt)
+val_labels = read("imagenet_data/224x224/imagenet_224x224_test_labels.csv", format=fmt)
+out_dir = "scripts/nn/examples/model/imagenet_alexnet"
+
+print("Data loaded successfully.")
+
+# Get dataset dimensions
+N = nrow(train_data)
+N_val = nrow(val_data)
+classes = 1000
+
+print("Dataset info:")
+print("- Training samples: " + N)
+print("- Validation samples: " + N_val)
+print("- Features: " + ncol(train_data))
+print("- Classes: " + classes)
+
+# Scale images to [-1,1] (data is already in [0,1] range from preprocessing)
+X = (train_data - 0.5) * 2
+X_val = (val_data - 0.5) * 2
+
+# Labels are already one-hot encoded from preprocessing
+Y = train_labels
+Y_val = val_labels
+
+print("Data preprocessing completed.")
+print("- Image range: [" + min(X) + ", " + max(X) + "]")
+print("- Label sum check: " + mean(rowSums(Y)))
+
+# Get initial model parameters
+print("Initializing AlexNet model...")
+use_bn = FALSE  # Use batch normalization
+if (use_bn) {
+    print("Using AlexNet with Batch Normalization")
+    [model, emas] = alexnet::init_with_bn(3, 224, 224, classes, 42)
+} else {
+    print("Using standard AlexNet")
+    model = alexnet::init(3, 224, 224, classes, 42)
+    emas = list()  # Empty for non-BN version
+}
+
+# Get initial optimizer parameters
+print("Initializing LARS optimizer...")
+optimizer_params = alexnet::init_lars_optim_params(model)
+
+# Define image properties
+Hin = target_size
+Win = target_size
+C = 3
+
+# Define training parameters
+epochs = 30
+batch_size = 256
+
+print("Training configuration:")
+print("- Image size: " + Hin + "x" + Win + "x" + C + " (features: " + (Hin*Win*C) + ")")
+print("- Epochs: " + epochs)
+print("- Batch size: " + batch_size)
+print("- Use Batch Normalization: " + use_bn)
+print("")
+
+print("Starting training...")
+[accuracy, loss_metric, learned_model, learned_emas] = train(X, Y, X_val, Y_val, model, emas, N, C, Hin, Win, epochs, batch_size, optimizer_params, use_bn)
+
+print("Saving results...")
+write(accuracy, out_dir + "/imagenet_alexnet_accuracy.csv", format="csv")
+write(loss_metric, out_dir + "/imagenet_alexnet_loss.csv", format="csv")
+
+# Save final metrics
+final_accuracy = as.scalar(accuracy[epochs, 1])
+print("Final validation accuracy: " + final_accuracy)
+
+print("Training completed!")
+
+train = function(matrix[double] X, matrix[double] Y, matrix[double] X_val, matrix[double] Y_val, list[unknown] model, list[unknown] emas, 
+    int samples, int C, int Hin, int Win, int epochs, int batch_size, list[unknown] optim_params, boolean use_bn)
+    return (matrix[double] accuracy, matrix[double] loss_metric, 
+            list[unknown] learned_model, list[unknown] learned_emas) {
+
+    # --- HYPERPARAMETERS ---
+    base_batch_size = 256  # Reference batch size for LR scaling
+    initial_lr = 0.01 * (batch_size / base_batch_size)  # Linear scaling rule
+    end_lr = 0.00001 
+    warmup_epochs = 5
+    power = 2.0
+    momentum = 0.9
+    trust_coeff = 0.001
+    weight_decay = 0.0005
+    
+    iterations_per_epoch = ceil(samples / batch_size)
+    total_iterations = epochs * iterations_per_epoch
+    warmup_iterations = warmup_epochs * iterations_per_epoch
+    decay_iterations = total_iterations - warmup_iterations
+
+    print("LARS Configuration:")
+    print("- Base LR: " + (0.01) + " (scaled to " + initial_lr + " for batch size " + batch_size + ")")
+    print("- End LR: " + end_lr)
+    print("- Warmup epochs: " + warmup_epochs)
+    print("- Momentum: " + momentum)
+    print("- Weight decay: " + weight_decay)
+    print("- Trust coefficient: " + trust_coeff)
+    print("- Use BN: " + use_bn)
+    print("")
+
+    accuracy = matrix(0, rows=epochs, cols=1)
+    loss_metric = matrix(0, rows=epochs, cols=1)
+    mode = "train"
+
+    for (epoch in 1:epochs) {
+        loss_avg = 0.0
+        print("Start epoch: " + epoch + "/" + epochs)
+
+        for (i in 1:iterations_per_epoch) {
+            if (i %% 50 == 1) { print(" - Iteration: " + i + "/" + iterations_per_epoch) }
+
+            # --- DYNAMIC LEARNING RATE ---
+            current_iteration = (epoch - 1) * iterations_per_epoch + i
+            if (current_iteration < warmup_iterations) {
+                current_lr = initial_lr * (as.double(current_iteration) / warmup_iterations)
+            } else {
+                decay_step = current_iteration - warmup_iterations
+                decay_progress = as.double(decay_step) / decay_iterations
+                current_lr = end_lr + (initial_lr - end_lr) * (1 - decay_progress)^power
+            }
+            if (i == 1) { print("Using Learning Rate: " + current_lr) }
+
+            # --- BATCH PREPARATION ---
+            start = (i - 1) * batch_size + 1
+            end = min(samples, i * batch_size)
+            X_batch = X[start:end,]
+            Y_batch = Y[start:end,]
+
+            # --- FORWARD AND BACKWARD PASS ---
+            if (use_bn) { 
+                [out, cached_out, emas] = alexnet::forward_with_bn(X_batch, C, Hin, Win, model, "train", 0.5) 
+            } else { 
+                [out, cached_out] = alexnet::forward(X_batch, C, Hin, Win, model, "train", 0.5) 
+            }
+            
+            # Compute loss with L2 regularization
+            loss = alexnet::compute_loss(out, Y_batch, model, weight_decay)
+            loss_avg = (loss_avg * (i - 1) + loss) / i
+            
+            # Backward pass
+            dOut = cross_entropy_loss::backward(out, Y_batch)
+            if (use_bn) { 
+                [dX, gradients] = alexnet::backward_with_bn(dOut, cached_out, model, C, Hin, Win, 0.5) 
+            } else { 
+                [dX, gradients] = alexnet::backward(dOut, cached_out, model, C, Hin, Win, 0.5) 
+            }
+            
+            # Update with LARS (weight decay is handled internally by LARS)
+            [model, optim_params] = alexnet::update_params_with_lars(
+                model, gradients, current_lr, momentum, weight_decay, 
+                trust_coeff, optim_params)
+        }
+
+        # --- EVALUATION ---
+        print("Computing metrics for current epoch...")
+        if (use_bn) { 
+            accuracy_scalar = predict_and_eval_batched_with_bn(X_val, Y_val, C, Hin, Win, model, emas, batch_size) 
+        } else { 
+            accuracy_scalar = predict_and_eval_batched(X_val, Y_val, C, Hin, Win, model, batch_size) 
+        }
+
+        loss_metric[epoch, 1] = loss_avg
+        accuracy[epoch, 1] = accuracy_scalar
+        print("Epoch " + epoch + " completed:")
+        print("- Avg. Loss: " + loss_avg)
+        print("- Validation Accuracy: " + accuracy_scalar)
+        print("")
+    }
+    learned_model = model
+    learned_emas = emas
+}
+
+predict = function(matrix[double] X, int C, int Hin, int Win, 
+    list[unknown] model) 
+    return(matrix[double] out) {
+    /*
+    * Computes the class probability predictions using standard AlexNet.
+    */
+    
+    # Predict on validation dataset
+    mode = "test"
+    [out, cached_out] = alexnet::forward(X, C, Hin, Win, model, mode, 0.0)
+}
+
+predict_with_bn = function(matrix[double] X, int C, int Hin, int Win, 
+    list[unknown] model, list[unknown] emas) 
+    return(matrix[double] out) {
+    /*
+    * Computes the class probability predictions using AlexNet with Batch Normalization.
+    */
+    
+    # Predict on validation dataset
+    mode = "test"
+    [out, cached_out, emas_temp] = alexnet::forward_with_bn(X, C, Hin, Win, model, mode, 0.0)
+}
+
+predict_and_eval_batched = function(matrix[double] X_val, matrix[double] Y_val, int C, int Hin, int Win, 
+    list[unknown] model, int batch_size)
+    return(double accuracy) {
+    /*
+    * Batched prediction and evaluation for standard AlexNet to avoid memory issues
+    */
+    
+    N_val = nrow(X_val)
+    val_iterations = ceil(N_val / batch_size)
+    correct_total = 0
+    mode = "test"
+    
+    print("  Evaluating validation set in " + val_iterations + " batches...")
+    
+    for (i in 1:val_iterations) {
+        if (i %% 10 == 1) {
+            print("    Validation batch: " + i + "/" + val_iterations)
+        }
+        
+        start = (i - 1) * batch_size + 1
+        end = min(N_val, i * batch_size)
+        X_batch = X_val[start:end,]
+        Y_batch = Y_val[start:end,]
+        
+        # Forward pass
+        [out_batch, cached_out] = alexnet::forward(X_batch, C, Hin, Win, model, mode, 0.0)
+        
+        # Count correct predictions
+        correct_pred = rowIndexMax(out_batch) == rowIndexMax(Y_batch)
+        correct_total = correct_total + sum(correct_pred)
+    }
+    
+    accuracy = correct_total / N_val
+}
+
+predict_and_eval_batched_with_bn = function(matrix[double] X_val, matrix[double] Y_val, int C, int Hin, int Win, 
+    list[unknown] model, list[unknown] emas, int batch_size)
+    return(double accuracy) {
+    /*
+    * Batched prediction and evaluation for AlexNet with BN to avoid memory issues
+    */
+    
+    N_val = nrow(X_val)
+    val_iterations = ceil(N_val / batch_size)
+    correct_total = 0
+    mode = "test"
+    
+    print("  Evaluating validation set in " + val_iterations + " batches...")
+    
+    for (i in 1:val_iterations) {
+        if (i %% 10 == 1) {
+            print("    Validation batch: " + i + "/" + val_iterations)
+        }
+        
+        start = (i - 1) * batch_size + 1
+        end = min(N_val, i * batch_size)
+        X_batch = X_val[start:end,]
+        Y_batch = Y_val[start:end,]
+        
+        # Forward pass
+        [out_batch, cached_out, emas_temp] = alexnet::forward_with_bn(X_batch, C, Hin, Win, model, mode, 0.0)
+        
+        # Count correct predictions
+        correct_pred = rowIndexMax(out_batch) == rowIndexMax(Y_batch)
+        correct_total = correct_total + sum(correct_pred)
+    }
+    
+    accuracy = correct_total / N_val
+}
\ No newline at end of file
diff --git a/scripts/nn/examples/imagenet_resnet.dml b/scripts/nn/examples/imagenet_resnet.dml
new file mode 100644
index 00000000000..2aba2fd16bb
--- /dev/null
+++ b/scripts/nn/examples/imagenet_resnet.dml
@@ -0,0 +1,307 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# ImageNet Resnet - Train
+#
+# This script trains a convolutional net using the "ResNet" architecture
+# on 64x64 ImageNet images using LARS optimizer.
+#
+# Inputs:
+#  - train_data: File containing ImageNet training images (features)
+#  - train_labels: File containing ImageNet training labels (one-hot)
+#  - val_data: File containing ImageNet validation images (features) 
+#  - val_labels: File containing ImageNet validation labels (one-hot)
+#  - epochs: [DEFAULT: 30] Total number of full training loops
+#  - batch_size: [DEFAULT: 256] Mini-batch size for training
+#  - out_dir: [DEFAULT: "scripts/nn/examples/model/imagenet_resnet"] Directory to store results
+#
+# Outputs:
+#  - accuracy: File containing validation accuracy over epochs
+#  - loss: File containing training loss over epochs
+#
+# Sample Invocation:
+#   ```
+#   java -Xmx8g -Xms8g -cp "target/systemds-3.4.0-SNAPSHOT.jar:target/lib/*" \
+#   org.apache.sysds.api.DMLScript -f scripts/nn/examples/imagenet_resnet.dml \
+#   -exec singlenode -gpu
+#   ```
+
+source("nn/networks/resnet50.dml") as resnet
+source("scripts/nn/layers/softmax_cross_entropy_loss.dml") as loss_nn
+
+# Read the ImageNet data
+fmt = "csv"
+print("Loading ImageNet data...")
+train_data = read("imagenet_data/systemds_ready/imagenet_train_6GB.csv", format=fmt)
+train_labels = read("imagenet_data/systemds_ready/imagenet_train_labels_6GB.csv", format=fmt)
+val_data = read("imagenet_data/systemds_ready/imagenet_val_6GB.csv", format=fmt)
+val_labels = read("imagenet_data/systemds_ready/imagenet_val_labels_6GB.csv", format=fmt)
+out_dir = "scripts/nn/examples/model/imagenet_resnet"
+
+print("Data loaded successfully.")
+
+# Get dataset dimensions
+N = nrow(train_data)
+N_val = nrow(val_data)
+classes = 1000
+
+print("Dataset info:")
+print("- Training samples: " + N)
+print("- Validation samples: " + N_val)  
+print("- Features: " + ncol(train_data))
+print("- Classes: " + classes)
+
+# Scale images to [-1,1] (data is already in [0,1] range from preprocessing)
+X = (train_data - 0.5) * 2
+X_val = (val_data - 0.5) * 2
+
+# Labels are already one-hot encoded from preprocessing
+Y = train_labels
+Y_val = val_labels
+
+print("Data preprocessing completed.")
+print("- Image range: [" + min(X) + ", " + max(X) + "]")
+print("- Label sum check: " + mean(rowSums(Y)))
+
+# Get initial model parameters
+print("Initializing ResNet-18 model...")
+[model, ema_means_vars] = resnet::init(classes, -1)
+
+# Get initial optimizer parameters  
+print("Initializing LARS optimizer...")
+optimizer_params = resnet::init_lars_optim_params(classes)
+
+# Define image properties
+Hin = 64
+Win = 64
+
+# Define training parameters
+epochs = 90
+batch_size = 256
+
+print("Training configuration:")
+print("- Image size: " + Hin + "x" + Win + "x3")
+print("- Epochs: " + epochs)
+print("- Batch size: " + batch_size)
+print("")
+
+print("Starting training...")
+[accuracy, loss_metric, learned_model, learned_emas] = train(X, Y, X_val, Y_val, model, ema_means_vars, N, Hin, Win, epochs, batch_size, optimizer_params)
+
+print("Saving results...")
+write(accuracy, out_dir + "/imagenet_resnet_accuracy.csv", format="csv")
+write(loss_metric, out_dir + "/imagenet_resnet_loss.csv", format="csv")
+
+# Save final metrics
+final_accuracy = as.scalar(accuracy[epochs, 1])
+print("Final validation accuracy: " + final_accuracy)
+
+print("Training completed!")
+
+# Train function
+train = function(matrix[double] X, matrix[double] Y, matrix[double] X_val, matrix[double] Y_val, list[unknown] model, list[unknown] emas, int samples, int Hin,
+    int Win, int epochs, int batch_size, list[unknown] optim_params)
+    return (matrix[double] accuracy, matrix[double] loss_metric, 
+            list[unknown] learned_model, list[unknown] learned_emas) {
+
+    # --- LEARNING RATE SCHEDULE HYPERPARAMETERS ---
+    # The learning rate we want to reach AFTER warmup
+    initial_lr = 0.01 
+    # A very small final learning rate to decay towards
+    end_lr = 0.0001 
+    # Number of warmup epochs, as per the paper
+    warmup_epochs = 5
+    # The exponent for the polynomial decay, as per the paper
+    power = 2.0
+
+    # Optimizer hyperparameters
+    momentum = 0.9
+    trust_coeff = 0.001
+    weight_decay = 0.0001
+    
+    # Calculate total iterations for the schedule
+    iterations_per_epoch = ceil(samples / batch_size)
+    total_iterations = epochs * iterations_per_epoch
+    warmup_iterations = warmup_epochs * iterations_per_epoch
+    decay_iterations = total_iterations - warmup_iterations
+
+    # Initialize metrics
+    learned_model = list()
+    learned_emas = list()
+    accuracy = matrix(0, rows=epochs, cols=1)
+    loss_metric = matrix(0, rows=epochs, cols=1)
+
+    iterations = ceil(samples/batch_size)
+    mode = "train"
+
+    for (epoch in 1:epochs) {
+        loss_avg = 0.0
+
+        print("Start epoch: " + epoch + "/" + epochs)
+
+        for (i in 1:iterations) {
+            print(" - Iteration: " + i + "/" + iterations)
+
+            # --- START DYNAMIC LEARNING RATE LOGIC ---
+            current_iteration = (epoch - 1) * iterations_per_epoch + i
+            current_lr = 0.0
+
+            if (current_iteration < warmup_iterations) {
+                # 1. Linear Warmup Phase
+                # Linearly increase LR from 0 to initial_lr over warmup_iterations
+                current_lr = initial_lr * (as.double(current_iteration) / warmup_iterations)
+            } else {
+                # 2. Polynomial Decay Phase
+                decay_step = current_iteration - warmup_iterations
+                decay_progress = as.double(decay_step) / decay_iterations
+                current_lr = end_lr + (initial_lr - end_lr) * (1 - decay_progress)^power
+            }
+            
+            if (i == 1) { # Print LR once per epoch to reduce log spam
+                print("Using Learning Rate: " + current_lr)
+            }
+            # --- END DYNAMIC LEARNING RATE LOGIC ---
+
+            # Get batch
+            start = (i - 1) * batch_size + 1
+            end = min(samples, i * batch_size)
+            X_batch = X[start:end,]
+            Y_batch = Y[start:end,]
+
+            # Forward pass
+            [out, emas, cached_out, cached_means_vars] = resnet::forward(X_batch, Hin, Win, model, mode, emas)
+
+            # Loss
+            loss = loss_nn::forward(out, Y_batch)
+            if (i %% 10 == 0) { # Print loss same frequency as MNIST
+                print(" - Iteration: " + i + "/" + iterations + ", Loss: " + loss)
+            }
+            loss_avg = (loss_avg * (i - 1) + loss) / i
+
+            # Backward
+            dOut = loss_nn::backward(out, Y_batch)
+            [dX, gradients] = resnet::backward(dOut, cached_out, model, cached_means_vars)
+
+            # Update parameters
+            [model, optim_params] = resnet::update_params_with_lars(model, gradients, current_lr, momentum, weight_decay, trust_coeff,
+                  optim_params)
+        }
+
+        # Reshuffle mini batches
+        r = rand(rows=nrow(Y), cols=1, min=0, max=1, pdf="uniform")
+        X_tmp = order(target=cbind(r, X), by=1)
+        Y_tmp = order(target=cbind(r, Y), by=1)
+        X = X_tmp[,2:ncol(X_tmp)]
+        Y = Y_tmp[,2:ncol(Y_tmp)]
+
+        print("Computing metrics for current epoch...")
+
+        # Predict on the validation dataset with batching to avoid OOM
+        accuracy_scalar = predict_and_eval_batched(X_val, Y_val, Hin, Win, model, emas, batch_size)
+
+        # Append to the epoch-wise metrics
+        loss_metric[epoch, 1] = loss_avg
+        accuracy[epoch, 1] = accuracy_scalar
+
+        print("Epoch Avg. Loss: " + loss_avg)
+        print("Epoch Accuracy: " + accuracy_scalar)
+    }
+
+    learned_model = model
+    learned_emas = emas
+}
+
+predict = function(matrix[double] X, int Hin, int Win, 
+    list[unknown] model, list[unknown] emas) 
+    return(matrix[double] out) {
+    /*
+    * Computes the class probability predictions of a convolutional
+    * net using the "ResNet" architecture.
+    *
+    * The input matrix, X, has N examples, each represented as a 3D
+    * volume unrolled into a single vector.
+    *
+    * Inputs:
+    *  - X: Input data matrix, of shape (N, C*Hin*Win).
+    *
+    * Outputs:
+    *  - probs: Class probabilities, of shape (N, K).
+    */
+    
+    # Predict on validation dataset
+    mode = "train"
+    [out, temp_emas, temp_cached_out, temp_cached_means_vars] = resnet::forward(X, Hin, Win, model, mode, emas)
+}
+
+predict_and_eval_batched = function(matrix[double] X_val, matrix[double] Y_val, int Hin, int Win, 
+    list[unknown] model, list[unknown] emas, int batch_size)
+    return(double accuracy) {
+    /*
+    * Batched prediction and evaluation to avoid memory issues with large validation sets
+    */
+    
+    N_val = nrow(X_val)
+    val_iterations = ceil(N_val / batch_size)
+    correct_total = 0
+    mode = "train"
+    
+    print("  Evaluating validation set in " + val_iterations + " batches...")
+    
+    for (i in 1:val_iterations) {
+        if (i %% 10 == 1) {
+            print("    Validation batch: " + i + "/" + val_iterations)
+        }
+        
+        start = (i - 1) * batch_size + 1
+        end = min(N_val, i * batch_size)
+        X_batch = X_val[start:end,]
+        Y_batch = Y_val[start:end,]
+        
+        # Forward pass
+        [out_batch, temp_emas, temp_cached_out, temp_cached_means_vars] = resnet::forward(X_batch, Hin, Win, model, mode, emas)
+        
+        # Count correct predictions
+        correct_pred = rowIndexMax(out_batch) == rowIndexMax(Y_batch)
+        correct_total = correct_total + sum(correct_pred)
+    }
+    
+    accuracy = correct_total / N_val
+}
+
+eval = function(matrix[double] probs, matrix[double] Y)
+    return(double accuracy) {
+    /*
+    * Evaluates a convolutional net using the "ResNet" architecture.
+    *
+    * The probs matrix contains the class probability predictions
+    * of K classes over N examples.  The targets, Y, have K classes,
+    * and are one-hot encoded.
+    *
+    * Inputs:
+    *  - probs: Class probabilities, of shape (N, K).
+    *  - Y: Target matrix, of shape (N, K).
+    *
+    * Outputs:
+    *  - accuracy: Scalar accuracy, of shape (1).
+    */
+    correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
+    accuracy = mean(correct_pred)
+}
\ No newline at end of file
diff --git a/scripts/nn/examples/load_imagenet_csv.dml b/scripts/nn/examples/load_imagenet_csv.dml
new file mode 100644
index 00000000000..52e724b6de4
--- /dev/null
+++ b/scripts/nn/examples/load_imagenet_csv.dml
@@ -0,0 +1,122 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#-------------------------------------------------------------
+#
+# Script to load ImageNet CSV data and convert to binary format
+#
+#-------------------------------------------------------------
+
+# Function to load and preprocess ImageNet CSV data
+load_and_save_imagenet_data = function() {
+  print("Loading ImageNet CSV data...")
+  
+  # Parameters
+  num_classes = 10  # Adjust based on your data
+  
+  # Use relative paths
+  train_csv = "imagenet_data/imagenet_train.csv"
+  val_csv = "imagenet_data/imagenet_val.csv"
+  
+  # Output binary files
+  train_data_file = "imagenet_data/train_data.bin"
+  train_labels_file = "imagenet_data/train_labels.bin"
+  val_data_file = "imagenet_data/val_data.bin"
+  val_labels_file = "imagenet_data/val_labels.bin"
+  
+  print("Loading training data from CSV...")
+  # Read CSV file
+  train_data = read(train_csv, format="csv", header=FALSE)
+  
+  # Force dense
+  train_data = train_data + 0
+  
+  # Extract labels and features
+  train_labels = train_data[,1]
+  train_features = train_data[,2:ncol(train_data)]
+  
+  # Get sizes
+  N_train = nrow(train_features)
+  D = ncol(train_features)
+  
+  print("Training samples: " + N_train)
+  print("Feature dimension: " + D)
+  
+  # Normalize features to [0, 1]
+  train_features = train_features / 255.0
+  
+  # Convert labels to one-hot encoding
+  # Adjust labels to be 1-based if they are 0-based
+  min_label = min(train_labels)
+  if (min_label == 0) {
+    train_labels = train_labels + 1
+  }
+  
+  train_labels_onehot = table(seq(1, N_train), train_labels, N_train, num_classes)
+  
+  # Save training data in binary format
+  print("Saving training data to binary format...")
+  write(train_features, train_data_file, format="binary")
+  write(train_labels_onehot, train_labels_file, format="binary")
+  
+  print("Loading validation data from CSV...")
+  # Read validation CSV
+  val_data = read(val_csv, format="csv", header=FALSE)
+  
+  # Force dense
+  val_data = val_data + 0
+  
+  # Extract labels and features
+  val_labels = val_data[,1]
+  val_features = val_data[,2:ncol(val_data)]
+  
+  N_val = nrow(val_features)
+  print("Validation samples: " + N_val)
+  
+  # Normalize features
+  val_features = val_features / 255.0
+  
+  # Convert labels to one-hot encoding
+  if (min_label == 0) {
+    val_labels = val_labels + 1
+  }
+  
+  val_labels_onehot = table(seq(1, N_val), val_labels, N_val, num_classes)
+  
+  # Save validation data in binary format
+  print("Saving validation data to binary format...")
+  write(val_features, val_data_file, format="binary")
+  write(val_labels_onehot, val_labels_file, format="binary")
+  
+  print("")
+  print("Data conversion completed!")
+  print("Binary files created:")
+  print("- " + train_data_file + " (shape: " + N_train + " x " + D + ")")
+  print("- " + train_labels_file + " (shape: " + N_train + " x " + num_classes + ")")
+  print("- " + val_data_file + " (shape: " + N_val + " x " + D + ")")
+  print("- " + val_labels_file + " (shape: " + N_val + " x " + num_classes + ")")
+}
+
+# Run the conversion
+load_and_save_imagenet_data()
+
+print("")
+print("You can now use these binary files in your training script for better performance!") 
\ No newline at end of file
diff --git a/scripts/nn/examples/mnist_resnet.dml b/scripts/nn/examples/mnist_resnet.dml
new file mode 100644
index 00000000000..16124dd6c92
--- /dev/null
+++ b/scripts/nn/examples/mnist_resnet.dml
@@ -0,0 +1,286 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Resnet - Train
+#
+# This script trains a convolutional net using the "ResNet" architecture
+# on images of handwritten digits.
+#
+# Inputs:
+#  - train: File containing labeled MNIST training images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - test: File containing labeled MNIST test images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - C: Number of color chanels in the images.
+#  - Hin: Input image height.
+#  - Win: Input image width.
+#  - epochs: [DEFAULT: 10] Total number of full training loops over
+#     the full data set.
+#  - out_dir: [DEFAULT: "."] Directory to store weights and bias
+#     matrices of trained model, as well as final test accuracy.
+#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - W1, W2, W3, W4: Files containing the trained weights of the model.
+#  - b1, b2, b3, b4: Files containing the trained biases of the model.
+#  - accuracy: File containing the accuracy and loss on the test data over all epochs.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+#   ```
+#   nn/examples/get_mnist_data.sh
+#   ```
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 10G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMDS_ROOT/target/SystemDS.jar -f nn/examples/mnist_resnet.dml
+#   -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+#   C=1 Hin=28 Win=28 epochs=10 out_dir=nn/examples/model/mnist_resnet
+#   ```
+#
+
+source("nn/networks/resnet18.dml") as resnet
+source("scripts/nn/layers/softmax_cross_entropy_loss.dml") as loss_nn
+
+# Read the data
+fmt = "csv"
+train = read("scripts/nn/examples/data/mnist_train.csv", format=fmt)
+test = read("scripts/nn/examples/data/mnist_test.csv", format=fmt)
+out_dir = "scripts/nn/example/model/mnist_resnet"
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+images_test = test[,2:ncol(test)]
+labels_test = test[,1]
+classes = 10
+
+# Scale images to [-1,1], and one-hot encode the labels
+N = nrow(images)
+N_test = nrow(images_test)
+X = (images / 255.0) * 2 - 1
+X = cbind(X, X, X) # Resnet assumes C=3 so we duplicate the data along the channels
+Y = table(seq(1, N), labels+1, N, 10)
+X_test = (images_test / 255.0) * 2 - 1
+X_test = cbind(X_test, X_test, X_test)
+Y_test = table(seq(1, N_test), labels_test+1, N_test, 10)
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+#X = images[5001:nrow(images),]
+#X_val = images[1:5000,]
+#Y = labels[5001:nrow(images),]
+#Y_val = labels[1:5000,]
+
+# Get initial model parameters
+[model, ema_means_vars] = resnet::init(classes, -1)
+
+# Get initial optimizer parameters
+optimizer_params = resnet::init_lars_optim_params(classes)
+# optimizer_params = resnet::init_sgd_momentum_optim_params(classes)
+# optimizer_params = resnet::init_adam_optim_params(classes)
+
+# Define image properties
+Hin = 28
+Win = 28
+#N_val = 0
+
+# Define training parameters
+epochs = 90
+batch_size = 512
+
+[accuracy, loss_metric, learned_model, learned_emas] = train(X, Y, X_test, Y_test, model, ema_means_vars, N, Hin, Win, epochs, batch_size, optimizer_params)
+
+write(accuracy, "scripts/nn/examples/out/resnet_mnist_accuracy.csv", format="csv")
+write(loss_metric, "scripts/nn/examples/out/resnet_mnist_loss.csv", format="csv")
+
+#Train
+train = function(matrix[double] X, matrix[double] Y, matrix[double] X_test, matrix[double] Y_test, list[unknown] model, list[unknown] emas, int samples, int Hin,
+    int Win, int epochs, int batch_size, list[unknown] optim_params)
+    return (matrix[double] accuracy, matrix[double] loss_metric, 
+            list[unknown] learned_model, list[unknown] learned_emas) {
+
+    # --- LEARNING RATE SCHEDULE HYPERPARAMETERS ---
+    # The learning rate we want to reach AFTER warmup
+    initial_lr = 0.01 
+    # A very small final learning rate to decay towards
+    end_lr = 0.0001 
+    # Number of warmup epochs, as per the paper
+    warmup_epochs = 5
+    # The exponent for the polynomial decay, as per the paper
+    power = 2.0
+
+    # Optimizer hyperparameters
+    momentum = 0.9
+    trust_coeff = 0.001
+    weight_decay = 0.0001
+    
+    # Adam optimizer hyperparameters
+    beta1 = 0.9
+    beta2 = 0.999
+    epsilon = 1e-8
+    
+    # Calculate total iterations for the schedule
+    iterations_per_epoch = ceil(samples / batch_size)
+    total_iterations = epochs * iterations_per_epoch
+    warmup_iterations = warmup_epochs * iterations_per_epoch
+    decay_iterations = total_iterations - warmup_iterations
+
+    # Initialize metrics
+    learned_model = list()
+    learned_emas = list()
+    accuracy = matrix(0, rows=epochs, cols=1)
+    loss_metric = matrix(0, rows=epochs, cols=1)
+
+    iterations = ceil(samples/batch_size)
+    mode = "train"
+
+    for (epoch in 1:epochs) {
+        loss_avg = 0.0
+
+        print("Start epoch: " + epoch + "/" + epochs)
+
+        for (i in 1:iterations) {
+            print(" - Iteration: " + i + "/" + iterations)
+
+            # --- START DYNAMIC LEARNING RATE LOGIC ---
+            current_iteration = (epoch - 1) * iterations_per_epoch + i
+            current_lr = 0.0
+
+            if (current_iteration < warmup_iterations) {
+                # 1. Linear Warmup Phase
+                # Linearly increase LR from 0 to initial_lr over warmup_iterations
+                current_lr = initial_lr * (as.double(current_iteration) / warmup_iterations)
+            } else {
+                # 2. Polynomial Decay Phase
+                decay_step = current_iteration - warmup_iterations
+                decay_progress = as.double(decay_step) / decay_iterations
+                current_lr = end_lr + (initial_lr - end_lr) * (1 - decay_progress)^power
+            }
+            
+            if (i == 1) { # Print LR once per epoch to reduce log spam
+                print("Using Learning Rate: " + current_lr)
+            }
+            # --- END DYNAMIC LEARNING RATE LOGIC ---
+
+            # Get batch
+            start = (i - 1) * batch_size + 1
+            end = min(samples, i * batch_size)
+            X_batch = X[start:end,]
+            Y_batch = Y[start:end,]
+
+            # Forward pass
+            [out, emas, cached_out, cached_means_vars] = resnet::forward(X_batch, Hin, Win, model, mode, emas)
+
+            # Loss
+            loss = loss_nn::forward(out, Y_batch)
+            if (i %% 10 == 0) { # Print loss less frequently on large datasets
+                print(" - Iteration: " + i + "/" + iterations + ", Loss: " + loss)
+            }
+            loss_avg = (loss_avg * (i - 1) + loss) / i
+
+            # Backward
+            dOut = loss_nn::backward(out, Y_batch)
+            [dX, gradients] = resnet::backward(dOut, cached_out, model, cached_means_vars)
+
+            # Update parameters
+            [model, optim_params] = resnet::update_params_with_lars(model, gradients, current_lr, momentum, weight_decay, trust_coeff,
+                  optim_params)
+            # [model, optim_params] = resnet::update_params_with_sgd_momentum(model, gradients, current_lr, momentum, optim_params)
+            
+            # [model, optim_params] = resnet::update_params_with_adam(model, gradients, current_lr, beta1, beta2, epsilon, current_iteration, optim_params)
+        }
+
+        # Reshuffle mini batches
+        r = rand(rows=nrow(Y), cols=1, min=0, max=1, pdf="uniform")
+        X_tmp = order(target=cbind(r, X), by=1)
+        Y_tmp = order(target=cbind(r, Y), by=1)
+        X = X_tmp[,2:ncol(X_tmp)]
+        Y = Y_tmp[,2:ncol(Y_tmp)]
+
+        print("Computing metrics for current epoch...")
+
+        # Predict on the test dataset
+        out = predict(X_test, Hin, Win, model, emas)
+        accuracy_scalar = eval(out, Y_test)
+
+        # Append to the epoch-wise metrics
+        loss_metric[epoch, 1] = loss_avg
+        accuracy[epoch, 1] = accuracy_scalar
+
+        print("Epoch Avg. Loss: " + loss_avg)
+        print("Epoch Accuracy: " + accuracy_scalar)
+    }
+
+    learned_model = model
+    learned_emas = emas
+}
+
+predict = function(matrix[double] X, int Hin, int Win, 
+    list[unknown] model, list[unknown] emas) 
+    return(matrix[double] out) {
+    /*
+    * Computes the class probability predictions of a convolutional
+    * net using the "ResNet" architecture.
+    *
+    * The input matrix, X, has N examples, each represented as a 3D
+    * volume unrolled into a single vector.
+    *
+    * Inputs:
+    *  - X: Input data matrix, of shape (N, C*Hin*Win).
+    *
+    * Outputs:
+    *  - probs: Class probabilities, of shape (N, K).
+    */
+    
+    # Predict on test dataset
+    mode = "train"
+    [out, temp_emas, temp_cached_out, temp_cached_means_vars] = resnet::forward(X, Hin, Win, model, mode, emas)
+}
+
+
+eval = function(matrix[double] probs, matrix[double] Y)
+    return(double accuracy) {
+    /*
+    * Evaluates a convolutional net using the "ResNet" architecture.
+    *
+    * The probs matrix contains the class probability predictions
+    * of K classes over N examples.  The targets, Y, have K classes,
+    * and are one-hot encoded.
+    *
+    * Inputs:
+    *  - probs: Class probabilities, of shape (N, K).
+    *  - Y: Target matrix, of shape (N, K).
+    *
+    * Outputs:
+    *  - accuracy: Scalar accuracy, of shape (1).
+    */
+    correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
+    accuracy = mean(correct_pred)
+}
\ No newline at end of file
diff --git a/scripts/nn/layers/lrn.dml b/scripts/nn/layers/lrn.dml
new file mode 100644
index 00000000000..bd1dae3dc45
--- /dev/null
+++ b/scripts/nn/layers/lrn.dml
@@ -0,0 +1,153 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Local Response Normalization (LRN) layer.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win,
+                   int N, double alpha, double beta, double K)
+    return (matrix[double] Y) {
+  /*
+   * Computes the forward pass for a Local Response Normalization
+   * (LRN) layer.  The LRN layer performs a lateral normalization
+   * over channels at each spatial location.
+   *
+   * This is the cross-channel LRN used in AlexNet:
+   * `y_{x,y}^i = x_{x,y}^i / (K + alpha * sum_{j=max(0,i-n/2)}^{min(C-1,i+n/2)} (x_{x,y}^j)^2)^beta`
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels.
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - N: Number of channels to sum over (i.e. size of local region).
+   *  - alpha: Scaling parameter.
+   *  - beta: Exponent parameter.
+   *  - K: Additive constant to avoid divide-by-zero.
+   *
+   * Outputs:
+   *  - Y: Outputs, of shape (N, C*Hin*Win).
+   */
+  N_batch = nrow(X)
+  
+  # Initialize output
+  Y = matrix(0, rows=N_batch, cols=C*Hin*Win)
+  
+  # Reshape for easier manipulation
+  X_reshaped = matrix(X, rows=N_batch, cols=C*Hin*Win)
+  
+  # Compute normalization
+  half_N = as.integer(N / 2)
+  
+  for (i in 1:N_batch) {
+    # Get current sample
+    x = matrix(X_reshaped[i,], rows=C, cols=Hin*Win, byrow=TRUE)
+    y = matrix(0, rows=C, cols=Hin*Win)
+    
+    # For each channel
+    for (c in 1:C) {
+      # Define the local region
+      j_start = max(1, c - half_N)
+      j_end = min(C, c + half_N)
+      
+      # Compute sum of squares in the local region
+      scale = matrix(K, rows=1, cols=Hin*Win)
+      for (j in j_start:j_end) {
+        scale = scale + alpha * (x[j,])^2
+      }
+      
+      # Apply normalization
+      y[c,] = x[c,] / (scale^beta)
+    }
+    
+    # Reshape back and store
+    Y[i,] = matrix(y, rows=1, cols=C*Hin*Win, byrow=TRUE)
+  }
+}
+
+backward = function(matrix[double] dY, matrix[double] X, int C, int Hin, int Win,
+                    int N, double alpha, double beta, double K)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a Local Response Normalization layer.
+   *
+   * Inputs:
+   *  - dY: Gradient wrt Y, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels.
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - N: Number of channels to sum over.
+   *  - alpha: Scaling parameter.
+   *  - beta: Exponent parameter.
+   *  - K: Additive constant.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+   */
+  N_batch = nrow(X)
+  
+  # Initialize gradient
+  dX = matrix(0, rows=N_batch, cols=C*Hin*Win)
+  
+  # Reshape for easier manipulation
+  X_reshaped = matrix(X, rows=N_batch, cols=C*Hin*Win)
+  dY_reshaped = matrix(dY, rows=N_batch, cols=C*Hin*Win)
+  
+  half_N = as.integer(N / 2)
+  
+  for (i in 1:N_batch) {
+    # Get current sample
+    x = matrix(X_reshaped[i,], rows=C, cols=Hin*Win, byrow=TRUE)
+    dy = matrix(dY_reshaped[i,], rows=C, cols=Hin*Win, byrow=TRUE)
+    dx = matrix(0, rows=C, cols=Hin*Win)
+    
+    # First, compute the scale values for all channels
+    scale = matrix(K, rows=C, cols=Hin*Win)
+    for (c in 1:C) {
+      j_start = max(1, c - half_N)
+      j_end = min(C, c + half_N)
+      for (j in j_start:j_end) {
+        scale[c,] = scale[c,] + alpha * (x[j,])^2
+      }
+    }
+    
+    # Compute gradients
+    for (c in 1:C) {
+      # Channels that this channel influences
+      k_start = max(1, c - half_N)
+      k_end = min(C, c + half_N)
+      
+      for (k in k_start:k_end) {
+        if (k == c) {
+          # Gradient from own normalization
+          dx[c,] = dx[c,] + dy[k,] * scale[k,]^(-beta)
+        }
+        # Gradient from normalizing other channels
+        dx[c,] = dx[c,] - 2 * alpha * beta * dy[k,] * x[k,] * x[c,] * scale[k,]^(-beta-1)
+      }
+    }
+    
+    # Reshape back and store
+    dX[i,] = matrix(dx, rows=1, cols=C*Hin*Win, byrow=TRUE)
+  }
+} 
\ No newline at end of file
diff --git a/scripts/nn/layers/softmax_cross_entropy_loss.dml b/scripts/nn/layers/softmax_cross_entropy_loss.dml
new file mode 100644
index 00000000000..8952d92d2cc
--- /dev/null
+++ b/scripts/nn/layers/softmax_cross_entropy_loss.dml
@@ -0,0 +1,73 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Softmax Cross-Entropy loss function.
+ * This combines the Softmax activation with the Cross-Entropy loss.
+ */
+
+forward = function(matrix[double] logits, matrix[double] y)
+    return (double loss) {
+  /*
+   * Computes the forward pass for a Softmax Cross-Entropy loss function.
+   *
+   * Inputs:
+   * - logits: Raw scores from the network, of shape (N, K).
+   * - y: Target one-hot encoded labels, of shape (N, K).
+   *
+   * Outputs:
+   * - loss: Average loss.
+   */
+  N = nrow(y)
+  
+  # Numerically stable Softmax
+  # Subtracting the max logit from each row prevents overflow when taking exp()
+  shifted_logits = logits - rowMaxs(logits)
+  probs = exp(shifted_logits) / rowSums(exp(shifted_logits))
+
+  # Cross-entropy loss calculation
+  # Adding a small epsilon for numerical stability to avoid log(0)
+  eps = 1e-9
+  loss = -sum(y * log(probs + eps)) / N
+}
+
+backward = function(matrix[double] logits, matrix[double] y)
+    return (matrix[double] d_logits) {
+  /*
+   * Computes the backward pass for a Softmax Cross-Entropy loss function.
+   * The gradient of the combined Softmax and Cross-Entropy is remarkably simple.
+   *
+   * Inputs:
+   * - logits: Raw scores from the network, of shape (N, K).
+   * - y: Target one-hot encoded labels, of shape (N, K).
+   *
+   * Outputs:
+   * - d_logits: Gradient with respect to the input logits, of shape (N, K).
+   */
+  N = nrow(y)
+  
+  # Recompute the probabilities (softmax)
+  shifted_logits = logits - rowMaxs(logits)
+  probs = exp(shifted_logits) / rowSums(exp(shifted_logits))
+
+  # The gradient is simply (probabilities - true_labels)
+  d_logits = (probs - y) / N
+}
\ No newline at end of file
diff --git a/scripts/nn/networks/alexnet.dml b/scripts/nn/networks/alexnet.dml
new file mode 100644
index 00000000000..829ca633b40
--- /dev/null
+++ b/scripts/nn/networks/alexnet.dml
@@ -0,0 +1,1162 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * AlexNet: Deep Convolutional Neural Network
+ * 
+ * Reference: "ImageNet Classification with Deep Convolutional Neural Networks"
+ * by Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton (2012)
+ * 
+ * This implementation provides a flexible, modular AlexNet architecture
+ * suitable for various computer vision tasks.
+ */
+
+# Import layer implementations
+source("nn/layers/affine.dml") as affine
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+
+# Import optimizers
+source("nn/optim/sgd.dml") as sgd
+source("nn/optim/sgd_momentum.dml") as sgd_momentum
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+source("nn/optim/adam.dml") as adam
+source("nn/optim/adagrad.dml") as adagrad
+source("nn/optim/rmsprop.dml") as rmsprop
+source("nn/optim/lars.dml") as lars
+
+# Import batch normalization
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
+
+/*
+ * Forward and backward pass.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win,
+                   list[unknown] model, string mode, double dropout_prob)
+    return (matrix[double] out, list[unknown] cached_out) {
+  /*
+   * Forward pass of the AlexNet model.
+   *
+   * Architecture:
+   * - Conv1: 96 filters, 11x11, stride 4, pad 0 -> ReLU -> MaxPool 3x3, stride 2
+   * - Conv2: 256 filters, 5x5, stride 1, pad 2 -> ReLU -> MaxPool 3x3, stride 2  
+   * - Conv3: 384 filters, 3x3, stride 1, pad 1 -> ReLU
+   * - Conv4: 384 filters, 3x3, stride 1, pad 1 -> ReLU
+   * - Conv5: 256 filters, 3x3, stride 1, pad 1 -> ReLU -> MaxPool 3x3, stride 2
+   * - FC1: 4096 neurons -> ReLU -> Dropout
+   * - FC2: 4096 neurons -> ReLU -> Dropout
+   * - FC3: num_classes neurons -> Softmax
+   *
+   * Inputs:
+   * - X: Input data, of shape (N, C*Hin*Win).
+   * - C: Number of input channels (3 for RGB).
+   * - Hin: Input height (256 for ImageNet).
+   * - Win: Input width (256 for ImageNet).
+   * - model: List of model parameters with the following structure:
+   *   -> 1: Conv1 weights, of shape (96, C*11*11)
+   *   -> 2: Conv1 bias, of shape (96, 1)
+   *   -> 3: Conv2 weights, of shape (256, 96*5*5)
+   *   -> 4: Conv2 bias, of shape (256, 1)
+   *   -> 5: Conv3 weights, of shape (384, 256*3*3)
+   *   -> 6: Conv3 bias, of shape (384, 1)
+   *   -> 7: Conv4 weights, of shape (384, 384*3*3)
+   *   -> 8: Conv4 bias, of shape (384, 1)
+   *   -> 9: Conv5 weights, of shape (256, 384*3*3)
+   *   -> 10: Conv5 bias, of shape (256, 1)
+   *   -> 11: FC1 weights, of shape (fc_input_size, 4096)
+   *   -> 12: FC1 bias, of shape (1, 4096)
+   *   -> 13: FC2 weights, of shape (4096, 4096)
+   *   -> 14: FC2 bias, of shape (1, 4096)
+   *   -> 15: FC3 weights, of shape (4096, num_classes)
+   *   -> 16: FC3 bias, of shape (1, num_classes)
+   * - mode: 'train' or 'test' for dropout behavior
+   * - dropout_prob: Dropout probability (typically 0.5)
+   *
+   * Outputs:
+   * - out: Output predictions, of shape (N, num_classes)
+   * - cached_out: Cached intermediate outputs for backward pass
+   */
+  
+  # Extract model parameters
+  W1 = as.matrix(model[1]); b1 = as.matrix(model[2])
+  W2 = as.matrix(model[3]); b2 = as.matrix(model[4])
+  W3 = as.matrix(model[5]); b3 = as.matrix(model[6])
+  W4 = as.matrix(model[7]); b4 = as.matrix(model[8])
+  W5 = as.matrix(model[9]); b5 = as.matrix(model[10])
+  W6 = as.matrix(model[11]); b6 = as.matrix(model[12])
+  W7 = as.matrix(model[13]); b7 = as.matrix(model[14])
+  W8 = as.matrix(model[15]); b8 = as.matrix(model[16])
+
+  # Forward pass
+  # Conv1 -> ReLU -> MaxPool1
+  [outc1, Houtc1, Woutc1] = conv2d::forward(X, W1, b1, C, Hin, Win, 11, 11, 4, 4, 2, 2)
+  outr1 = relu::forward(outc1)
+  [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, 96, Houtc1, Woutc1, 3, 3, 2, 2, 0, 0)
+  
+  # Conv2 -> ReLU -> MaxPool2
+  [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, 96, Houtp1, Woutp1, 5, 5, 1, 1, 2, 2)
+  outr2 = relu::forward(outc2)
+  [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, 256, Houtc2, Woutc2, 3, 3, 2, 2, 0, 0)
+  
+  # Conv3 -> ReLU
+  [outc3, Houtc3, Woutc3] = conv2d::forward(outp2, W3, b3, 256, Houtp2, Woutp2, 3, 3, 1, 1, 1, 1)
+  outr3 = relu::forward(outc3)
+  
+  # Conv4 -> ReLU
+  [outc4, Houtc4, Woutc4] = conv2d::forward(outr3, W4, b4, 384, Houtc3, Woutc3, 3, 3, 1, 1, 1, 1)
+  outr4 = relu::forward(outc4)
+  
+  # Conv5 -> ReLU -> MaxPool3
+  [outc5, Houtc5, Woutc5] = conv2d::forward(outr4, W5, b5, 384, Houtc4, Woutc4, 3, 3, 1, 1, 1, 1)
+  outr5 = relu::forward(outc5)
+  [outp5, Houtp5, Woutp5] = max_pool2d::forward(outr5, 256, Houtc5, Woutc5, 3, 3, 2, 2, 0, 0)
+  
+  # FC1 -> ReLU -> Dropout
+  outa6 = affine::forward(outp5, W6, b6)
+  outr6 = relu::forward(outa6)
+  if (mode == "train") {
+    [outd6, maskd6] = dropout::forward(outr6, dropout_prob, -1)
+  } else {
+    outd6 = outr6
+    maskd6 = matrix(1, rows=nrow(outr6), cols=ncol(outr6))
+  }
+  
+  # FC2 -> ReLU -> Dropout
+  outa7 = affine::forward(outd6, W7, b7)
+  outr7 = relu::forward(outa7)
+  if (mode == "train") {
+    [outd7, maskd7] = dropout::forward(outr7, dropout_prob, -1)
+  } else {
+    outd7 = outr7
+    maskd7 = matrix(1, rows=nrow(outr7), cols=ncol(outr7))
+  }
+  
+  # FC3 -> Softmax
+  outa8 = affine::forward(outd7, W8, b8)
+  out = softmax::forward(outa8)
+
+  # Cache intermediate outputs for backward pass
+  cached_out = list(X, outc1, Houtc1, Woutc1, outr1, outp1, Houtp1, Woutp1,
+                    outc2, Houtc2, Woutc2, outr2, outp2, Houtp2, Woutp2,
+                    outc3, Houtc3, Woutc3, outr3, outc4, Houtc4, Woutc4, outr4,
+                    outc5, Houtc5, Woutc5, outr5, outp5, Houtp5, Woutp5,
+                    outa6, outr6, outd6, maskd6, outa7, outr7, outd7, maskd7, outa8)
+}
+
+backward = function(matrix[double] dOut, list[unknown] cached_out,
+                    list[unknown] model, int C, int Hin, int Win, double dropout_prob)
+    return (matrix[double] dX, list[unknown] gradients) {
+  /*
+   * Backward pass of the AlexNet model.
+   *
+   * Inputs:
+   * - dOut: Gradient w.r.t. output, of shape (N, num_classes)
+   * - cached_out: Cached outputs from forward pass
+   * - model: Model parameters (same structure as forward pass)
+   * - C, Hin, Win: Input dimensions
+   * - dropout_prob: Dropout probability used in forward pass
+   *
+   * Outputs:
+   * - dX: Gradient w.r.t. input, of shape (N, C*Hin*Win)
+   * - gradients: List of gradients for all parameters (same structure as model)
+   */
+  
+  # Extract model parameters
+  W1 = as.matrix(model[1]); b1 = as.matrix(model[2])
+  W2 = as.matrix(model[3]); b2 = as.matrix(model[4])
+  W3 = as.matrix(model[5]); b3 = as.matrix(model[6])
+  W4 = as.matrix(model[7]); b4 = as.matrix(model[8])
+  W5 = as.matrix(model[9]); b5 = as.matrix(model[10])
+  W6 = as.matrix(model[11]); b6 = as.matrix(model[12])
+  W7 = as.matrix(model[13]); b7 = as.matrix(model[14])
+  W8 = as.matrix(model[15]); b8 = as.matrix(model[16])
+
+  # Extract cached outputs
+  X = as.matrix(cached_out[1])
+  outc1 = as.matrix(cached_out[2]); Houtc1 = as.scalar(cached_out[3]); Woutc1 = as.scalar(cached_out[4])
+  outr1 = as.matrix(cached_out[5])
+  outp1 = as.matrix(cached_out[6]); Houtp1 = as.scalar(cached_out[7]); Woutp1 = as.scalar(cached_out[8])
+  outc2 = as.matrix(cached_out[9]); Houtc2 = as.scalar(cached_out[10]); Woutc2 = as.scalar(cached_out[11])
+  outr2 = as.matrix(cached_out[12])
+  outp2 = as.matrix(cached_out[13]); Houtp2 = as.scalar(cached_out[14]); Woutp2 = as.scalar(cached_out[15])
+  outc3 = as.matrix(cached_out[16]); Houtc3 = as.scalar(cached_out[17]); Woutc3 = as.scalar(cached_out[18])
+  outr3 = as.matrix(cached_out[19])
+  outc4 = as.matrix(cached_out[20]); Houtc4 = as.scalar(cached_out[21]); Woutc4 = as.scalar(cached_out[22])
+  outr4 = as.matrix(cached_out[23])
+  outc5 = as.matrix(cached_out[24]); Houtc5 = as.scalar(cached_out[25]); Woutc5 = as.scalar(cached_out[26])
+  outr5 = as.matrix(cached_out[27])
+  outp5 = as.matrix(cached_out[28]); Houtp5 = as.scalar(cached_out[29]); Woutp5 = as.scalar(cached_out[30])
+  outa6 = as.matrix(cached_out[31]); outr6 = as.matrix(cached_out[32])
+  outd6 = as.matrix(cached_out[33]); maskd6 = as.matrix(cached_out[34])
+  outa7 = as.matrix(cached_out[35]); outr7 = as.matrix(cached_out[36])
+  outd7 = as.matrix(cached_out[37]); maskd7 = as.matrix(cached_out[38])
+  outa8 = as.matrix(cached_out[39])
+
+  # Backward pass
+  # FC3
+  douta8 = softmax::backward(dOut, outa8)
+  [doutd7, dW8, db8] = affine::backward(douta8, outd7, W8, b8)
+  
+  # FC2
+  doutr7 = dropout::backward(doutd7, outr7, dropout_prob, maskd7)
+  douta7 = relu::backward(doutr7, outa7)
+  [doutd6, dW7, db7] = affine::backward(douta7, outd6, W7, b7)
+  
+  # FC1
+  doutr6 = dropout::backward(doutd6, outr6, dropout_prob, maskd6)
+  douta6 = relu::backward(doutr6, outa6)
+  [doutp5, dW6, db6] = affine::backward(douta6, outp5, W6, b6)
+  
+  # Conv5
+  doutr5 = max_pool2d::backward(doutp5, Houtp5, Woutp5, outr5, 256, Houtc5, Woutc5, 3, 3, 2, 2, 0, 0)
+  doutc5 = relu::backward(doutr5, outc5)
+  [doutr4, dW5, db5] = conv2d::backward(doutc5, Houtc5, Woutc5, outr4, W5, b5, 384, Houtc4, Woutc4, 3, 3, 1, 1, 1, 1)
+  
+  # Conv4
+  doutc4 = relu::backward(doutr4, outc4)
+  [doutr3, dW4, db4] = conv2d::backward(doutc4, Houtc4, Woutc4, outr3, W4, b4, 384, Houtc3, Woutc3, 3, 3, 1, 1, 1, 1)
+  
+  # Conv3
+  doutc3 = relu::backward(doutr3, outc3)
+  [doutp2, dW3, db3] = conv2d::backward(doutc3, Houtc3, Woutc3, outp2, W3, b3, 256, Houtp2, Woutp2, 3, 3, 1, 1, 1, 1)
+  
+  # Conv2
+  doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, 256, Houtc2, Woutc2, 3, 3, 2, 2, 0, 0)
+  doutc2 = relu::backward(doutr2, outc2)
+  [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, 96, Houtp1, Woutp1, 5, 5, 1, 1, 2, 2)
+  
+  # Conv1
+  doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, 96, Houtc1, Woutc1, 3, 3, 2, 2, 0, 0)
+  doutc1 = relu::backward(doutr1, outc1)
+  [dX, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X, W1, b1, C, Hin, Win, 11, 11, 4, 4, 2, 2)
+
+  # Package gradients
+  gradients = list(dW1, db1, dW2, db2, dW3, db3, dW4, db4, dW5, db5, dW6, db6, dW7, db7, dW8, db8)
+}
+
+/*
+ * Helper function to calculate output dimensions after convolutions and pooling
+ */
+
+calculate_conv_output_size = function(int Hin, int Win)
+    return (int fc_input_size) {
+  /*
+   * Calculate the input size for the first fully connected layer
+   * based on the actual input dimensions after all conv and pooling layers.
+   *
+   * Current AlexNet architecture:
+   * 1. Conv1: 96 filters, 11x11, stride 4, pad 2
+   * 2. MaxPool1: 3x3, stride 2, pad 0  
+   * 3. Conv2: 256 filters, 5x5, stride 1, pad 2
+   * 4. MaxPool2: 3x3, stride 2, pad 0
+   * 5. Conv3: 384 filters, 3x3, stride 1, pad 1
+   * 6. Conv4: 384 filters, 3x3, stride 1, pad 1
+   * 7. Conv5: 256 filters, 3x3, stride 1, pad 1
+   * 8. MaxPool3: 3x3, stride 2, pad 0
+   */
+  
+  # Start with input dimensions
+  H = as.double(Hin)
+  W = as.double(Win)
+  
+  print("Input dimensions: " + Hin + "x" + Win)
+  
+  # Conv1: 11x11, stride 4, pad 2
+  H = floor((H - 11 + 4) / 4) + 1  # pad 2 on each side = 4 total
+  W = floor((W - 11 + 4) / 4) + 1
+  print("After Conv1: " + as.integer(H) + "x" + as.integer(W))
+  
+  # MaxPool1: 3x3, stride 2, pad 0
+  H = floor((H - 3 + 0) / 2) + 1
+  W = floor((W - 3 + 0) / 2) + 1
+  print("After MaxPool1: " + as.integer(H) + "x" + as.integer(W))
+  
+  # Conv2: 5x5, stride 1, pad 2
+  H = floor((H - 5 + 4) / 1) + 1
+  W = floor((W - 5 + 4) / 1) + 1
+  print("After Conv2: " + as.integer(H) + "x" + as.integer(W))
+  
+  # MaxPool2: 3x3, stride 2, pad 0
+  H = floor((H - 3 + 0) / 2) + 1
+  W = floor((W - 3 + 0) / 2) + 1
+  print("After MaxPool2: " + as.integer(H) + "x" + as.integer(W))
+  
+  # Conv3: 3x3, stride 1, pad 1
+  H = floor((H - 3 + 2) / 1) + 1
+  W = floor((W - 3 + 2) / 1) + 1
+  print("After Conv3: " + as.integer(H) + "x" + as.integer(W))
+  
+  # Conv4: 3x3, stride 1, pad 1  
+  H = floor((H - 3 + 2) / 1) + 1
+  W = floor((W - 3 + 2) / 1) + 1
+  print("After Conv4: " + as.integer(H) + "x" + as.integer(W))
+  
+  # Conv5: 3x3, stride 1, pad 1
+  H = floor((H - 3 + 2) / 1) + 1
+  W = floor((W - 3 + 2) / 1) + 1
+  print("After Conv5: " + as.integer(H) + "x" + as.integer(W))
+  
+  # MaxPool3: 3x3, stride 2, pad 0
+  H = floor((H - 3 + 0) / 2) + 1
+  W = floor((W - 3 + 0) / 2) + 1
+  print("After MaxPool3: " + as.integer(H) + "x" + as.integer(W))
+  
+  # Handle edge case where dimensions become 0 or negative
+  if (H <= 0 | W <= 0) {
+    print("ERROR: Spatial dimensions became 0 or negative!")
+    print("Input size " + Hin + "x" + Win + " is too small for AlexNet architecture.")
+    print("Consider using larger input images or adjusting the architecture.")
+    stop("Invalid spatial dimensions")
+  }
+  
+  # Final dimensions: 256 channels with H x W spatial size
+  fc_input_size = as.integer(256 * H * W)
+  
+  print("Final FC input size: " + fc_input_size + " (spatial: " + as.integer(H) + "x" + as.integer(W) + " x 256 channels)")
+}
+
+/*
+ * Model initialization.
+ */
+
+init = function(int C, int Hin, int Win, int num_classes, int seed)
+    return (list[unknown] model) {
+  /*
+   * Initialize AlexNet model parameters.
+   *
+   * Inputs:
+   * - C: Number of input channels (3 for RGB)
+   * - Hin: Input height (supports various sizes, e.g., 224, 256)
+   * - Win: Input width (supports various sizes, e.g., 224, 256)
+   * - num_classes: Number of output classes
+   * - seed: Random seed for initialization
+   *
+   * Outputs:
+   * - model: List of initialized model parameters
+   */
+   
+  # Calculate fully connected input size based on actual input dimensions
+  fc_input_size = calculate_conv_output_size(Hin, Win)
+  
+  # --- Explicit AlexNet weight init for Conv layers ---
+  # All weights ∼ N(0,0.01), all biases = 0 (following original AlexNet paper)
+  
+  # Conv1: 96 11x11 filters
+  W1 = rand(rows=96, cols=C * 11 * 11, pdf="normal", seed=seed) * 0.01      # 96 × (C·11·11)
+  b1 = matrix(0.0, rows=96, cols=1)
+  
+  # Conv2: 256 5x5 filters
+  W2 = rand(rows=256, cols=96 * 5 * 5, pdf="normal", seed=seed) * 0.01      # 256 × (96·5·5)
+  b2 = matrix(0.0, rows=256, cols=1)
+  
+  # Conv3: 384 3x3 filters
+  W3 = rand(rows=384, cols=256 * 3 * 3, pdf="normal", seed=seed) * 0.01     # 384 × (256·3·3)
+  b3 = matrix(0.0, rows=384, cols=1)
+  
+  # Conv4: 384 3x3 filters
+  W4 = rand(rows=384, cols=384 * 3 * 3, pdf="normal", seed=seed) * 0.01     # 384 × (384·3·3)
+  b4 = matrix(0.0, rows=384, cols=1)
+  
+  # Conv5: 256 3x3 filters
+  W5 = rand(rows=256, cols=384 * 3 * 3, pdf="normal", seed=seed) * 0.01     # 256 × (384·3·3)
+  b5 = matrix(0.0, rows=256, cols=1)
+
+  # --- Explicit AlexNet weight init for FC layers ---
+  # FC1: fc_input_size → 4096
+  W6 = rand(rows=fc_input_size, cols=4096, pdf="normal", seed=seed) * 0.01
+  b6 = matrix(0.0, rows=1, cols=4096)
+  
+  # FC2: 4096 → 4096
+  W7 = rand(rows=4096, cols=4096, pdf="normal", seed=seed) * 0.01
+  b7 = matrix(0.0, rows=1, cols=4096)
+  
+  # FC3: 4096 → num_classes (output layer)
+  W8 = rand(rows=4096, cols=num_classes, pdf="normal", seed=seed) * 0.01
+  b8 = matrix(0.0, rows=1, cols=num_classes)
+  
+  # Scale final layer for better convergence (as mentioned in your image)
+  W8 = W8 / sqrt(2)
+
+  # Package model
+  model = list(W1, b1, W2, b2, W3, b3, W4, b4, W5, b5, W6, b6, W7, b7, W8, b8)
+}
+
+/*
+ * Utility functions for optimizers.
+ */
+
+update_params_with_sgd = function(list[unknown] model, list[unknown] gradients, double lr)
+    return (list[unknown] model_upd) {
+  /*
+   * Update model parameters with SGD optimizer.
+   */
+  model_upd = list()
+  for (i in 1:length(model)) {
+    param = as.matrix(model[i])
+    grad = as.matrix(gradients[i])
+    param_upd = sgd::update(param, grad, lr)
+    model_upd = append(model_upd, param_upd)
+  }
+}
+
+init_sgd_momentum_optim_params = function(list[unknown] model)
+    return (list[unknown] optim_state) {
+  /*
+   * Initialize SGD momentum optimizer state.
+   */
+  optim_state = list()
+  for (i in 1:length(model)) {
+    param = as.matrix(model[i])
+    momentum_state = sgd_momentum::init(param)
+    optim_state = append(optim_state, momentum_state)
+  }
+}
+
+update_params_with_sgd_momentum = function(list[unknown] model, list[unknown] gradients,
+                                           double lr, double mu, list[unknown] optim_state)
+    return (list[unknown] model_upd, list[unknown] optim_state_upd) {
+  /*
+   * Update model parameters with SGD momentum optimizer.
+   */
+  model_upd = list()
+  optim_state_upd = list()
+  for (i in 1:length(model)) {
+    param = as.matrix(model[i])
+    grad = as.matrix(gradients[i])
+    momentum_state = as.matrix(optim_state[i])
+    [param_upd, momentum_state_upd] = sgd_momentum::update(param, grad, lr, mu, momentum_state)
+    model_upd = append(model_upd, param_upd)
+    optim_state_upd = append(optim_state_upd, momentum_state_upd)
+  }
+}
+
+init_adam_optim_params = function(list[unknown] model)
+    return (list[unknown] optim_state) {
+  /*
+   * Initialize Adam optimizer state.
+   */
+  optim_state = list()
+  for (i in 1:length(model)) {
+    param = as.matrix(model[i])
+    [m_state, v_state] = adam::init(param)
+    adam_state = list(m_state, v_state)
+    optim_state = append(optim_state, adam_state)
+  }
+}
+
+update_params_with_adam = function(list[unknown] model, list[unknown] gradients,
+                                   double lr, double beta1, double beta2, double epsilon, int t,
+                                   list[unknown] optim_state)
+    return (list[unknown] model_upd, list[unknown] optim_state_upd) {
+  /*
+   * Update model parameters with Adam optimizer.
+   */
+  model_upd = list()
+  optim_state_upd = list()
+  for (i in 1:length(model)) {
+    param = as.matrix(model[i])
+    grad = as.matrix(gradients[i])
+    adam_state = as.list(optim_state[i])
+    m_state = as.matrix(adam_state[1])
+    v_state = as.matrix(adam_state[2])
+    [param_upd, m_state_upd, v_state_upd] = adam::update(param, grad, lr, beta1, beta2, epsilon, t, m_state, v_state)
+    adam_state_upd = list(m_state_upd, v_state_upd)
+    model_upd = append(model_upd, param_upd)
+    optim_state_upd = append(optim_state_upd, adam_state_upd)
+  }
+}
+
+init_lars_optim_params = function(list[unknown] model)
+    return (list[unknown] optim_state) {
+  /*
+   * Initialize LARS optimizer state.
+   */
+  optim_state = list()
+  for (i in 1:length(model)) {
+    param = as.matrix(model[i])
+    momentum_state = lars::init(param)
+    optim_state = append(optim_state, momentum_state)
+  }
+}
+
+update_params_with_lars = function(list[unknown] model, list[unknown] gradients,
+                                   double lr, double mu, double weight_decay, double trust_coeff,
+                                   list[unknown] optim_state)
+    return (list[unknown] model_upd, list[unknown] optim_state_upd) {
+  /*
+   * Update model parameters with LARS optimizer.
+   *
+   * LARS (Layer-wise Adaptive Rate Scaling) applies different learning
+   * rates to different layers based on the ratio of parameter norm
+   * to gradient norm, enabling stable large-batch training.
+   */
+  model_upd = list()
+  optim_state_upd = list()
+  for (i in 1:length(model)) {
+    param = as.matrix(model[i])
+    grad = as.matrix(gradients[i])
+    momentum_state = as.matrix(optim_state[i])
+    [param_upd, momentum_state_upd] = lars::update(param, grad, lr, mu, momentum_state, weight_decay, trust_coeff)
+    model_upd = append(model_upd, param_upd)
+    optim_state_upd = append(optim_state_upd, momentum_state_upd)
+  }
+}
+
+/*
+ * Training and evaluation utilities.
+ */
+
+compute_loss = function(matrix[double] predictions, matrix[double] targets, list[unknown] model, double weight_decay)
+    return (double loss) {
+  /*
+   * Compute cross-entropy loss with L2 regularization.
+   */
+  data_loss = cross_entropy_loss::forward(predictions, targets)
+  reg_loss = 0
+  for (i in seq(1, length(model), 2)) {  # Only weights, skip biases
+    W = as.matrix(model[i])
+    reg_loss = reg_loss + l2_reg::forward(W, 1)
+  }
+  loss = data_loss + weight_decay * reg_loss
+}
+
+compute_accuracy = function(matrix[double] predictions, matrix[double] targets)
+    return (double accuracy) {
+  /*
+   * Compute classification accuracy.
+   */
+  pred_labels = rowIndexMax(predictions)
+  true_labels = rowIndexMax(targets)
+  accuracy = mean(pred_labels == true_labels)
+}
+
+evaluate = function(matrix[double] X, matrix[double] Y, int C, int Hin, int Win,
+                    list[unknown] model, int batch_size)
+    return (double loss, double accuracy) {
+  /*
+   * Evaluate model on a dataset.
+   */
+  N = nrow(X)
+  total_loss = 0
+  total_acc = 0
+  num_batches = ceil(N / batch_size)
+  
+  for (i in 1:num_batches) {
+    beg = ((i-1) * batch_size) %% N + 1
+    end = min(N, beg + batch_size - 1)
+    X_batch = X[beg:end,]
+    Y_batch = Y[beg:end,]
+    
+    [predictions, cached_out] = forward(X_batch, C, Hin, Win, model, "test", 0.0)
+    batch_loss = compute_loss(predictions=predictions, targets=Y_batch, model=model, weight_decay=0.0)
+    batch_acc = compute_accuracy(predictions=predictions, targets=Y_batch)
+    
+    total_loss = total_loss + batch_loss
+    total_acc = total_acc + batch_acc
+  }
+  
+  loss = total_loss / num_batches
+  accuracy = total_acc / num_batches
+}
+
+/*
+ * AlexNet-BN variant initialization (with Batch Normalization).
+ */
+
+init_with_bn = function(int C, int Hin, int Win, int num_classes, int seed)
+    return (list[unknown] model, list[unknown] emas) {
+  /*
+   * Initialize AlexNet-BN model parameters (with Batch Normalization).
+   * 
+   * This variant adds batch normalization after each convolutional layer,
+   * as described in the LARS paper for improved large-batch training.
+   *
+   * Inputs:
+   * - C: Number of input channels (3 for RGB)
+   * - Hin: Input height (supports various sizes, e.g., 64, 224)
+   * - Win: Input width (supports various sizes, e.g., 64, 224)
+   * - num_classes: Number of output classes
+   * - seed: Random seed for initialization
+   *
+   * Outputs:
+   * - model: List of model parameters including BN parameters
+   * - emas: List of exponential moving averages for BN layers
+   */
+  
+  # Calculate fully connected input size based on actual input dimensions
+  fc_input_size = calculate_conv_output_size(Hin, Win)
+  
+  # --- Explicit AlexNet weight init for Conv layers ---
+  # All weights ∼ N(0,0.01), all biases = 0 (following original AlexNet paper)
+  
+  # Conv1: 96 11x11 filters
+  W1 = rand(rows=96, cols=C * 11 * 11, pdf="normal", seed=seed) * 0.01      # 96 × (C·11·11)
+  b1 = matrix(0.0, rows=96, cols=1)
+  
+  # Conv2: 256 5x5 filters
+  W2 = rand(rows=256, cols=96 * 5 * 5, pdf="normal", seed=seed) * 0.01      # 256 × (96·5·5)
+  b2 = matrix(0.0, rows=256, cols=1)
+  
+  # Conv3: 384 3x3 filters
+  W3 = rand(rows=384, cols=256 * 3 * 3, pdf="normal", seed=seed) * 0.01     # 384 × (256·3·3)
+  b3 = matrix(0.0, rows=384, cols=1)
+  
+  # Conv4: 384 3x3 filters
+  W4 = rand(rows=384, cols=384 * 3 * 3, pdf="normal", seed=seed) * 0.01     # 384 × (384·3·3)
+  b4 = matrix(0.0, rows=384, cols=1)
+  
+  # Conv5: 256 3x3 filters
+  W5 = rand(rows=256, cols=384 * 3 * 3, pdf="normal", seed=seed) * 0.01     # 256 × (384·3·3)
+  b5 = matrix(0.0, rows=256, cols=1)
+
+  # --- Initialize batch normalization parameters for each conv layer ---
+  [gamma1, beta1, ema_mean1, ema_var1] = batch_norm2d::init(96)
+  [gamma2, beta2, ema_mean2, ema_var2] = batch_norm2d::init(256)
+  [gamma3, beta3, ema_mean3, ema_var3] = batch_norm2d::init(384)
+  [gamma4, beta4, ema_mean4, ema_var4] = batch_norm2d::init(384)
+  [gamma5, beta5, ema_mean5, ema_var5] = batch_norm2d::init(256)
+  
+  # --- Explicit AlexNet weight init for FC layers ---
+  # FC1: fc_input_size → 4096
+  W6 = rand(rows=fc_input_size, cols=4096, pdf="normal", seed=seed) * 0.01
+  b6 = matrix(0.0, rows=1, cols=4096)
+  
+  # FC2: 4096 → 4096
+  W7 = rand(rows=4096, cols=4096, pdf="normal", seed=seed) * 0.01
+  b7 = matrix(0.0, rows=1, cols=4096)
+  
+  # FC3: 4096 → num_classes (output layer)
+  W8 = rand(rows=4096, cols=num_classes, pdf="normal", seed=seed) * 0.01
+  b8 = matrix(0.0, rows=1, cols=num_classes)
+  
+  # Scale final layer for better convergence (as mentioned in your image)
+  W8 = W8 / sqrt(2)
+  
+  # Package model with BN parameters
+  # Order: W, b, gamma, beta, ema_mean, ema_var for each conv layer, then FC layers
+  model = list(W1, b1, gamma1, beta1, ema_mean1, ema_var1,
+               W2, b2, gamma2, beta2, ema_mean2, ema_var2,
+               W3, b3, gamma3, beta3, ema_mean3, ema_var3,
+               W4, b4, gamma4, beta4, ema_mean4, ema_var4,
+               W5, b5, gamma5, beta5, ema_mean5, ema_var5,
+               W6, b6, W7, b7, W8, b8)
+  
+  # Package EMA parameters for easy access
+  emas = list(ema_mean1, ema_var1, ema_mean2, ema_var2, ema_mean3, ema_var3,
+              ema_mean4, ema_var4, ema_mean5, ema_var5)
+}
+
+forward_with_bn = function(matrix[double] X, int C, int Hin, int Win,
+                          list[unknown] model, string mode, double dropout_prob)
+    return (matrix[double] out, list[unknown] cached_out, list[unknown] emas_upd) {
+  /*
+   * Forward pass of the AlexNet-BN model (with Batch Normalization).
+   *
+   * Architecture:
+   * - Conv1 -> BN -> ReLU -> MaxPool
+   * - Conv2 -> BN -> ReLU -> MaxPool
+   * - Conv3 -> BN -> ReLU
+   * - Conv4 -> BN -> ReLU
+   * - Conv5 -> BN -> ReLU -> MaxPool
+   * - FC1 -> ReLU -> Dropout
+   * - FC2 -> ReLU -> Dropout
+   * - FC3 -> Softmax
+   */
+  
+  # Extract model parameters (with BN)
+  W1 = as.matrix(model[1]); b1 = as.matrix(model[2])
+  gamma1 = as.matrix(model[3]); beta1 = as.matrix(model[4])
+  ema_mean1 = as.matrix(model[5]); ema_var1 = as.matrix(model[6])
+  
+  W2 = as.matrix(model[7]); b2 = as.matrix(model[8])
+  gamma2 = as.matrix(model[9]); beta2 = as.matrix(model[10])
+  ema_mean2 = as.matrix(model[11]); ema_var2 = as.matrix(model[12])
+  
+  W3 = as.matrix(model[13]); b3 = as.matrix(model[14])
+  gamma3 = as.matrix(model[15]); beta3 = as.matrix(model[16])
+  ema_mean3 = as.matrix(model[17]); ema_var3 = as.matrix(model[18])
+  
+  W4 = as.matrix(model[19]); b4 = as.matrix(model[20])
+  gamma4 = as.matrix(model[21]); beta4 = as.matrix(model[22])
+  ema_mean4 = as.matrix(model[23]); ema_var4 = as.matrix(model[24])
+  
+  W5 = as.matrix(model[25]); b5 = as.matrix(model[26])
+  gamma5 = as.matrix(model[27]); beta5 = as.matrix(model[28])
+  ema_mean5 = as.matrix(model[29]); ema_var5 = as.matrix(model[30])
+  
+  W6 = as.matrix(model[31]); b6 = as.matrix(model[32])
+  W7 = as.matrix(model[33]); b7 = as.matrix(model[34])
+  W8 = as.matrix(model[35]); b8 = as.matrix(model[36])
+
+  # Forward pass with batch normalization
+  # Conv1 -> BN -> ReLU -> MaxPool
+  [outc1, Houtc1, Woutc1] = conv2d::forward(X, W1, b1, C, Hin, Win, 11, 11, 4, 4, 2, 2)
+  [outbn1, ema_mean1_upd, ema_var1_upd, cache_mean1, cache_inv_var1] = batch_norm2d::forward(outc1, gamma1, beta1, 96, Houtc1, Woutc1, mode, ema_mean1, ema_var1, 0.99, 1e-5)
+  outr1 = relu::forward(outbn1)
+  [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, 96, Houtc1, Woutc1, 3, 3, 2, 2, 0, 0)
+  
+  # Conv2 -> BN -> ReLU -> MaxPool
+  [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, 96, Houtp1, Woutp1, 5, 5, 1, 1, 2, 2)
+  [outbn2, ema_mean2_upd, ema_var2_upd, cache_mean2, cache_inv_var2] = batch_norm2d::forward(outc2, gamma2, beta2, 256, Houtc2, Woutc2, mode, ema_mean2, ema_var2, 0.99, 1e-5)
+  outr2 = relu::forward(outbn2)
+  [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, 256, Houtc2, Woutc2, 3, 3, 2, 2, 0, 0)
+  
+  # Conv3 -> BN -> ReLU
+  [outc3, Houtc3, Woutc3] = conv2d::forward(outp2, W3, b3, 256, Houtp2, Woutp2, 3, 3, 1, 1, 1, 1)
+  [outbn3, ema_mean3_upd, ema_var3_upd, cache_mean3, cache_inv_var3] = batch_norm2d::forward(outc3, gamma3, beta3, 384, Houtc3, Woutc3, mode, ema_mean3, ema_var3, 0.99, 1e-5)
+  outr3 = relu::forward(outbn3)
+  
+  # Conv4 -> BN -> ReLU
+  [outc4, Houtc4, Woutc4] = conv2d::forward(outr3, W4, b4, 384, Houtc3, Woutc3, 3, 3, 1, 1, 1, 1)
+  [outbn4, ema_mean4_upd, ema_var4_upd, cache_mean4, cache_inv_var4] = batch_norm2d::forward(outc4, gamma4, beta4, 384, Houtc4, Woutc4, mode, ema_mean4, ema_var4, 0.99, 1e-5)
+  outr4 = relu::forward(outbn4)
+  
+  # Conv5 -> BN -> ReLU -> MaxPool
+  [outc5, Houtc5, Woutc5] = conv2d::forward(outr4, W5, b5, 384, Houtc4, Woutc4, 3, 3, 1, 1, 1, 1)
+  [outbn5, ema_mean5_upd, ema_var5_upd, cache_mean5, cache_inv_var5] = batch_norm2d::forward(outc5, gamma5, beta5, 256, Houtc5, Woutc5, mode, ema_mean5, ema_var5, 0.99, 1e-5)
+  outr5 = relu::forward(outbn5)
+  [outp5, Houtp5, Woutp5] = max_pool2d::forward(outr5, 256, Houtc5, Woutc5, 3, 3, 2, 2, 0, 0)
+  
+  # FC1 -> ReLU -> Dropout
+  outa6 = affine::forward(outp5, W6, b6)
+  outr6 = relu::forward(outa6)
+  if (mode == "train") {
+    [outd6, maskd6] = dropout::forward(outr6, dropout_prob, -1)
+  } else {
+    outd6 = outr6
+    maskd6 = matrix(1, rows=nrow(outr6), cols=ncol(outr6))
+  }
+  
+  # FC2 -> ReLU -> Dropout
+  outa7 = affine::forward(outd6, W7, b7)
+  outr7 = relu::forward(outa7)
+  if (mode == "train") {
+    [outd7, maskd7] = dropout::forward(outr7, dropout_prob, -1)
+  } else {
+    outd7 = outr7
+    maskd7 = matrix(1, rows=nrow(outr7), cols=ncol(outr7))
+  }
+  
+  # FC3 -> Softmax
+  outa8 = affine::forward(outd7, W8, b8)
+  out = softmax::forward(outa8)
+
+  # Cache intermediate outputs for backward pass
+  cached_out = list(X, outc1, Houtc1, Woutc1, outbn1, cache_mean1, cache_inv_var1, outr1, outp1, Houtp1, Woutp1,
+                    outc2, Houtc2, Woutc2, outbn2, cache_mean2, cache_inv_var2, outr2, outp2, Houtp2, Woutp2,
+                    outc3, Houtc3, Woutc3, outbn3, cache_mean3, cache_inv_var3, outr3,
+                    outc4, Houtc4, Woutc4, outbn4, cache_mean4, cache_inv_var4, outr4,
+                    outc5, Houtc5, Woutc5, outbn5, cache_mean5, cache_inv_var5, outr5, outp5, Houtp5, Woutp5,
+                    outa6, outr6, outd6, maskd6, outa7, outr7, outd7, maskd7, outa8)
+  
+  # Updated EMA parameters
+  emas_upd = list(ema_mean1_upd, ema_var1_upd, ema_mean2_upd, ema_var2_upd, ema_mean3_upd, ema_var3_upd,
+                  ema_mean4_upd, ema_var4_upd, ema_mean5_upd, ema_var5_upd)
+}
+
+/*
+ * LARS Training Utilities
+ */
+
+get_lr_with_warmup = function(double base_lr, int epoch, int iter, int total_epochs,
+                              int iters_per_epoch, int batch_size, int base_batch_size,
+                              int warmup_epochs, double decay_power)
+    return (double lr) {
+  /*
+   * Learning rate scheduler with warmup, batch scaling, and polynomial decay.
+   * Implements the LARS paper's learning rate schedule.
+   *
+   * Inputs:
+   * - base_lr: Base learning rate (before scaling)
+   * - epoch, iter: Current epoch and iteration
+   * - total_epochs: Total number of training epochs
+   * - iters_per_epoch: Iterations per epoch
+   * - batch_size: Current batch size
+   * - base_batch_size: Reference batch size for scaling (typically 256)
+   * - warmup_epochs: Number of warmup epochs
+   * - decay_power: Power for polynomial decay (typically 2)
+   *
+   * Outputs:
+   * - lr: Scaled learning rate for current iteration
+   */
+  
+  # Scale base LR by batch size (linear scaling rule)
+  scaled_base_lr = base_lr * (batch_size / base_batch_size)
+  
+  # Calculate total progress
+  total_iters = total_epochs * iters_per_epoch
+  warmup_iters = warmup_epochs * iters_per_epoch
+  current_iter = (epoch - 1) * iters_per_epoch + iter
+  
+  if (current_iter <= warmup_iters) {
+    # Linear warmup from 0 to scaled_base_lr
+    lr = scaled_base_lr * (current_iter / warmup_iters)
+  } else {
+    # Polynomial decay after warmup
+    progress = (current_iter - warmup_iters) / (total_iters - warmup_iters)
+    lr = scaled_base_lr * (1 - progress)^decay_power
+  }
+}
+
+get_lars_hyperparams = function(int batch_size, boolean use_bn)
+    return (double base_lr, int warmup_epochs, int total_epochs) {
+  /*
+   * Get recommended LARS hyperparameters based on batch size.
+   * Based on Table 3 from the LARS paper.
+   *
+   * Inputs:
+   * - batch_size: Training batch size
+   * - use_bn: Whether using batch normalization
+   *
+   * Outputs:
+   * - base_lr: Base learning rate (before batch scaling)
+   * - warmup_epochs: Number of warmup epochs
+   * - total_epochs: Recommended total training epochs
+   */
+  
+  if (use_bn) {
+    # AlexNet-BN (better scaling properties)
+    if (batch_size <= 512) {
+      base_lr = 0.02
+      warmup_epochs = 5
+      total_epochs = 100
+    } else if (batch_size <= 4096) {
+      base_lr = 0.02  # Will be scaled to ~0.32 for 4K batch
+      warmup_epochs = 5
+      total_epochs = 100
+    } else if (batch_size <= 8192) {
+      base_lr = 0.02  # Will be scaled to ~0.64 for 8K batch
+      warmup_epochs = 5
+      total_epochs = 100
+    } else if (batch_size <= 16384) {
+      base_lr = 0.02  # Will be scaled to ~1.28 for 16K batch
+      warmup_epochs = 5
+      total_epochs = 100
+    } else {  # 32K and above
+      base_lr = 0.02  # Will be scaled to ~2.56 for 32K batch
+      warmup_epochs = 5
+      total_epochs = 200  # Need more epochs for very large batch
+    }
+  } else {
+    # Regular AlexNet (limited scaling)
+    if (batch_size <= 512) {
+      base_lr = 0.01
+      warmup_epochs = 2
+      total_epochs = 100
+    } else if (batch_size <= 4096) {
+      base_lr = 0.01  # Will be scaled proportionally
+      warmup_epochs = 2
+      total_epochs = 100
+    } else {
+      # Regular AlexNet doesn't scale well beyond 4K
+      print("Warning: Regular AlexNet (without BN) doesn't scale well beyond batch size 4K")
+      base_lr = 0.01
+      warmup_epochs = 2
+      total_epochs = 100
+    }
+  }
+}
+
+train_with_lars = function(matrix[double] X_train, matrix[double] Y_train,
+                          matrix[double] X_val, matrix[double] Y_val,
+                          int C, int Hin, int Win, int num_classes,
+                          int epochs, int batch_size, double base_lr,
+                          boolean use_bn, int seed)
+    return (list[unknown] model, matrix[double] train_losses, matrix[double] val_accs) {
+  /*
+   * Train AlexNet with LARS optimizer following paper's best practices.
+   *
+   * Inputs:
+   * - X_train, Y_train: Training data and labels
+   * - X_val, Y_val: Validation data and labels
+   * - C, Hin, Win: Input dimensions
+   * - num_classes: Number of output classes
+   * - epochs: Number of training epochs
+   * - batch_size: Training batch size
+   * - base_lr: Base learning rate (before batch scaling)
+   * - use_bn: Whether to use batch normalization (recommended for LARS)
+   * - seed: Random seed for reproducibility
+   *
+   * Outputs:
+   * - model: Trained model parameters
+   * - train_losses: Training losses per epoch
+   * - val_accs: Validation accuracies per epoch
+   */
+  
+  N = nrow(X_train)
+  
+  # Initialize model
+  if (use_bn) {
+    [model, emas] = init_with_bn(C, Hin, Win, num_classes, seed)
+  } else {
+    model = init(C, Hin, Win, num_classes, seed)
+  }
+  
+  # LARS hyperparameters from paper
+  base_batch_size = 256
+  warmup_epochs = ifelse(use_bn, 5, 2)  # 5 for BN, 2 for regular
+  decay_power = 2
+  weight_decay = 0.0005
+  momentum = 0.9
+  trust_coeff = 0.001
+  
+  # Initialize optimizer state
+  optim_state = init_lars_optim_params(model)
+  
+  # Training metrics
+  train_losses = matrix(0, rows=epochs, cols=1)
+  val_accs = matrix(0, rows=epochs, cols=1)
+  
+  # Print training info
+  print("Training AlexNet with LARS optimizer")
+  print("Batch size: " + batch_size + ", Base LR: " + base_lr)
+  print("Scaled LR: " + (base_lr * batch_size / base_batch_size))
+  print("Warmup epochs: " + warmup_epochs + ", Using BN: " + use_bn)
+  print("")
+  
+  iters_per_epoch = ceil(N / batch_size)
+  
+  for (epoch in 1:epochs) {
+    epoch_loss = 0
+    
+    for (iter in 1:iters_per_epoch) {
+      # Get learning rate with warmup and decay
+      lr = get_lr_with_warmup(base_lr, epoch, iter, epochs, iters_per_epoch,
+                              batch_size, base_batch_size, warmup_epochs, decay_power)
+      
+      # Get batch
+      beg = ((iter-1) * batch_size) %% N + 1
+      end = min(N, beg + batch_size - 1)
+      X_batch = X_train[beg:end,]
+      Y_batch = Y_train[beg:end,]
+      
+      # Forward pass
+      if (use_bn) {
+        [predictions, cached_out, emas] = forward_with_bn(X_batch, C, Hin, Win, model, "train", 0.5)
+      } else {
+        [predictions, cached_out] = forward(X_batch, C, Hin, Win, model, "train", 0.5)
+      }
+      
+      # Compute loss
+      loss = compute_loss(predictions, Y_batch, model, weight_decay)
+      epoch_loss = epoch_loss + loss
+      
+      # Backward pass
+      dprobs = cross_entropy_loss::backward(predictions, Y_batch)
+      if (use_bn) {
+        # Note: BN backward pass would need to be implemented separately
+        [dX, gradients] = backward(dprobs, cached_out, model, C, Hin, Win, 0.5)
+      } else {
+        [dX, gradients] = backward(dprobs, cached_out, model, C, Hin, Win, 0.5)
+      }
+      
+      # Add L2 regularization gradients
+      for (i in seq(1, length(gradients), 2)) {  # Only weights
+        if (i <= length(model)) {
+          W = as.matrix(model[i])
+          dW = as.matrix(gradients[i])
+          gradients[i] = dW + weight_decay * l2_reg::backward(W, 1)
+        }
+      }
+      
+      # Update with LARS
+      [model, optim_state] = update_params_with_lars(model, gradients, lr, 
+                                                      momentum, weight_decay, 
+                                                      trust_coeff, optim_state)
+      
+      # Print progress
+      if (iter %% 50 == 0) {
+        print("Epoch " + epoch + "/" + epochs + ", Iter " + iter + "/" + 
+              iters_per_epoch + ", LR: " + lr + ", Loss: " + loss)
+      }
+    }
+    
+    # Epoch metrics
+    train_losses[epoch,1] = epoch_loss / iters_per_epoch
+    
+    # Validation
+    if (use_bn) {
+      [val_loss, val_acc] = evaluate_with_bn(X_val, Y_val, C, Hin, Win, model, batch_size)
+    } else {
+      [val_loss, val_acc] = evaluate(X_val, Y_val, C, Hin, Win, model, batch_size)
+    }
+    val_accs[epoch,1] = val_acc
+    
+    print("Epoch " + epoch + " - Train Loss: " + train_losses[epoch,1] + 
+          ", Val Acc: " + val_acc)
+  }
+}
+
+backward_with_bn = function(matrix[double] dOut, list[unknown] cached_out,
+                            list[unknown] model, int C, int Hin, int Win, double dropout_prob)
+    return (matrix[double] dX, list[unknown] gradients) {
+  /*
+   * Backward pass of the AlexNet-BN model (with Batch Normalization).
+   *
+   * Inputs:
+   * - dOut: Gradient w.r.t. output, of shape (N, num_classes)
+   * - cached_out: Cached outputs from forward pass
+   * - model: Model parameters (same structure as forward pass)
+   * - C, Hin, Win: Input dimensions
+   * - dropout_prob: Dropout probability used in forward pass
+   *
+   * Outputs:
+   * - dX: Gradient w.r.t. input, of shape (N, C*Hin*Win)
+   * - gradients: List of gradients for all parameters (same structure as model)
+   */
+  
+  # Extract model parameters (with BN)
+  W1 = as.matrix(model[1]); b1 = as.matrix(model[2])
+  gamma1 = as.matrix(model[3]); beta1 = as.matrix(model[4])
+  
+  W2 = as.matrix(model[7]); b2 = as.matrix(model[8])
+  gamma2 = as.matrix(model[9]); beta2 = as.matrix(model[10])
+  
+  W3 = as.matrix(model[13]); b3 = as.matrix(model[14])
+  gamma3 = as.matrix(model[15]); beta3 = as.matrix(model[16])
+  
+  W4 = as.matrix(model[19]); b4 = as.matrix(model[20])
+  gamma4 = as.matrix(model[21]); beta4 = as.matrix(model[22])
+  
+  W5 = as.matrix(model[25]); b5 = as.matrix(model[26])
+  gamma5 = as.matrix(model[27]); beta5 = as.matrix(model[28])
+  
+  W6 = as.matrix(model[31]); b6 = as.matrix(model[32])
+  W7 = as.matrix(model[33]); b7 = as.matrix(model[34])
+  W8 = as.matrix(model[35]); b8 = as.matrix(model[36])
+
+  # Extract cached outputs
+  X = as.matrix(cached_out[1])
+  outc1 = as.matrix(cached_out[2]); Houtc1 = as.scalar(cached_out[3]); Woutc1 = as.scalar(cached_out[4])
+  outbn1 = as.matrix(cached_out[5]); cache_mean1 = as.matrix(cached_out[6]); cache_inv_var1 = as.matrix(cached_out[7])
+  outr1 = as.matrix(cached_out[8])
+  outp1 = as.matrix(cached_out[9]); Houtp1 = as.scalar(cached_out[10]); Woutp1 = as.scalar(cached_out[11])
+  
+  outc2 = as.matrix(cached_out[12]); Houtc2 = as.scalar(cached_out[13]); Woutc2 = as.scalar(cached_out[14])
+  outbn2 = as.matrix(cached_out[15]); cache_mean2 = as.matrix(cached_out[16]); cache_inv_var2 = as.matrix(cached_out[17])
+  outr2 = as.matrix(cached_out[18])
+  outp2 = as.matrix(cached_out[19]); Houtp2 = as.scalar(cached_out[20]); Woutp2 = as.scalar(cached_out[21])
+  
+  outc3 = as.matrix(cached_out[22]); Houtc3 = as.scalar(cached_out[23]); Woutc3 = as.scalar(cached_out[24])
+  outbn3 = as.matrix(cached_out[25]); cache_mean3 = as.matrix(cached_out[26]); cache_inv_var3 = as.matrix(cached_out[27])
+  outr3 = as.matrix(cached_out[28])
+  
+  outc4 = as.matrix(cached_out[29]); Houtc4 = as.scalar(cached_out[30]); Woutc4 = as.scalar(cached_out[31])
+  outbn4 = as.matrix(cached_out[32]); cache_mean4 = as.matrix(cached_out[33]); cache_inv_var4 = as.matrix(cached_out[34])
+  outr4 = as.matrix(cached_out[35])
+  
+  outc5 = as.matrix(cached_out[36]); Houtc5 = as.scalar(cached_out[37]); Woutc5 = as.scalar(cached_out[38])
+  outbn5 = as.matrix(cached_out[39]); cache_mean5 = as.matrix(cached_out[40]); cache_inv_var5 = as.matrix(cached_out[41])
+  outr5 = as.matrix(cached_out[42])
+  outp5 = as.matrix(cached_out[43]); Houtp5 = as.scalar(cached_out[44]); Woutp5 = as.scalar(cached_out[45])
+  
+  outa6 = as.matrix(cached_out[46]); outr6 = as.matrix(cached_out[47])
+  outd6 = as.matrix(cached_out[48]); maskd6 = as.matrix(cached_out[49])
+  outa7 = as.matrix(cached_out[50]); outr7 = as.matrix(cached_out[51])
+  outd7 = as.matrix(cached_out[52]); maskd7 = as.matrix(cached_out[53])
+  outa8 = as.matrix(cached_out[54])
+
+  # Backward pass
+  # FC3
+  douta8 = softmax::backward(dOut, outa8)
+  [doutd7, dW8, db8] = affine::backward(douta8, outd7, W8, b8)
+  
+  # FC2
+  doutr7 = dropout::backward(doutd7, outr7, dropout_prob, maskd7)
+  douta7 = relu::backward(doutr7, outa7)
+  [doutd6, dW7, db7] = affine::backward(douta7, outd6, W7, b7)
+  
+  # FC1
+  doutr6 = dropout::backward(doutd6, outr6, dropout_prob, maskd6)
+  douta6 = relu::backward(doutr6, outa6)
+  [doutp5, dW6, db6] = affine::backward(douta6, outp5, W6, b6)
+  
+  # Conv5
+  doutr5 = max_pool2d::backward(doutp5, Houtp5, Woutp5, outr5, 256, Houtc5, Woutc5, 3, 3, 2, 2, 0, 0)
+  doutbn5 = relu::backward(doutr5, outbn5)
+  [doutc5, dgamma5, dbeta5] = batch_norm2d::backward(doutbn5, cache_mean5, cache_inv_var5, outc5, gamma5, 256, Houtc5, Woutc5, 1e-5)
+  [doutr4, dW5, db5] = conv2d::backward(doutc5, Houtc5, Woutc5, outr4, W5, b5, 384, Houtc4, Woutc4, 3, 3, 1, 1, 1, 1)
+  
+  # Conv4
+  doutbn4 = relu::backward(doutr4, outbn4)
+  [doutc4, dgamma4, dbeta4] = batch_norm2d::backward(doutbn4, cache_mean4, cache_inv_var4, outc4, gamma4, 384, Houtc4, Woutc4, 1e-5)
+  [doutr3, dW4, db4] = conv2d::backward(doutc4, Houtc4, Woutc4, outr3, W4, b4, 384, Houtc3, Woutc3, 3, 3, 1, 1, 1, 1)
+  
+  # Conv3
+  doutbn3 = relu::backward(doutr3, outbn3)
+  [doutc3, dgamma3, dbeta3] = batch_norm2d::backward(doutbn3, cache_mean3, cache_inv_var3, outc3, gamma3, 384, Houtc3, Woutc3, 1e-5)
+  [doutp2, dW3, db3] = conv2d::backward(doutc3, Houtc3, Woutc3, outp2, W3, b3, 256, Houtp2, Woutp2, 3, 3, 1, 1, 1, 1)
+  
+  # Conv2
+  doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, 256, Houtc2, Woutc2, 3, 3, 2, 2, 0, 0)
+  doutbn2 = relu::backward(doutr2, outbn2)
+  [doutc2, dgamma2, dbeta2] = batch_norm2d::backward(doutbn2, cache_mean2, cache_inv_var2, outc2, gamma2, 256, Houtc2, Woutc2, 1e-5)
+  [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, 96, Houtp1, Woutp1, 5, 5, 1, 1, 2, 2)
+  
+  # Conv1
+  doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, 96, Houtc1, Woutc1, 3, 3, 2, 2, 0, 0)
+  doutbn1 = relu::backward(doutr1, outbn1)
+  [doutc1, dgamma1, dbeta1] = batch_norm2d::backward(doutbn1, cache_mean1, cache_inv_var1, outc1, gamma1, 96, Houtc1, Woutc1, 1e-5)
+  [dX, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X, W1, b1, C, Hin, Win, 11, 11, 4, 4, 2, 2)
+
+  # Package gradients (with BN parameters)
+  gradients = list(dW1, db1, dgamma1, dbeta1, matrix(0, rows=nrow(dgamma1), cols=ncol(dgamma1)), matrix(0, rows=nrow(dgamma1), cols=ncol(dgamma1)),
+                   dW2, db2, dgamma2, dbeta2, matrix(0, rows=nrow(dgamma2), cols=ncol(dgamma2)), matrix(0, rows=nrow(dgamma2), cols=ncol(dgamma2)),
+                   dW3, db3, dgamma3, dbeta3, matrix(0, rows=nrow(dgamma3), cols=ncol(dgamma3)), matrix(0, rows=nrow(dgamma3), cols=ncol(dgamma3)),
+                   dW4, db4, dgamma4, dbeta4, matrix(0, rows=nrow(dgamma4), cols=ncol(dgamma4)), matrix(0, rows=nrow(dgamma4), cols=ncol(dgamma4)),
+                   dW5, db5, dgamma5, dbeta5, matrix(0, rows=nrow(dgamma5), cols=ncol(dgamma5)), matrix(0, rows=nrow(dgamma5), cols=ncol(dgamma5)),
+                   dW6, db6, dW7, db7, dW8, db8)
+}
+
+evaluate_with_bn = function(matrix[double] X, matrix[double] Y, int C, int Hin, int Win,
+                           list[unknown] model, int batch_size)
+    return (double loss, double accuracy) {
+  /*
+   * Evaluate AlexNet-BN model on a dataset.
+   */
+  N = nrow(X)
+  total_loss = 0
+  total_acc = 0
+  num_batches = ceil(N / batch_size)
+  
+  for (i in 1:num_batches) {
+    beg = ((i-1) * batch_size) %% N + 1
+    end = min(N, beg + batch_size - 1)
+    X_batch = X[beg:end,]
+    Y_batch = Y[beg:end,]
+    
+    [predictions, cached_out, emas] = forward_with_bn(X_batch, C, Hin, Win, model, "test", 0.0)
+    batch_loss = compute_loss(predictions, Y_batch, model, 0.0)
+    batch_acc = compute_accuracy(predictions, Y_batch)
+    
+    total_loss = total_loss + batch_loss
+    total_acc = total_acc + batch_acc
+  }
+  
+  loss = total_loss / num_batches
+  accuracy = total_acc / num_batches
+} 
\ No newline at end of file
diff --git a/scripts/nn/networks/resnet.dml b/scripts/nn/networks/resnet.dml
index 70df93f2448..9f121380f7e 100644
--- a/scripts/nn/networks/resnet.dml
+++ b/scripts/nn/networks/resnet.dml
@@ -19,12 +19,13 @@
 #
 #-------------------------------------------------------------
 
-source("scripts/nn/layers/batch_norm2d_old.dml") as bn2d
-source("scripts/nn/layers/conv2d_builtin.dml") as conv2d
-source("scripts/nn/layers/relu.dml") as relu
-source("scripts/nn/layers/max_pool2d_builtin.dml") as mp2d
-source("scripts/nn/layers/global_avg_pool2d.dml") as ap2d
-source("scripts/nn/layers/affine.dml") as fc
+source("nn/layers/batch_norm2d_old.dml") as bn2d
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/relu.dml") as relu
+source("nn/layers/max_pool2d_builtin.dml") as mp2d
+source("nn/layers/global_avg_pool2d.dml") as ap2d
+source("nn/layers/affine.dml") as fc
+source("nn/layers/softmax.dml") as softmax
 
 conv3x3_forward = function(matrix[double] X, matrix[double] W,
                            int C_in, int C_out, int Hin, int Win,
@@ -863,7 +864,7 @@ resnet_forward = function(matrix[double] X, int Hin, int Win,
     ema_means_vars_upd = list(ema_mean_bn1_upd, ema_var_bn1_upd, emas1_upd, emas2_upd, emas3_upd, emas4_upd)
     cached_out = list(X, Hin, Win, out_conv1, Hout_conv1, Wout_conv1, out_bn1, out_re1, out_mp, Hout_mp, Wout_mp,
         cached_out_l1, cached_out_l2, cached_out_l3, cached_out_l4, out_res, Hout_res, Wout_res, out_ap, Hout_ap,
-        Wout_ap)
+        Wout_ap, out_fc)
     cached_means_vars = list(cached_m, cached_v, cached_mv_l1, cached_mv_l2, cached_mv_l3, cached_mv_l4)
 
     out = out_fc
diff --git a/scripts/nn/networks/resnet101.dml b/scripts/nn/networks/resnet101.dml
index ebcb1d6b976..22a59c99285 100644
--- a/scripts/nn/networks/resnet101.dml
+++ b/scripts/nn/networks/resnet101.dml
@@ -432,3 +432,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model,
         "bottleneck", layer_sizes)
 }
 
+init_lars_optim_params = function(int classes)
+    return(list[unknown] params) {
+    /*
+     * Initializes the state of the LARS optimizer for every
+     * learnable parameter of ResNet 50.
+     *
+     * Inputs:
+     * - classes: Number of network output classes.
+     *
+     * Outputs:
+     * - params: List of state parameters with the same structure
+     *     as weights of the forward and backward pass. It can be
+     *     directly passed to the update parameter function.
+     */
+    layer_sizes = list(3, 4, 23, 3)
+    params = util::init_optim("lars", classes, "bottleneck", layer_sizes)
+}
+
+update_params_with_lars = function(list[unknown] model, list[unknown] gradients,
+                                   double lr, double mu, double weight_decay,
+                                   double trust_coeff, list[unknown] optim_state)
+    return (list[unknown] model_upd, list[unknown] optim_state_upd) {
+    /*
+     * Updates all learnable parameters with the LARS optimizer.
+     *
+     * LARS (Layer-wise Adaptive Rate Scaling) applies different learning
+     * rates to different layers based on the ratio of parameter norm
+     * to gradient norm, enabling stable large-batch training.
+     *
+     * Inputs:
+     * - model: Model parameters, same as for forward and backward pass.
+     * - gradients: Gradients, returned from the backward pass.
+     * - lr: Global learning rate.
+     * - mu: Momentum value. Recommended: 0.9
+     * - weight_decay: L2 regularization strength. Recommended: 5e-4
+     * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001
+     * - optim_state: Optimizer states for all model parameters.
+     *
+     * Outputs:
+     * - model_upd: Updated model parameters.
+     * - optim_state_upd: Updated model states for all parameters.
+     */
+    layer_sizes = list(3, 4, 23, 3)
+    hyper_params = list(lr, mu, weight_decay, trust_coeff)
+    [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "bottleneck", 
+        layer_sizes)
+}
\ No newline at end of file
diff --git a/scripts/nn/networks/resnet152.dml b/scripts/nn/networks/resnet152.dml
index e0e4154fc94..92da614345a 100644
--- a/scripts/nn/networks/resnet152.dml
+++ b/scripts/nn/networks/resnet152.dml
@@ -432,3 +432,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model,
         "bottleneck", layer_sizes)
 }
 
+init_lars_optim_params = function(int classes)
+    return(list[unknown] params) {
+    /*
+     * Initializes the state of the LARS optimizer for every
+     * learnable parameter of ResNet 50.
+     *
+     * Inputs:
+     * - classes: Number of network output classes.
+     *
+     * Outputs:
+     * - params: List of state parameters with the same structure
+     *     as weights of the forward and backward pass. It can be
+     *     directly passed to the update parameter function.
+     */
+    layer_sizes = list(3, 8, 36, 3)
+    params = util::init_optim("lars", classes, "bottleneck", layer_sizes)
+}
+
+update_params_with_lars = function(list[unknown] model, list[unknown] gradients,
+                                   double lr, double mu, double weight_decay,
+                                   double trust_coeff, list[unknown] optim_state)
+    return (list[unknown] model_upd, list[unknown] optim_state_upd) {
+    /*
+     * Updates all learnable parameters with the LARS optimizer.
+     *
+     * LARS (Layer-wise Adaptive Rate Scaling) applies different learning
+     * rates to different layers based on the ratio of parameter norm
+     * to gradient norm, enabling stable large-batch training.
+     *
+     * Inputs:
+     * - model: Model parameters, same as for forward and backward pass.
+     * - gradients: Gradients, returned from the backward pass.
+     * - lr: Global learning rate.
+     * - mu: Momentum value. Recommended: 0.9
+     * - weight_decay: L2 regularization strength. Recommended: 5e-4
+     * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001
+     * - optim_state: Optimizer states for all model parameters.
+     *
+     * Outputs:
+     * - model_upd: Updated model parameters.
+     * - optim_state_upd: Updated model states for all parameters.
+     */
+    layer_sizes = list(3, 8, 36, 3)
+    hyper_params = list(lr, mu, weight_decay, trust_coeff)
+    [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "bottleneck", 
+        layer_sizes)
+}
\ No newline at end of file
diff --git a/scripts/nn/networks/resnet18.dml b/scripts/nn/networks/resnet18.dml
index 2a67c9ddb61..52a80eb92d1 100644
--- a/scripts/nn/networks/resnet18.dml
+++ b/scripts/nn/networks/resnet18.dml
@@ -434,3 +434,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model,
         layer_sizes)
 }
 
+init_lars_optim_params = function(int classes)
+    return(list[unknown] params) {
+    /*
+     * Initializes the state of the LARS optimizer for every
+     * learnable parameter of ResNet 50.
+     *
+     * Inputs:
+     * - classes: Number of network output classes.
+     *
+     * Outputs:
+     * - params: List of state parameters with the same structure
+     *     as weights of the forward and backward pass. It can be
+     *     directly passed to the update parameter function.
+     */
+    layer_sizes = list(2, 2, 2, 2)
+    params = util::init_optim("lars", classes, "basic", layer_sizes)
+}
+
+update_params_with_lars = function(list[unknown] model, list[unknown] gradients,
+                                   double lr, double mu, double weight_decay,
+                                   double trust_coeff, list[unknown] optim_state)
+    return (list[unknown] model_upd, list[unknown] optim_state_upd) {
+    /*
+     * Updates all learnable parameters with the LARS optimizer.
+     *
+     * LARS (Layer-wise Adaptive Rate Scaling) applies different learning
+     * rates to different layers based on the ratio of parameter norm
+     * to gradient norm, enabling stable large-batch training.
+     *
+     * Inputs:
+     * - model: Model parameters, same as for forward and backward pass.
+     * - gradients: Gradients, returned from the backward pass.
+     * - lr: Global learning rate.
+     * - mu: Momentum value. Recommended: 0.9
+     * - weight_decay: L2 regularization strength. Recommended: 5e-4
+     * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001
+     * - optim_state: Optimizer states for all model parameters.
+     *
+     * Outputs:
+     * - model_upd: Updated model parameters.
+     * - optim_state_upd: Updated model states for all parameters.
+     */
+    layer_sizes = list(2, 2, 2, 2)
+    hyper_params = list(lr, mu, weight_decay, trust_coeff)
+    [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "basic", 
+        layer_sizes)
+}
\ No newline at end of file
diff --git a/scripts/nn/networks/resnet34.dml b/scripts/nn/networks/resnet34.dml
index 9dcabcf1ecc..86e9e547ce5 100644
--- a/scripts/nn/networks/resnet34.dml
+++ b/scripts/nn/networks/resnet34.dml
@@ -428,3 +428,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model,
         layer_sizes)
 }
 
+init_lars_optim_params = function(int classes)
+    return(list[unknown] params) {
+    /*
+     * Initializes the state of the LARS optimizer for every
+     * learnable parameter of ResNet 50.
+     *
+     * Inputs:
+     * - classes: Number of network output classes.
+     *
+     * Outputs:
+     * - params: List of state parameters with the same structure
+     *     as weights of the forward and backward pass. It can be
+     *     directly passed to the update parameter function.
+     */
+    layer_sizes = list(3, 4, 6, 3)
+    params = util::init_optim("lars", classes, "basic", layer_sizes)
+}
+
+update_params_with_lars = function(list[unknown] model, list[unknown] gradients,
+                                   double lr, double mu, double weight_decay,
+                                   double trust_coeff, list[unknown] optim_state)
+    return (list[unknown] model_upd, list[unknown] optim_state_upd) {
+    /*
+     * Updates all learnable parameters with the LARS optimizer.
+     *
+     * LARS (Layer-wise Adaptive Rate Scaling) applies different learning
+     * rates to different layers based on the ratio of parameter norm
+     * to gradient norm, enabling stable large-batch training.
+     *
+     * Inputs:
+     * - model: Model parameters, same as for forward and backward pass.
+     * - gradients: Gradients, returned from the backward pass.
+     * - lr: Global learning rate.
+     * - mu: Momentum value. Recommended: 0.9
+     * - weight_decay: L2 regularization strength. Recommended: 5e-4
+     * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001
+     * - optim_state: Optimizer states for all model parameters.
+     *
+     * Outputs:
+     * - model_upd: Updated model parameters.
+     * - optim_state_upd: Updated model states for all parameters.
+     */
+    layer_sizes = list(3, 4, 6, 3)
+    hyper_params = list(lr, mu, weight_decay, trust_coeff)
+    [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "basic", 
+        layer_sizes)
+}
\ No newline at end of file
diff --git a/scripts/nn/networks/resnet50.dml b/scripts/nn/networks/resnet50.dml
index bac0e938af3..ac4e1952301 100644
--- a/scripts/nn/networks/resnet50.dml
+++ b/scripts/nn/networks/resnet50.dml
@@ -432,3 +432,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model,
         "bottleneck", layer_sizes)
 }
 
+init_lars_optim_params = function(int classes)
+    return(list[unknown] params) {
+    /*
+     * Initializes the state of the LARS optimizer for every
+     * learnable parameter of ResNet 50.
+     *
+     * Inputs:
+     * - classes: Number of network output classes.
+     *
+     * Outputs:
+     * - params: List of state parameters with the same structure
+     *     as weights of the forward and backward pass. It can be
+     *     directly passed to the update parameter function.
+     */
+    layer_sizes = list(3, 4, 6, 3)
+    params = util::init_optim("lars", classes, "bottleneck", layer_sizes)
+}
+
+update_params_with_lars = function(list[unknown] model, list[unknown] gradients,
+                                   double lr, double mu, double weight_decay,
+                                   double trust_coeff, list[unknown] optim_state)
+    return (list[unknown] model_upd, list[unknown] optim_state_upd) {
+    /*
+     * Updates all learnable parameters with the LARS optimizer.
+     *
+     * LARS (Layer-wise Adaptive Rate Scaling) applies different learning
+     * rates to different layers based on the ratio of parameter norm
+     * to gradient norm, enabling stable large-batch training.
+     *
+     * Inputs:
+     * - model: Model parameters, same as for forward and backward pass.
+     * - gradients: Gradients, returned from the backward pass.
+     * - lr: Global learning rate.
+     * - mu: Momentum value. Recommended: 0.9
+     * - weight_decay: L2 regularization strength. Recommended: 5e-4
+     * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001
+     * - optim_state: Optimizer states for all model parameters.
+     *
+     * Outputs:
+     * - model_upd: Updated model parameters.
+     * - optim_state_upd: Updated model states for all parameters.
+     */
+    layer_sizes = list(3, 4, 6, 3)
+    hyper_params = list(lr, mu, weight_decay, trust_coeff)
+    [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "bottleneck", 
+        layer_sizes)
+}
\ No newline at end of file
diff --git a/scripts/nn/networks/resnet_util.dml b/scripts/nn/networks/resnet_util.dml
index 995736585ba..117e1c98631 100644
--- a/scripts/nn/networks/resnet_util.dml
+++ b/scripts/nn/networks/resnet_util.dml
@@ -25,6 +25,7 @@ source("nn/optim/rmsprop.dml") as rmsprop
 source("nn/optim/sgd.dml") as sgd
 source("nn/optim/sgd_momentum.dml") as sgd_momentum
 source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+source("nn/optim/lars.dml") as lars
 
 
 init_optim_adam_basic_block = function(int C_in, int C_base, boolean downsample)
@@ -55,6 +56,33 @@ init_optim_adam_basic_block = function(int C_in, int C_base, boolean downsample)
     }
 }
 
+init_optim_lars_basic_block = function(int C_in, int C_base, boolean downsample)
+    return (list[unknown] block_params) {
+    # Conv 1
+    v_W_conv1 = matrix(0, rows=C_base, cols=C_in*3*3)
+    # BN 1
+    v_W_bn1 = matrix(0, rows=C_base, cols=1)
+    v_b_bn1 = matrix(0, rows=C_base, cols=1)
+    # Conv 2
+    v_W_conv2 = matrix(0, rows=C_base, cols=C_base*3*3)
+    # BN 2
+    v_W_bn2 = matrix(0, rows=C_base, cols=1)
+    v_b_bn2 = matrix(0, rows=C_base, cols=1)
+
+    block_params = list(v_W_conv1, v_W_bn1, v_b_bn1, v_W_conv2, v_W_bn2, v_b_bn2)
+
+    if (downsample) {
+        # Conv 3
+        v_W_conv3 = matrix(0, rows=C_base, cols=C_in)
+        # BN 3
+        v_W_bn3 = matrix(0, rows=C_base, cols=1)
+        v_b_bn3 = matrix(0, rows=C_base, cols=1)
+        block_params = append(block_params, v_W_conv3)
+        block_params = append(block_params, v_W_bn3)
+        block_params = append(block_params, v_b_bn3)
+    }
+}
+
 init_optim_other_basic_block = function(int C_in, int C_base, boolean downsample)
     return (list[unknown] block_params) {
     # Conv 1
@@ -114,6 +142,38 @@ init_optim_other_bottleneck_block = function(int C_in, int C_base, boolean downs
     }
 }
 
+init_optim_lars_bottleneck_block = function(int C_in, int C_base, boolean downsample)
+    return (list[unknown] block_params) {
+    # Conv 1
+    v_W_conv1 = matrix(0, rows=C_base, cols=C_in)
+    # BN 1
+    v_W_bn1 = matrix(0, rows=C_base, cols=1)
+    v_b_bn1 = matrix(0, rows=C_base, cols=1)
+    # Conv 2
+    v_W_conv2 = matrix(0, rows=C_base, cols=C_base*3*3)
+    # BN 2
+    v_W_bn2 = matrix(0, rows=C_base, cols=1)
+    v_b_bn2 = matrix(0, rows=C_base, cols=1)
+    # Conv 3
+    v_W_conv3 = matrix(0, rows=4*C_base, cols=C_base)
+    # BN 3
+    v_W_bn3 = matrix(0, rows=4*C_base, cols=1)
+    v_b_bn3 = matrix(0, rows=4*C_base, cols=1)
+
+    block_params = list(v_W_conv1, v_W_bn1, v_b_bn1, v_W_conv2, v_W_bn2, v_b_bn2, v_W_conv3, v_W_bn3, v_b_bn3)
+
+    if (downsample) {
+        # Conv 4
+        v_W_conv4 = matrix(0, rows=4*C_base, cols=C_in)
+        # BN 4
+        v_W_bn4 = matrix(0, rows=4*C_base, cols=1)
+        v_b_bn4 = matrix(0, rows=4*C_base, cols=1)
+        block_params = append(block_params, v_W_conv4)
+        block_params = append(block_params, v_W_bn4)
+        block_params = append(block_params, v_b_bn4)
+    }
+}
+
 init_optim_adam_bottleneck_block = function(int C_in, int C_base, boolean downsample)
     return (list[unknown] block_params) {
     # Conv 1
@@ -158,6 +218,9 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk
         m_W_conv1 = matrix(0, rows=64, cols=C_in*7*7)
         v_W_conv1 = matrix(0, rows=64, cols=C_in*7*7)
         params = append(params, list(m_W_conv1, v_W_conv1))
+    } else if (optimizer == "lars") {
+        v_W_conv1 = matrix(0, rows=64, cols=C_in*7*7)
+        params = append(params, v_W_conv1)
     } else {
         s_W_conv1 = matrix(0, rows=64, cols=C_in*7*7)
         params = append(params, s_W_conv1)
@@ -169,6 +232,11 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk
         m_b_bn1 = matrix(0, rows=C_in, cols=1); v_b_bn1 = matrix(0, rows=C_in, cols=1)
         params = append(params, list(m_W_bn1, v_W_bn1))
         params = append(params, list(m_b_bn1, v_b_bn1))
+    } else if (optimizer == "lars") {
+        v_W_bn1 = matrix(0, rows=C_in, cols=1)
+        v_b_bn1 = matrix(0, rows=C_in, cols=1)
+        params = append(params, v_W_bn1)
+        params = append(params, v_b_bn1)
     } else {
         s_W_bn1 = matrix(0, rows=C_in, cols=1)
         s_b_bn1 = matrix(0, rows=C_in, cols=1)
@@ -191,6 +259,8 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk
                 downsample = block == 1 & stride > 1
                 if (optimizer == "adam")
                     optim_block = init_optim_adam_basic_block(C_in, C_base, downsample)
+                else if (optimizer == "lars")
+                    optim_block = init_optim_lars_basic_block(C_in, C_base, downsample)
                 else
                     optim_block = init_optim_other_basic_block(C_in, C_base, downsample)
                 optim_layer = append(optim_layer, optim_block)
@@ -203,6 +273,8 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk
                 downsample = block == 1
                 if (optimizer == "adam")
                     optim_block = init_optim_adam_bottleneck_block(C_in, C_base, downsample)
+                else if (optimizer == "lars")
+                    optim_block = init_optim_lars_bottleneck_block(C_in, C_base, downsample)
                 else
                     optim_block = init_optim_other_bottleneck_block(C_in, C_base, downsample)
                 optim_layer = append(optim_layer, optim_block)
@@ -220,6 +292,11 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk
         m_b_fc = matrix(0, rows=1, cols=classes); v_b_fc = matrix(0, rows=1, cols=classes)
         params = append(params, list(m_W_fc, v_W_fc))
         params = append(params, list(m_b_fc, v_b_fc))
+    } else if (optimizer == "lars") {
+        v_W_fc = matrix(0, rows=C_in, cols=classes)
+        v_b_fc = matrix(0, rows=1, cols=classes)
+        params = append(params, v_W_fc)
+        params = append(params, v_b_fc)
     } else {
         s_W_fc = matrix(0, rows=C_in, cols=classes)
         s_b_fc = matrix(0, rows=1, cols=classes)
@@ -284,6 +361,15 @@ update_param = function(int index, string optimizer, list[unknown] optim_params,
 
         [param_upd, v_upd] = sgd_nesterov::update(param, grad, lr, mu, v)
         optim_params_upd = append(optim_params_upd, v_upd)
+    } else if (optimizer == "lars") {
+        lr = as.scalar(optim_hyper_params[1])
+        mu = as.scalar(optim_hyper_params[2])
+        lambda = as.scalar(optim_hyper_params[3])
+        trust_coeff = as.scalar(optim_hyper_params[4])
+
+        v = as.matrix(optim_params[index])
+        [param_upd, v_upd] = lars::update(param, grad, lr, mu, v, lambda, trust_coeff)
+        optim_params_upd = append(optim_params_upd, v_upd)
     }
     params_upd = append(params_upd, param_upd)
 }
diff --git a/scripts/nn/optim/lars.dml b/scripts/nn/optim/lars.dml
new file mode 100644
index 00000000000..f6957753d12
--- /dev/null
+++ b/scripts/nn/optim/lars.dml
@@ -0,0 +1,97 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Layer-wise Adaptive Rate Scaling (LARS) optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double mu, 
+                  matrix[double] v, double lambda, double trust_coeff)
+    return (matrix[double] X, matrix[double] v) {
+  /*
+   * Performs a LARS update with layer-wise adaptive learning rate,
+   * faithfully implementing Algorithm 1 from the original paper.
+   *
+   * Reference:
+   * - "Large Batch Training of Convolutional Networks" by You, Gitman, and Ginsburg.
+   *   https://arxiv.org/abs/1708.03888
+   *
+   * This implementation correctly uses the sum of norms for the denominator
+   * and a coupled weight decay approach, as specified in the paper's
+   * pseudocode.
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *  - dX: Gradient of the loss function w.r.t. X, of same shape as X.
+   *  - lr: Global learning rate (γ in the paper).
+   *  - mu: Momentum coefficient (m in the paper).
+   *  - v: Velocity (momentum state), of same shape as X.
+   *  - lambda: L2 regularization strength (β in the paper).
+   *  - trust_coeff: Trust coefficient for LARS (η in the paper).
+   *
+   * Outputs:
+   *  - X: Updated parameters X, of same shape as input X.
+   *  - v: Updated velocity, of same shape as input v.
+   */
+
+  # Step 1: Add weight decay to the gradient to form g'.
+  # This corresponds to `g_t' + βw_t'` in Algorithm 1.
+  dX_wd = dX + lambda * X;
+  
+  # Step 2: Compute the L2 norms of the pure gradient and the weights separately.
+  X_norm = sqrt(sum(X^2));
+  dX_norm = sqrt(sum(dX^2));
+  
+  # A small epsilon for numerical stability, preventing division by zero.
+  epsilon = 1e-8;
+
+  # Step 3: Compute the local learning rate `λ'`.
+  local_lr = trust_coeff * X_norm / (dX_norm + lambda * X_norm + epsilon);
+  
+  # Step 4: Compute the final effective learning rate for this layer's update.
+  effective_lr = lr * local_lr;
+  
+  # Step 5: For very small layers (like biases), which can be unstable with LARS,
+  # we fall back to using the global learning rate. 
+  if (X_norm < 1e-3 | ncol(X) == 1 | nrow(X) == 1) {
+    effective_lr = lr;
+  }
+  
+  # Step 6: Update the momentum (velocity).
+  v = mu * v - effective_lr * dX_wd;
+
+  # Step 7: Update the weights.
+  X = X + v;
+}
+
+init = function(matrix[double] X)
+    return (matrix[double] v) {
+  /*
+   * Initialize the state for LARS (momentum).
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *
+   * Outputs:
+   *  - v: Initial velocity (zeros), of same shape as X.
+   */
+  v = matrix(0, rows=nrow(X), cols=ncol(X))
+}
\ No newline at end of file
diff --git a/scripts/nn/optim/lars_util.dml b/scripts/nn/optim/lars_util.dml
new file mode 100644
index 00000000000..99e5b02c2f9
--- /dev/null
+++ b/scripts/nn/optim/lars_util.dml
@@ -0,0 +1,54 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+get_lr_with_warmup = function(double base_lr, int epoch, int iter, int total_epochs,
+                              int iters_per_epoch, int batch_size, int base_batch_size,
+                              int warmup_epochs, int decay_power)
+    return (double lr) {
+    /*
+     * Compute learning rate with linear warmup and polynomial decay.
+     * 
+     * Implements the learning rate schedule from LARS paper:
+     * - Linear warmup for first warmup_epochs
+     * - Polynomial decay afterwards
+     * - Linear scaling with batch size
+     */
+    
+    # Scale learning rate linearly with batch size
+    scaled_lr = base_lr * batch_size / base_batch_size
+    
+    # Total number of iterations
+    total_iters = total_epochs * iters_per_epoch
+    warmup_iters = warmup_epochs * iters_per_epoch
+    current_iter = (epoch - 1) * iters_per_epoch + iter
+    
+    if (current_iter <= warmup_iters) {
+        # Linear warmup
+        lr = scaled_lr * current_iter / warmup_iters
+    } else {
+        # Polynomial decay
+        decay_iters = total_iters - warmup_iters
+        decay_current = current_iter - warmup_iters
+        decay_factor = (1 - decay_current / decay_iters) ^ decay_power
+        lr = scaled_lr * decay_factor
+    }
+}
+