diff --git a/.gitignore b/.gitignore index f3c28571bdf..de63e6c2538 100644 --- a/.gitignore +++ b/.gitignore @@ -150,3 +150,5 @@ venv/* # resource optimization scripts/resource/output *.pem +scripts/nn/examples/mnist_data/mnist_test.csv +scripts/nn/examples/mnist_data/mnist_train.csv \ No newline at end of file diff --git a/scripts/data_prep/create_binary_chunks.py b/scripts/data_prep/create_binary_chunks.py new file mode 100644 index 00000000000..6a2d273410a --- /dev/null +++ b/scripts/data_prep/create_binary_chunks.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +""" +Create pre-split binary chunks from ImageNet data for SystemDS LARS training. + +This script reads existing CSV or binary data and splits it into manageable chunks +for memory-efficient training with large datasets. +""" + +import os +import sys +import numpy as np +import pandas as pd +from pathlib import Path + +def create_binary_chunks(data_dir="imagenet_data", chunk_size=10000): + """ + Create binary chunk files from existing ImageNet data. + + Args: + data_dir: Directory containing the ImageNet data + chunk_size: Number of samples per chunk + """ + data_path = Path(data_dir) + + print(f"Creating binary chunks from data in: {data_path}") + print(f"Chunk size: {chunk_size}") + + # Check what data we have available + csv_train = data_path / "imagenet_train.csv" + csv_val = data_path / "imagenet_val.csv" + + if csv_train.exists() and csv_val.exists(): + print("Found CSV files, converting to binary chunks...") + create_chunks_from_csv(data_path, chunk_size) + else: + print("CSV files not found, creating dummy chunks for testing...") + create_dummy_chunks(data_path, chunk_size) + +def create_chunks_from_csv(data_path, chunk_size): + """Create chunks from CSV files.""" + + # Read training data + print("Reading training CSV...") + train_df = pd.read_csv(data_path / "imagenet_train.csv", header=None) + print(f"Training data shape: {train_df.shape}") + + # Read validation data + print("Reading validation CSV...") + val_df = pd.read_csv(data_path / "imagenet_val.csv", header=None) + print(f"Validation data shape: {val_df.shape}") + + # Split training data into chunks + train_labels = train_df.iloc[:, 0].values + train_data = train_df.iloc[:, 1:].values + + # Convert to float and normalize + train_data = train_data.astype(np.float64) / 255.0 + + num_train_chunks = (len(train_data) + chunk_size - 1) // chunk_size + print(f"Creating {num_train_chunks} training chunks...") + + for i in range(num_train_chunks): + start_idx = i * chunk_size + end_idx = min((i + 1) * chunk_size, len(train_data)) + + chunk_data = train_data[start_idx:end_idx] + chunk_labels = train_labels[start_idx:end_idx] + + # Convert labels to one-hot (assuming 10 classes for now) + num_classes = 10 + chunk_labels_onehot = np.eye(num_classes)[chunk_labels] + + # Save as binary files that SystemDS can read + chunk_num = f"{i+1:03d}" + + # Save data chunk as CSV + data_file = data_path / f"train_chunk_{chunk_num}.csv" + pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False) + + # Save labels chunk as CSV + labels_file = data_path / f"train_labels_{chunk_num}.csv" + pd.DataFrame(chunk_labels_onehot).to_csv(labels_file, header=False, index=False) + + print(f" Chunk {chunk_num}: {chunk_data.shape[0]} samples") + + # Process validation data (typically smaller, so fewer chunks) + val_labels = val_df.iloc[:, 0].values + val_data = val_df.iloc[:, 1:].values + val_data = val_data.astype(np.float64) / 255.0 + + val_chunk_size = min(chunk_size, len(val_data)) + num_val_chunks = (len(val_data) + val_chunk_size - 1) // val_chunk_size + print(f"Creating {num_val_chunks} validation chunks...") + + for i in range(num_val_chunks): + start_idx = i * val_chunk_size + end_idx = min((i + 1) * val_chunk_size, len(val_data)) + + chunk_data = val_data[start_idx:end_idx] + chunk_labels = val_labels[start_idx:end_idx] + + # Convert labels to one-hot + chunk_labels_onehot = np.eye(num_classes)[chunk_labels] + + chunk_num = f"{i+1:03d}" + + # Save data chunk as CSV + data_file = data_path / f"val_chunk_{chunk_num}.csv" + pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False) + + # Save labels chunk as CSV + labels_file = data_path / f"val_labels_{chunk_num}.csv" + pd.DataFrame(chunk_labels_onehot).to_csv(labels_file, header=False, index=False) + + print(f" Val chunk {chunk_num}: {chunk_data.shape[0]} samples") + +def create_dummy_chunks(data_path, chunk_size): + """Create dummy chunks for testing when real data isn't available.""" + print("Creating dummy data chunks for testing...") + + # ImageNet-like dimensions + img_height, img_width, channels = 224, 224, 3 + num_features = img_height * img_width * channels + num_classes = 10 + + # Create training chunks + num_train_samples = chunk_size * 2 # Create 2 chunks for demo + + print(f"Generating {num_train_samples} dummy training samples...") + train_data = np.random.rand(num_train_samples, num_features).astype(np.float64) + train_labels = np.random.randint(0, num_classes, num_train_samples) + train_labels_onehot = np.eye(num_classes)[train_labels] + + # Split into chunks + for i in range(2): # 2 training chunks + start_idx = i * chunk_size + end_idx = (i + 1) * chunk_size + + chunk_data = train_data[start_idx:end_idx] + chunk_labels_onehot_chunk = train_labels_onehot[start_idx:end_idx] + + chunk_num = f"{i+1:03d}" + + # Save chunks as CSV + data_file = data_path / f"train_chunk_{chunk_num}.csv" + pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False) + + labels_file = data_path / f"train_labels_{chunk_num}.csv" + pd.DataFrame(chunk_labels_onehot_chunk).to_csv(labels_file, header=False, index=False) + + print(f" Created train chunk {chunk_num}: {chunk_data.shape}") + + # Create validation chunk + num_val_samples = min(chunk_size, 5000) # Smaller validation set + print(f"Generating {num_val_samples} dummy validation samples...") + + val_data = np.random.rand(num_val_samples, num_features).astype(np.float64) + val_labels = np.random.randint(0, num_classes, num_val_samples) + val_labels_onehot = np.eye(num_classes)[val_labels] + + # Save validation chunk as CSV + data_file = data_path / "val_chunk_001.csv" + pd.DataFrame(val_data).to_csv(data_file, header=False, index=False) + + labels_file = data_path / "val_labels_001.csv" + pd.DataFrame(val_labels_onehot).to_csv(labels_file, header=False, index=False) + + print(f" Created val chunk 001: {val_data.shape}") + +def main(): + """Main execution.""" + data_dir = "imagenet_data" + chunk_size = 10000 + + if len(sys.argv) > 1: + data_dir = sys.argv[1] + if len(sys.argv) > 2: + chunk_size = int(sys.argv[2]) + + # Create data directory if it doesn't exist + os.makedirs(data_dir, exist_ok=True) + + create_binary_chunks(data_dir, chunk_size) + + print("\n✅ Binary chunk creation completed!") + print(f"Chunks saved in: {data_dir}/") + print("Files created:") + + data_path = Path(data_dir) + for file in sorted(data_path.glob("*_chunk_*.bin")): + size_mb = file.stat().st_size / (1024 * 1024) + print(f" {file.name} ({size_mb:.1f} MB)") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/data_prep/prepare_raw_imagenet.py b/scripts/data_prep/prepare_raw_imagenet.py new file mode 100644 index 00000000000..63b51374876 --- /dev/null +++ b/scripts/data_prep/prepare_raw_imagenet.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python3 +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +""" +Raw ImageNet Data Preprocessing Pipeline +========================================= + +This script processes raw ImageNet JPG images with metadata CSV files and prepares them +for SystemDS AlexNet training. It handles: + +1. Reading metadata CSV files with file_path,label format +2. Loading JPG images (typically 256x256) and resizing to specified target size (default: 224x224) +3. Converting to normalized feature vectors +4. Creating one-hot encoded labels +5. Saving in SystemDS-compatible CSV format with resolution-based naming + +Usage: + python prepare_raw_imagenet.py --input_dir "C:/Users/romer/Desktop/Big_Data/imagenet/256x256" --output_dir "imagenet_data" + python prepare_raw_imagenet.py --input_dir "C:/Users/romer/Desktop/Big_Data/imagenet/256x256" --target_size 299 + python prepare_raw_imagenet.py --input_dir "path/to/imagenet" --dry_run + +Output files will be saved as: + imagenet_data/x/imagenet_x_train.csv + imagenet_data/x/imagenet_x_train_labels.csv + imagenet_data/x/imagenet_x_test.csv + imagenet_data/x/imagenet_x_test_labels.csv +""" + +import os +import sys +import argparse +import numpy as np +import pandas as pd +from pathlib import Path +from typing import Dict, List, Optional, Tuple +import time +import gc +from PIL import Image +import csv + +class RawImageNetProcessor: + """Raw ImageNet JPG image processor for SystemDS.""" + + def __init__(self, input_dir: str, output_dir: str = "imagenet_data/224x224", target_size: int = 224): + self.input_dir = Path(input_dir) + self.target_size = target_size + + # Create output directory based on resolution + base_output = Path(output_dir).parent if "x" in Path(output_dir).name else Path(output_dir) + self.output_dir = base_output / f"{target_size}x{target_size}" + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Target specifications for SystemDS AlexNet + self.channels = 3 + self.features = self.target_size * self.target_size * self.channels + self.num_classes = 1000 + + print(f"Raw ImageNet Processor initialized") + print(f"Input directory: {self.input_dir}") + print(f"Output directory: {self.output_dir}") + print(f"Target format: {self.target_size}x{self.target_size}x{self.channels} images ({self.features} features), {self.num_classes} classes") + print(f"Note: Source images will be resized from their original size to {self.target_size}x{self.target_size}") + + def inspect_raw_data(self) -> Dict: + """Inspect the raw data structure and return metadata.""" + print("\n=== Raw Data Inspection ===") + + # Look for metadata files + train_metadata_file = self.input_dir / "imagenet_train_metadata.csv" + test_metadata_file = self.input_dir / "imagenet_test_metadata.csv" + + if not train_metadata_file.exists(): + raise FileNotFoundError(f"Training metadata file not found: {train_metadata_file}") + if not test_metadata_file.exists(): + raise FileNotFoundError(f"Test metadata file not found: {test_metadata_file}") + + # Read metadata + print(f"Reading training metadata from: {train_metadata_file}") + train_df = pd.read_csv(train_metadata_file) + print(f"Reading test metadata from: {test_metadata_file}") + test_df = pd.read_csv(test_metadata_file) + + # Inspect structure + print(f"\nTraining metadata shape: {train_df.shape}") + print(f"Training columns: {list(train_df.columns)}") + print(f"Training label range: {train_df['label'].min()} to {train_df['label'].max()}") + print(f"Training unique labels: {train_df['label'].nunique()}") + + print(f"\nTest metadata shape: {test_df.shape}") + print(f"Test columns: {list(test_df.columns)}") + print(f"Test label range: {test_df['label'].min()} to {test_df['label'].max()}") + print(f"Test unique labels: {test_df['label'].nunique()}") + + # Check if images actually exist + print(f"\nChecking image availability...") + train_available = self._count_available_images(train_df) + test_available = self._count_available_images(test_df) + + # Sample a few images to check dimensions + sample_dims = self._check_sample_image_dimensions(train_df.head(5)) + + metadata = { + 'train_total': len(train_df), + 'train_available': train_available, + 'test_total': len(test_df), + 'test_available': test_available, + 'train_labels': sorted(train_df['label'].unique()), + 'test_labels': sorted(test_df['label'].unique()), + 'sample_dimensions': sample_dims + } + + print(f"\n=== Summary ===") + print(f"Training: {train_available}/{len(train_df)} images available") + print(f"Test: {test_available}/{len(test_df)} images available") + print(f"Sample image dimensions: {sample_dims}") + + return metadata + + def _count_available_images(self, df: pd.DataFrame) -> int: + """Count how many images actually exist on disk.""" + available = 0 + total = len(df) + + print(f" Checking {total} image files...") + for i, row in df.iterrows(): + image_path = self.input_dir / row['file_path'] + if image_path.exists(): + available += 1 + + # Progress update every 1000 images + if (i + 1) % 1000 == 0: + print(f" Checked {i + 1}/{total} images, {available} available") + + print(f" Final: {available}/{total} images available") + return available + + def _check_sample_image_dimensions(self, sample_df: pd.DataFrame) -> List[Tuple]: + """Check dimensions of a few sample images.""" + dimensions = [] + + for _, row in sample_df.iterrows(): + image_path = self.input_dir / row['file_path'] + if image_path.exists(): + try: + with Image.open(image_path) as img: + dimensions.append((img.width, img.height, len(img.getbands()))) + except Exception as e: + print(f" Error reading {image_path}: {e}") + + if len(dimensions) >= 3: # Just check a few + break + + return dimensions + + def process_dataset(self, max_samples: Optional[int] = None, dry_run: bool = False, skip_check: bool = False, split_from_train: bool = False) -> Dict: + """Process the complete dataset.""" + print(f"\n=== Processing Dataset (dry_run={dry_run}) ===") + + # Read metadata + train_df = pd.read_csv(self.input_dir / "imagenet_train_metadata.csv") + + if split_from_train: + print("Creating validation set from training data...") + # Skip test metadata entirely + test_df = None + else: + test_df = pd.read_csv(self.input_dir / "imagenet_test_metadata.csv") + + # Filter to only available images (unless skipping) + if not skip_check: + print("Filtering to available images...") + train_df = self._filter_available_images(train_df) + if test_df is not None: + test_df = self._filter_available_images(test_df) + else: + print("Skipping image availability check...") + + # Handle data splitting + if split_from_train: + # Use training data for both train and validation + if max_samples: + # Take first max_samples for training + train_samples = max_samples + # Use 20% of training samples for validation (or 400, whichever is smaller) + val_samples = min(400, int(train_samples * 0.2), len(train_df) - train_samples) + + print(f"Splitting from training data:") + print(f" - Training: first {train_samples} samples") + print(f" - Validation: next {val_samples} samples") + + val_df = train_df.iloc[train_samples:train_samples + val_samples].copy() + train_df = train_df.head(train_samples) + else: + # Default split: 90% train, 10% validation + split_idx = int(len(train_df) * 0.9) + val_df = train_df.iloc[split_idx:].copy() + train_df = train_df.iloc[:split_idx].copy() + print(f"Splitting training data: {len(train_df)} train, {len(val_df)} validation") + + test_df = val_df # Use validation split as "test" for consistency + else: + # Limit samples if requested + if max_samples: + print(f"Limiting to {max_samples} samples per split...") + train_df = train_df.head(max_samples) + if test_df is not None: + test_df = test_df.head(max_samples) + + print(f"Processing {len(train_df)} training samples...") + print(f"Processing {len(test_df)} test samples...") + + if dry_run: + print("DRY RUN: Would process the above samples") + return {'dry_run': True, 'train_samples': len(train_df), 'test_samples': len(test_df)} + + # Process training data + train_results = self._process_split(train_df, "train") + + # Process test data (as validation) + test_results = self._process_split(test_df, "val") + + return { + 'train': train_results, + 'validation': test_results + } + + def _filter_available_images(self, df: pd.DataFrame) -> pd.DataFrame: + """Filter dataframe to only include images that exist on disk.""" + available_mask = [] + + for _, row in df.iterrows(): + image_path = self.input_dir / row['file_path'] + available_mask.append(image_path.exists()) + + filtered_df = df[available_mask].copy() + print(f" Filtered {len(df)} -> {len(filtered_df)} available images") + return filtered_df + + def _process_split(self, df: pd.DataFrame, split_name: str) -> Dict: + """Process a data split (train or val).""" + print(f"\nProcessing {split_name} split...") + + # Prepare output files with resolution in name + # For val split, use 'test' in filename for consistency + file_split_name = 'test' if split_name == 'val' else split_name + features_file = self.output_dir / f"imagenet_{self.target_size}x{self.target_size}_{file_split_name}.csv" + labels_file = self.output_dir / f"imagenet_{self.target_size}x{self.target_size}_{file_split_name}_labels.csv" + + # Process images in batches to manage memory + batch_size = 1000 + total_samples = len(df) + num_batches = (total_samples + batch_size - 1) // batch_size + + print(f"Processing {total_samples} samples in {num_batches} batches of {batch_size}") + + # Initialize CSV files + features_written = 0 + labels_written = 0 + + with open(features_file, 'w', newline='') as f_feat, \ + open(labels_file, 'w', newline='') as f_label: + + feat_writer = csv.writer(f_feat) + label_writer = csv.writer(f_label) + + for batch_idx in range(num_batches): + start_idx = batch_idx * batch_size + end_idx = min(start_idx + batch_size, total_samples) + batch_df = df.iloc[start_idx:end_idx] + + print(f" Batch {batch_idx + 1}/{num_batches}: Processing samples {start_idx}-{end_idx-1}") + + # Process batch + batch_features, batch_labels = self._process_image_batch(batch_df) + + # Write to CSV + for features_row in batch_features: + feat_writer.writerow(features_row) + features_written += 1 + + for labels_row in batch_labels: + label_writer.writerow(labels_row) + labels_written += 1 + + # Memory cleanup + del batch_features, batch_labels + gc.collect() + + print(f" Wrote {len(batch_df)} samples to CSV") + + result = { + 'samples_processed': features_written, + 'features_file': str(features_file), + 'labels_file': str(labels_file), + 'features_shape': (features_written, self.features), + 'labels_shape': (labels_written, self.num_classes) + } + + print(f" {split_name} processing complete: {features_written} samples") + return result + + def _process_image_batch(self, batch_df: pd.DataFrame) -> Tuple[List, List]: + """Process a batch of images.""" + batch_features = [] + batch_labels = [] + + for _, row in batch_df.iterrows(): + try: + # Load and process image + image_path = self.input_dir / row['file_path'] + features = self._process_single_image(image_path) + + # Process label + label = int(row['label']) + # Convert to 0-indexed if needed (ImageNet labels are usually 1-indexed) + if label > 0: + label = label - 1 + + # Create one-hot encoding + one_hot = [0.0] * self.num_classes + if 0 <= label < self.num_classes: + one_hot[label] = 1.0 + + batch_features.append(features) + batch_labels.append(one_hot) + + except Exception as e: + print(f" Error processing {row['file_path']}: {e}") + # Skip this sample + continue + + return batch_features, batch_labels + + def _process_single_image(self, image_path: Path) -> List[float]: + """Process a single image: load, resize, normalize, flatten.""" + # Fix path if it points to wrong directory + image_path_str = str(image_path) + if "224x224" in image_path_str and "256x256" in str(self.input_dir): + # Replace 224x224 with 256x256 in the path + image_path_str = image_path_str.replace("224x224", "256x256") + image_path = Path(image_path_str) + + # Load image + with Image.open(image_path) as img: + # Convert to RGB if needed + if img.mode != 'RGB': + img = img.convert('RGB') + + # Resize to target size (e.g., from 256x256 to 224x224) + if img.size != (self.target_size, self.target_size): + img = img.resize((self.target_size, self.target_size), Image.LANCZOS) + + # Convert to numpy array and normalize to [0,1] + img_array = np.array(img, dtype=np.float32) / 255.0 + + # Flatten to feature vector + features = img_array.flatten().tolist() + + return features + + +def main(): + parser = argparse.ArgumentParser(description='Process raw ImageNet JPG data for SystemDS') + parser.add_argument('--input_dir', type=str, required=True, + help='Directory containing raw ImageNet data') + parser.add_argument('--output_dir', type=str, default='imagenet_data', + help='Base output directory for processed data (resolution subdirs will be created)') + parser.add_argument('--target_size', type=int, default=224, + help='Target image size (default: 224 for 224x224)') + parser.add_argument('--max_samples', type=int, default=None, + help='Maximum number of samples per split (for testing)') + parser.add_argument('--dry_run', action='store_true', + help='Just inspect data without processing') + parser.add_argument('--skip_check', action='store_true', + help='Skip image availability checking') + parser.add_argument('--split_from_train', action='store_true', + help='Create validation set from training data instead of using test set') + + args = parser.parse_args() + + # Initialize processor + processor = RawImageNetProcessor(args.input_dir, args.output_dir, args.target_size) + + # Inspect data first (unless skipping check) + if not args.skip_check: + try: + metadata = processor.inspect_raw_data() + except Exception as e: + print(f"Error during inspection: {e}") + return 1 + else: + print("Skipping data inspection...") + + # Process if not dry run + if not args.dry_run: + try: + results = processor.process_dataset( + max_samples=args.max_samples, + dry_run=False, + skip_check=args.skip_check, + split_from_train=args.split_from_train + ) + print(f"\n=== Processing Complete ===") + print(f"Results: {results}") + except Exception as e: + print(f"Error during processing: {e}") + return 1 + else: + processor.process_dataset(dry_run=True) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/scripts/data_prep/run_raw_imagenet_preprocessing.py b/scripts/data_prep/run_raw_imagenet_preprocessing.py new file mode 100644 index 00000000000..085a89db866 --- /dev/null +++ b/scripts/data_prep/run_raw_imagenet_preprocessing.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +""" +Simple runner for raw ImageNet preprocessing +""" + +import sys +import subprocess +from pathlib import Path + +def main(): + # Default paths + input_dir = r"C:\Users\romer\Desktop\Big_Data\imagenet\256x256" # Source images are 256x256 + output_dir = "imagenet_data" + + print("Raw ImageNet Preprocessing Runner") + print("=" * 50) + print(f"Input directory: {input_dir} (256x256 source images)") + print(f"Output directory: {output_dir}") + print(f"Default target size: 224x224 (for AlexNet)") + print() + + # Ask user what they want to do + print("Choose an option:") + print("1. Inspect data only (dry run)") + print("2. Process small sample (2000 train + 400 val from training set)") + print("3. Process full dataset (256x256 -> 224x224)") + print("4. Process full dataset (256x256 -> custom size)") + print("5. Custom processing") + print() + + choice = input("Enter choice (1-5): ").strip() + + if choice == "1": + # Dry run + cmd = [ + sys.executable, "scripts/data_prep/prepare_raw_imagenet.py", + "--input_dir", input_dir, + "--output_dir", output_dir, + "--dry_run" + ] + elif choice == "2": + # Small sample with train/val split from training data + print("Processing 2000 training + 400 validation samples from training set...") + cmd = [ + sys.executable, "scripts/data_prep/prepare_raw_imagenet.py", + "--input_dir", input_dir, + "--output_dir", output_dir, + "--max_samples", "2000", + "--skip_check", + "--split_from_train" + ] + elif choice == "3": + # Full dataset 256x256 -> 224x224 + print("Processing 256x256 images -> 224x224 for AlexNet...") + cmd = [ + sys.executable, "scripts/data_prep/prepare_raw_imagenet.py", + "--input_dir", input_dir, + "--output_dir", output_dir, + "--target_size", "224", + "--skip_check" + ] + elif choice == "4": + # Full dataset custom resolution + target_size = input("Enter target size (e.g., 256, 299): ").strip() + if not target_size.isdigit(): + print("Invalid target size!") + return 1 + + print(f"Processing 256x256 images -> {target_size}x{target_size}...") + + cmd = [ + sys.executable, "scripts/data_prep/prepare_raw_imagenet.py", + "--input_dir", input_dir, + "--output_dir", output_dir, + "--target_size", target_size, + "--skip_check" + ] + elif choice == "5": + # Custom + custom_input = input(f"Input directory [{input_dir}]: ").strip() + if custom_input: + input_dir = custom_input + + custom_output = input(f"Output directory [{output_dir}]: ").strip() + if custom_output: + output_dir = custom_output + + target_size = input("Target size [224]: ").strip() or "224" + max_samples = input("Max samples per split (leave empty for all): ").strip() + + cmd = [ + sys.executable, "scripts/data_prep/prepare_raw_imagenet.py", + "--input_dir", input_dir, + "--output_dir", output_dir, + "--target_size", target_size + ] + + if max_samples: + cmd.extend(["--max_samples", max_samples]) + + skip_check = input("Skip image availability check? [Y/n]: ").strip().lower() + if skip_check != 'n': + cmd.append("--skip_check") + + split_from_train = input("Create validation from training data? [y/N]: ").strip().lower() + if split_from_train == 'y': + cmd.append("--split_from_train") + else: + print("Invalid choice!") + return 1 + + print(f"\nRunning command: {' '.join(cmd)}") + print() + + # Run the command + try: + result = subprocess.run(cmd, check=True) + print("\nProcessing completed successfully!") + return 0 + except subprocess.CalledProcessError as e: + print(f"\nError during processing: {e}") + return 1 + except KeyboardInterrupt: + print("\nProcessing interrupted by user") + return 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/scripts/nn/examples/Example-ImageNet_AlexNet_Optimizers.dml b/scripts/nn/examples/Example-ImageNet_AlexNet_Optimizers.dml new file mode 100644 index 00000000000..22555e3d040 --- /dev/null +++ b/scripts/nn/examples/Example-ImageNet_AlexNet_Optimizers.dml @@ -0,0 +1,192 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Example script to test different optimizers with AlexNet on ImageNet + * + * This script demonstrates how different optimizers perform on ImageNet, + * particularly focusing on large batch training scenarios. + */ + +source("imagenet_alexnet.dml") as imagenet_alexnet + +# ImageNet parameters +C = 3 # RGB channels +Hin = 224 # Height +Win = 224 # Width +K = 1000 # Number of classes + +print("\n=======================================================") +print("Optimizer Comparison on ImageNet AlexNet") +print("=======================================================\n") + +# For demonstration, we'll use a smaller subset of ImageNet +# In practice, you would load the full ImageNet dataset +print("Loading ImageNet subset for demonstration...") + +# Simulate loading training data (5K samples for faster demo) +n_train = 5000 +X = rand(rows=n_train, cols=C*Hin*Win, min=0, max=1, seed=42) +y = rand(rows=n_train, cols=K, min=0, max=0, seed=42) +# Create one-hot encoded labels +for(i in 1:n_train) { + class = as.scalar(round(rand(rows=1, cols=1, min=1, max=K, seed=42+i))) + y[i, class] = 1 +} + +# Simulate validation data (500 samples) +n_val = 500 +X_val = rand(rows=n_val, cols=C*Hin*Win, min=0, max=1, seed=43) +y_val = rand(rows=n_val, cols=K, min=0, max=0, seed=43) +for(i in 1:n_val) { + class = as.scalar(round(rand(rows=1, cols=1, min=1, max=K, seed=43+i))) + y_val[i, class] = 1 +} + +# Training parameters +epochs = 1 # Reduced for demonstration +batch_size = 512 # Medium batch size for fair comparison + +# Test different optimizers +optimizers = list("sgd", "sgd_momentum", "adam", "lars") +learning_rates = list(0.01, 0.01, 0.001, 0.1) # Tuned for each optimizer + +# Store results +results = matrix(0, rows=length(optimizers), cols=5) +# Columns: optimizer_id, top1_acc, top5_acc, final_loss, train_time + +print("Configuration:") +print("- Dataset: ImageNet subset (demonstration)") +print("- Model: AlexNet with Batch Normalization") +print("- Training samples: " + n_train) +print("- Validation samples: " + n_val) +print("- Epochs: " + epochs) +print("- Batch size: " + batch_size) +print("\n") + +# Test each optimizer +for (i in 1:length(optimizers)) { + optimizer = as.scalar(optimizers[i]) + lr = as.scalar(learning_rates[i]) + + print("\n=========================================") + print("Testing optimizer: " + optimizer) + print("Learning rate: " + lr) + print("-----------------------------------------") + + # Train model + start_time = time() + model = imagenet_alexnet::train(X, y, X_val, y_val, C, Hin, Win, + epochs, optimizer, lr, batch_size) + train_time = (time() - start_time) / 1000.0 # Convert to seconds + + # Extract all model parameters + W1 = as.matrix(model["W1"]); b1 = as.matrix(model["b1"]) + W2 = as.matrix(model["W2"]); b2 = as.matrix(model["b2"]) + W3 = as.matrix(model["W3"]); b3 = as.matrix(model["b3"]) + W4 = as.matrix(model["W4"]); b4 = as.matrix(model["b4"]) + W5 = as.matrix(model["W5"]); b5 = as.matrix(model["b5"]) + W6 = as.matrix(model["W6"]); b6 = as.matrix(model["b6"]) + W7 = as.matrix(model["W7"]); b7 = as.matrix(model["b7"]) + W8 = as.matrix(model["W8"]); b8 = as.matrix(model["b8"]) + + # Extract BN parameters + gamma1 = as.matrix(model["gamma1"]); beta1 = as.matrix(model["beta1"]) + ema_mean1 = as.matrix(model["ema_mean1"]); ema_var1 = as.matrix(model["ema_var1"]) + gamma2 = as.matrix(model["gamma2"]); beta2 = as.matrix(model["beta2"]) + ema_mean2 = as.matrix(model["ema_mean2"]); ema_var2 = as.matrix(model["ema_var2"]) + gamma3 = as.matrix(model["gamma3"]); beta3 = as.matrix(model["beta3"]) + ema_mean3 = as.matrix(model["ema_mean3"]); ema_var3 = as.matrix(model["ema_var3"]) + gamma4 = as.matrix(model["gamma4"]); beta4 = as.matrix(model["beta4"]) + ema_mean4 = as.matrix(model["ema_mean4"]); ema_var4 = as.matrix(model["ema_var4"]) + gamma5 = as.matrix(model["gamma5"]); beta5 = as.matrix(model["beta5"]) + ema_mean5 = as.matrix(model["ema_mean5"]); ema_var5 = as.matrix(model["ema_var5"]) + + # Evaluate on validation set + probs_val = imagenet_alexnet::predict(X_val, C, Hin, Win, + W1, b1, W2, b2, W3, b3, W4, b4, + W5, b5, W6, b6, W7, b7, W8, b8, + gamma1, beta1, ema_mean1, ema_var1, + gamma2, beta2, ema_mean2, ema_var2, + gamma3, beta3, ema_mean3, ema_var3, + gamma4, beta4, ema_mean4, ema_var4, + gamma5, beta5, ema_mean5, ema_var5) + [loss_val, top1_acc, top5_acc] = imagenet_alexnet::eval(probs_val, y_val) + + print("\nFinal Results:") + print("Validation Loss: " + loss_val) + print("Top-1 Accuracy: " + top1_acc + " (" + (top1_acc * 100) + "%)") + print("Top-5 Accuracy: " + top5_acc + " (" + (top5_acc * 100) + "%)") + print("Training Time: " + train_time + " seconds") + + # Store results + results[i, 1] = i # optimizer id + results[i, 2] = top1_acc + results[i, 3] = top5_acc + results[i, 4] = loss_val + results[i, 5] = train_time +} + +# Print summary comparison +print("\n\n=======================================================") +print("OPTIMIZER COMPARISON SUMMARY") +print("=======================================================") +print("\nOptimizer | Top-1 Acc | Top-5 Acc | Val Loss | Time (s)") +print("---------------|-----------|-----------|----------|----------") + +optimizer_names = list("SGD", "SGD+Momentum", "Adam", "LARS") +for(i in 1:nrow(results)) { + opt_name = as.scalar(optimizer_names[i]) + print(sprintf("%-14s | %9.4f | %9.4f | %8.4f | %8.2f", + opt_name, + as.scalar(results[i,2]), + as.scalar(results[i,3]), + as.scalar(results[i,4]), + as.scalar(results[i,5]))) +} + +# Find best performers +best_top1_idx = as.scalar(rowIndexMax(results[,2])) +best_top5_idx = as.scalar(rowIndexMax(results[,3])) +fastest_idx = as.scalar(rowIndexMin(results[,5])) + +print("\nBest Performers:") +print("- Highest Top-1 Accuracy: " + as.scalar(optimizer_names[best_top1_idx]) + + " (" + as.scalar(results[best_top1_idx,2]) + ")") +print("- Highest Top-5 Accuracy: " + as.scalar(optimizer_names[best_top5_idx]) + + " (" + as.scalar(results[best_top5_idx,3]) + ")") +print("- Fastest Training: " + as.scalar(optimizer_names[fastest_idx]) + + " (" + as.scalar(results[fastest_idx,5]) + "s)") + +print("\nKey Observations:") +print("1. SGD with momentum typically provides good baseline performance") +print("2. Adam converges quickly but may not achieve best final accuracy") +print("3. LARS excels with large batch sizes (not fully demonstrated here)") +print("4. Proper learning rate tuning is crucial for each optimizer") +print("5. Batch normalization helps stabilize training across optimizers") + +print("\nNote: This is a demonstration with limited data and epochs.") +print("Full ImageNet training would require:") +print("- 1.2M+ training images") +print("- 90+ epochs") +print("- Proper data augmentation") +print("- Learning rate scheduling") +print("=======================================================\n") \ No newline at end of file diff --git a/scripts/nn/examples/Example-MNIST_Softmax.dml b/scripts/nn/examples/Example-MNIST_Softmax.dml index 6a666698ff8..011278bf775 100644 --- a/scripts/nn/examples/Example-MNIST_Softmax.dml +++ b/scripts/nn/examples/Example-MNIST_Softmax.dml @@ -23,7 +23,7 @@ source("nn/examples/mnist_softmax.dml") as mnist_softmax # Read training data -data = read("mnist_data/mnist_train.csv", format="csv") +data = read("mnist_data/mnist_train.csv", format="csv", header=TRUE) n = nrow(data) # Extract images and labels @@ -45,7 +45,7 @@ epochs = 1 [W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs) # Read test data -data = read("mnist_data/mnist_test.csv", format="csv") +data = read("mnist_data/mnist_test.csv", format="csv", header=TRUE) n = nrow(data) # Extract images and labels diff --git a/scripts/nn/examples/Example-ResNet.dml b/scripts/nn/examples/Example-ResNet.dml index 97b7781573c..81d965df760 100644 --- a/scripts/nn/examples/Example-ResNet.dml +++ b/scripts/nn/examples/Example-ResNet.dml @@ -48,7 +48,7 @@ classes = 1000 # *** adagrad # optimizer_params = resnet18::init_adagrad_optim_params(classes) # *** adam -optimizer_params = resnet18::init_adam_optim_params(classes) +# optimizer_params = resnet18::init_adam_optim_params(classes) # *** rmsprop # optimizer_params = resnet18::init_rmsprop_optim_params(classes) # *** sgd @@ -57,6 +57,8 @@ optimizer_params = resnet18::init_adam_optim_params(classes) # optimizer_params = resnet18::init_sgd_momentumg_optim_params(classes) # *** sgd nesterov # optimizer_params = resnet18::init_sgd_nesterov_optim_params(classes) +# *** lars +optimizer_params = resnet18::init_lars_optim_params(classes) # create random data N = 100 @@ -90,6 +92,11 @@ train = function(matrix[double] X, matrix[double] Y, list[unknown] model, list[u decay_rate = 0.99 # sgd momentum & nesterov momentum = 0.8 + # lars + trust_coeff = 0.001 + momentum = 0.9 + weight_decay = 0.0001 + decay_power = 2 learned_model = list() learned_emas = list() @@ -127,9 +134,9 @@ train = function(matrix[double] X, matrix[double] Y, list[unknown] model, list[u # *** adagrad # [model, optim_params] = resnet18::update_params_with_adagrad(model, gradients, lr, epsilon, optim_params) # *** adam - [model, optim_params] = resnet18::update_params_with_adam(model, gradients, lr, beta1, beta2, epsilon, - t, optim_params) - t = t + 1 + # [model, optim_params] = resnet18::update_params_with_adam(model, gradients, lr, beta1, beta2, epsilon, + # t, optim_params) + # t = t + 1 # *** rmsprop # [model, optim_params] = resnet18::update_params_with_rmsprop(model, gradients, lr, decay_rate, epsilon, # optim_params) @@ -141,6 +148,9 @@ train = function(matrix[double] X, matrix[double] Y, list[unknown] model, list[u # *** sgd nesterov # [model, optim_params] = resnet18::update_params_with_sgd_nesterov(model, gradients, lr, momentum, # optim_params) + # *** lars + [model, optim_params] = resnet18::update_params_with_lars(model, gradients, lr, momentum, weight_decay, trust_coeff, + optim_params) } # reshuffle mini batches diff --git a/scripts/nn/examples/imagenet_alexnet.dml b/scripts/nn/examples/imagenet_alexnet.dml new file mode 100644 index 00000000000..d26d7a0d6c1 --- /dev/null +++ b/scripts/nn/examples/imagenet_alexnet.dml @@ -0,0 +1,334 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# ImageNet AlexNet - Train +# +# This script trains a convolutional net using the "AlexNet" architecture +# on 224x224 ImageNet images using LARS optimizer. +# +# Inputs: +# - train_data: File containing ImageNet training images (features) +# - train_labels: File containing ImageNet training labels (one-hot) +# - val_data: File containing ImageNet validation images (features) +# - val_labels: File containing ImageNet validation labels (one-hot) +# - epochs: [DEFAULT: 30] Total number of full training loops +# - batch_size: [DEFAULT: 256] Mini-batch size for training +# - out_dir: [DEFAULT: "scripts/nn/examples/model/imagenet_alexnet"] Directory to store results +# +# Outputs: +# - accuracy: File containing validation accuracy over epochs +# - loss: File containing training loss over epochs +# +# Sample Invocation: +# ``` +# java -Xmx8g -Xms8g -cp "target/systemds-3.4.0-SNAPSHOT.jar:target/lib/*" \ +# org.apache.sysds.api.DMLScript -f scripts/nn/examples/imagenet_alexnet.dml \ +# -exec singlenode -gpu +# java -Xmx8g -Xms8g -cp "target/systemds-3.4.0-SNAPSHOT.jar:target/lib/*" org.apache.sysds.api.DMLScript -f scripts/nn/examples/imagenet_alexnet.dml -exec singlenode -gpu +# ``` + + + +source("nn/networks/alexnet.dml") as alexnet +source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss + +# Read the ImageNet data +fmt = "csv" +target_size = 224 # For display purposes + +print("Loading ImageNet data (224x224)...") +print("Data directory: imagenet_data/224x224") +print("") + +# Read the data files with constant string paths +print("Reading training data...") +train_data = read("imagenet_data/224x224/imagenet_224x224_train.csv", format=fmt) +train_labels = read("imagenet_data/224x224/imagenet_224x224_train_labels.csv", format=fmt) +print("Reading validation data...") +val_data = read("imagenet_data/224x224/imagenet_224x224_test.csv", format=fmt) +val_labels = read("imagenet_data/224x224/imagenet_224x224_test_labels.csv", format=fmt) +out_dir = "scripts/nn/examples/model/imagenet_alexnet" + +print("Data loaded successfully.") + +# Get dataset dimensions +N = nrow(train_data) +N_val = nrow(val_data) +classes = 1000 + +print("Dataset info:") +print("- Training samples: " + N) +print("- Validation samples: " + N_val) +print("- Features: " + ncol(train_data)) +print("- Classes: " + classes) + +# Scale images to [-1,1] (data is already in [0,1] range from preprocessing) +X = (train_data - 0.5) * 2 +X_val = (val_data - 0.5) * 2 + +# Labels are already one-hot encoded from preprocessing +Y = train_labels +Y_val = val_labels + +print("Data preprocessing completed.") +print("- Image range: [" + min(X) + ", " + max(X) + "]") +print("- Label sum check: " + mean(rowSums(Y))) + +# Get initial model parameters +print("Initializing AlexNet model...") +use_bn = FALSE # Use batch normalization +if (use_bn) { + print("Using AlexNet with Batch Normalization") + [model, emas] = alexnet::init_with_bn(3, 224, 224, classes, 42) +} else { + print("Using standard AlexNet") + model = alexnet::init(3, 224, 224, classes, 42) + emas = list() # Empty for non-BN version +} + +# Get initial optimizer parameters +print("Initializing LARS optimizer...") +optimizer_params = alexnet::init_lars_optim_params(model) + +# Define image properties +Hin = target_size +Win = target_size +C = 3 + +# Define training parameters +epochs = 30 +batch_size = 256 + +print("Training configuration:") +print("- Image size: " + Hin + "x" + Win + "x" + C + " (features: " + (Hin*Win*C) + ")") +print("- Epochs: " + epochs) +print("- Batch size: " + batch_size) +print("- Use Batch Normalization: " + use_bn) +print("") + +print("Starting training...") +[accuracy, loss_metric, learned_model, learned_emas] = train(X, Y, X_val, Y_val, model, emas, N, C, Hin, Win, epochs, batch_size, optimizer_params, use_bn) + +print("Saving results...") +write(accuracy, out_dir + "/imagenet_alexnet_accuracy.csv", format="csv") +write(loss_metric, out_dir + "/imagenet_alexnet_loss.csv", format="csv") + +# Save final metrics +final_accuracy = as.scalar(accuracy[epochs, 1]) +print("Final validation accuracy: " + final_accuracy) + +print("Training completed!") + +train = function(matrix[double] X, matrix[double] Y, matrix[double] X_val, matrix[double] Y_val, list[unknown] model, list[unknown] emas, + int samples, int C, int Hin, int Win, int epochs, int batch_size, list[unknown] optim_params, boolean use_bn) + return (matrix[double] accuracy, matrix[double] loss_metric, + list[unknown] learned_model, list[unknown] learned_emas) { + + # --- HYPERPARAMETERS --- + base_batch_size = 256 # Reference batch size for LR scaling + initial_lr = 0.01 * (batch_size / base_batch_size) # Linear scaling rule + end_lr = 0.00001 + warmup_epochs = 5 + power = 2.0 + momentum = 0.9 + trust_coeff = 0.001 + weight_decay = 0.0005 + + iterations_per_epoch = ceil(samples / batch_size) + total_iterations = epochs * iterations_per_epoch + warmup_iterations = warmup_epochs * iterations_per_epoch + decay_iterations = total_iterations - warmup_iterations + + print("LARS Configuration:") + print("- Base LR: " + (0.01) + " (scaled to " + initial_lr + " for batch size " + batch_size + ")") + print("- End LR: " + end_lr) + print("- Warmup epochs: " + warmup_epochs) + print("- Momentum: " + momentum) + print("- Weight decay: " + weight_decay) + print("- Trust coefficient: " + trust_coeff) + print("- Use BN: " + use_bn) + print("") + + accuracy = matrix(0, rows=epochs, cols=1) + loss_metric = matrix(0, rows=epochs, cols=1) + mode = "train" + + for (epoch in 1:epochs) { + loss_avg = 0.0 + print("Start epoch: " + epoch + "/" + epochs) + + for (i in 1:iterations_per_epoch) { + if (i %% 50 == 1) { print(" - Iteration: " + i + "/" + iterations_per_epoch) } + + # --- DYNAMIC LEARNING RATE --- + current_iteration = (epoch - 1) * iterations_per_epoch + i + if (current_iteration < warmup_iterations) { + current_lr = initial_lr * (as.double(current_iteration) / warmup_iterations) + } else { + decay_step = current_iteration - warmup_iterations + decay_progress = as.double(decay_step) / decay_iterations + current_lr = end_lr + (initial_lr - end_lr) * (1 - decay_progress)^power + } + if (i == 1) { print("Using Learning Rate: " + current_lr) } + + # --- BATCH PREPARATION --- + start = (i - 1) * batch_size + 1 + end = min(samples, i * batch_size) + X_batch = X[start:end,] + Y_batch = Y[start:end,] + + # --- FORWARD AND BACKWARD PASS --- + if (use_bn) { + [out, cached_out, emas] = alexnet::forward_with_bn(X_batch, C, Hin, Win, model, "train", 0.5) + } else { + [out, cached_out] = alexnet::forward(X_batch, C, Hin, Win, model, "train", 0.5) + } + + # Compute loss with L2 regularization + loss = alexnet::compute_loss(out, Y_batch, model, weight_decay) + loss_avg = (loss_avg * (i - 1) + loss) / i + + # Backward pass + dOut = cross_entropy_loss::backward(out, Y_batch) + if (use_bn) { + [dX, gradients] = alexnet::backward_with_bn(dOut, cached_out, model, C, Hin, Win, 0.5) + } else { + [dX, gradients] = alexnet::backward(dOut, cached_out, model, C, Hin, Win, 0.5) + } + + # Update with LARS (weight decay is handled internally by LARS) + [model, optim_params] = alexnet::update_params_with_lars( + model, gradients, current_lr, momentum, weight_decay, + trust_coeff, optim_params) + } + + # --- EVALUATION --- + print("Computing metrics for current epoch...") + if (use_bn) { + accuracy_scalar = predict_and_eval_batched_with_bn(X_val, Y_val, C, Hin, Win, model, emas, batch_size) + } else { + accuracy_scalar = predict_and_eval_batched(X_val, Y_val, C, Hin, Win, model, batch_size) + } + + loss_metric[epoch, 1] = loss_avg + accuracy[epoch, 1] = accuracy_scalar + print("Epoch " + epoch + " completed:") + print("- Avg. Loss: " + loss_avg) + print("- Validation Accuracy: " + accuracy_scalar) + print("") + } + learned_model = model + learned_emas = emas +} + +predict = function(matrix[double] X, int C, int Hin, int Win, + list[unknown] model) + return(matrix[double] out) { + /* + * Computes the class probability predictions using standard AlexNet. + */ + + # Predict on validation dataset + mode = "test" + [out, cached_out] = alexnet::forward(X, C, Hin, Win, model, mode, 0.0) +} + +predict_with_bn = function(matrix[double] X, int C, int Hin, int Win, + list[unknown] model, list[unknown] emas) + return(matrix[double] out) { + /* + * Computes the class probability predictions using AlexNet with Batch Normalization. + */ + + # Predict on validation dataset + mode = "test" + [out, cached_out, emas_temp] = alexnet::forward_with_bn(X, C, Hin, Win, model, mode, 0.0) +} + +predict_and_eval_batched = function(matrix[double] X_val, matrix[double] Y_val, int C, int Hin, int Win, + list[unknown] model, int batch_size) + return(double accuracy) { + /* + * Batched prediction and evaluation for standard AlexNet to avoid memory issues + */ + + N_val = nrow(X_val) + val_iterations = ceil(N_val / batch_size) + correct_total = 0 + mode = "test" + + print(" Evaluating validation set in " + val_iterations + " batches...") + + for (i in 1:val_iterations) { + if (i %% 10 == 1) { + print(" Validation batch: " + i + "/" + val_iterations) + } + + start = (i - 1) * batch_size + 1 + end = min(N_val, i * batch_size) + X_batch = X_val[start:end,] + Y_batch = Y_val[start:end,] + + # Forward pass + [out_batch, cached_out] = alexnet::forward(X_batch, C, Hin, Win, model, mode, 0.0) + + # Count correct predictions + correct_pred = rowIndexMax(out_batch) == rowIndexMax(Y_batch) + correct_total = correct_total + sum(correct_pred) + } + + accuracy = correct_total / N_val +} + +predict_and_eval_batched_with_bn = function(matrix[double] X_val, matrix[double] Y_val, int C, int Hin, int Win, + list[unknown] model, list[unknown] emas, int batch_size) + return(double accuracy) { + /* + * Batched prediction and evaluation for AlexNet with BN to avoid memory issues + */ + + N_val = nrow(X_val) + val_iterations = ceil(N_val / batch_size) + correct_total = 0 + mode = "test" + + print(" Evaluating validation set in " + val_iterations + " batches...") + + for (i in 1:val_iterations) { + if (i %% 10 == 1) { + print(" Validation batch: " + i + "/" + val_iterations) + } + + start = (i - 1) * batch_size + 1 + end = min(N_val, i * batch_size) + X_batch = X_val[start:end,] + Y_batch = Y_val[start:end,] + + # Forward pass + [out_batch, cached_out, emas_temp] = alexnet::forward_with_bn(X_batch, C, Hin, Win, model, mode, 0.0) + + # Count correct predictions + correct_pred = rowIndexMax(out_batch) == rowIndexMax(Y_batch) + correct_total = correct_total + sum(correct_pred) + } + + accuracy = correct_total / N_val +} \ No newline at end of file diff --git a/scripts/nn/examples/imagenet_resnet.dml b/scripts/nn/examples/imagenet_resnet.dml new file mode 100644 index 00000000000..2aba2fd16bb --- /dev/null +++ b/scripts/nn/examples/imagenet_resnet.dml @@ -0,0 +1,307 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# ImageNet Resnet - Train +# +# This script trains a convolutional net using the "ResNet" architecture +# on 64x64 ImageNet images using LARS optimizer. +# +# Inputs: +# - train_data: File containing ImageNet training images (features) +# - train_labels: File containing ImageNet training labels (one-hot) +# - val_data: File containing ImageNet validation images (features) +# - val_labels: File containing ImageNet validation labels (one-hot) +# - epochs: [DEFAULT: 30] Total number of full training loops +# - batch_size: [DEFAULT: 256] Mini-batch size for training +# - out_dir: [DEFAULT: "scripts/nn/examples/model/imagenet_resnet"] Directory to store results +# +# Outputs: +# - accuracy: File containing validation accuracy over epochs +# - loss: File containing training loss over epochs +# +# Sample Invocation: +# ``` +# java -Xmx8g -Xms8g -cp "target/systemds-3.4.0-SNAPSHOT.jar:target/lib/*" \ +# org.apache.sysds.api.DMLScript -f scripts/nn/examples/imagenet_resnet.dml \ +# -exec singlenode -gpu +# ``` + +source("nn/networks/resnet50.dml") as resnet +source("scripts/nn/layers/softmax_cross_entropy_loss.dml") as loss_nn + +# Read the ImageNet data +fmt = "csv" +print("Loading ImageNet data...") +train_data = read("imagenet_data/systemds_ready/imagenet_train_6GB.csv", format=fmt) +train_labels = read("imagenet_data/systemds_ready/imagenet_train_labels_6GB.csv", format=fmt) +val_data = read("imagenet_data/systemds_ready/imagenet_val_6GB.csv", format=fmt) +val_labels = read("imagenet_data/systemds_ready/imagenet_val_labels_6GB.csv", format=fmt) +out_dir = "scripts/nn/examples/model/imagenet_resnet" + +print("Data loaded successfully.") + +# Get dataset dimensions +N = nrow(train_data) +N_val = nrow(val_data) +classes = 1000 + +print("Dataset info:") +print("- Training samples: " + N) +print("- Validation samples: " + N_val) +print("- Features: " + ncol(train_data)) +print("- Classes: " + classes) + +# Scale images to [-1,1] (data is already in [0,1] range from preprocessing) +X = (train_data - 0.5) * 2 +X_val = (val_data - 0.5) * 2 + +# Labels are already one-hot encoded from preprocessing +Y = train_labels +Y_val = val_labels + +print("Data preprocessing completed.") +print("- Image range: [" + min(X) + ", " + max(X) + "]") +print("- Label sum check: " + mean(rowSums(Y))) + +# Get initial model parameters +print("Initializing ResNet-18 model...") +[model, ema_means_vars] = resnet::init(classes, -1) + +# Get initial optimizer parameters +print("Initializing LARS optimizer...") +optimizer_params = resnet::init_lars_optim_params(classes) + +# Define image properties +Hin = 64 +Win = 64 + +# Define training parameters +epochs = 90 +batch_size = 256 + +print("Training configuration:") +print("- Image size: " + Hin + "x" + Win + "x3") +print("- Epochs: " + epochs) +print("- Batch size: " + batch_size) +print("") + +print("Starting training...") +[accuracy, loss_metric, learned_model, learned_emas] = train(X, Y, X_val, Y_val, model, ema_means_vars, N, Hin, Win, epochs, batch_size, optimizer_params) + +print("Saving results...") +write(accuracy, out_dir + "/imagenet_resnet_accuracy.csv", format="csv") +write(loss_metric, out_dir + "/imagenet_resnet_loss.csv", format="csv") + +# Save final metrics +final_accuracy = as.scalar(accuracy[epochs, 1]) +print("Final validation accuracy: " + final_accuracy) + +print("Training completed!") + +# Train function +train = function(matrix[double] X, matrix[double] Y, matrix[double] X_val, matrix[double] Y_val, list[unknown] model, list[unknown] emas, int samples, int Hin, + int Win, int epochs, int batch_size, list[unknown] optim_params) + return (matrix[double] accuracy, matrix[double] loss_metric, + list[unknown] learned_model, list[unknown] learned_emas) { + + # --- LEARNING RATE SCHEDULE HYPERPARAMETERS --- + # The learning rate we want to reach AFTER warmup + initial_lr = 0.01 + # A very small final learning rate to decay towards + end_lr = 0.0001 + # Number of warmup epochs, as per the paper + warmup_epochs = 5 + # The exponent for the polynomial decay, as per the paper + power = 2.0 + + # Optimizer hyperparameters + momentum = 0.9 + trust_coeff = 0.001 + weight_decay = 0.0001 + + # Calculate total iterations for the schedule + iterations_per_epoch = ceil(samples / batch_size) + total_iterations = epochs * iterations_per_epoch + warmup_iterations = warmup_epochs * iterations_per_epoch + decay_iterations = total_iterations - warmup_iterations + + # Initialize metrics + learned_model = list() + learned_emas = list() + accuracy = matrix(0, rows=epochs, cols=1) + loss_metric = matrix(0, rows=epochs, cols=1) + + iterations = ceil(samples/batch_size) + mode = "train" + + for (epoch in 1:epochs) { + loss_avg = 0.0 + + print("Start epoch: " + epoch + "/" + epochs) + + for (i in 1:iterations) { + print(" - Iteration: " + i + "/" + iterations) + + # --- START DYNAMIC LEARNING RATE LOGIC --- + current_iteration = (epoch - 1) * iterations_per_epoch + i + current_lr = 0.0 + + if (current_iteration < warmup_iterations) { + # 1. Linear Warmup Phase + # Linearly increase LR from 0 to initial_lr over warmup_iterations + current_lr = initial_lr * (as.double(current_iteration) / warmup_iterations) + } else { + # 2. Polynomial Decay Phase + decay_step = current_iteration - warmup_iterations + decay_progress = as.double(decay_step) / decay_iterations + current_lr = end_lr + (initial_lr - end_lr) * (1 - decay_progress)^power + } + + if (i == 1) { # Print LR once per epoch to reduce log spam + print("Using Learning Rate: " + current_lr) + } + # --- END DYNAMIC LEARNING RATE LOGIC --- + + # Get batch + start = (i - 1) * batch_size + 1 + end = min(samples, i * batch_size) + X_batch = X[start:end,] + Y_batch = Y[start:end,] + + # Forward pass + [out, emas, cached_out, cached_means_vars] = resnet::forward(X_batch, Hin, Win, model, mode, emas) + + # Loss + loss = loss_nn::forward(out, Y_batch) + if (i %% 10 == 0) { # Print loss same frequency as MNIST + print(" - Iteration: " + i + "/" + iterations + ", Loss: " + loss) + } + loss_avg = (loss_avg * (i - 1) + loss) / i + + # Backward + dOut = loss_nn::backward(out, Y_batch) + [dX, gradients] = resnet::backward(dOut, cached_out, model, cached_means_vars) + + # Update parameters + [model, optim_params] = resnet::update_params_with_lars(model, gradients, current_lr, momentum, weight_decay, trust_coeff, + optim_params) + } + + # Reshuffle mini batches + r = rand(rows=nrow(Y), cols=1, min=0, max=1, pdf="uniform") + X_tmp = order(target=cbind(r, X), by=1) + Y_tmp = order(target=cbind(r, Y), by=1) + X = X_tmp[,2:ncol(X_tmp)] + Y = Y_tmp[,2:ncol(Y_tmp)] + + print("Computing metrics for current epoch...") + + # Predict on the validation dataset with batching to avoid OOM + accuracy_scalar = predict_and_eval_batched(X_val, Y_val, Hin, Win, model, emas, batch_size) + + # Append to the epoch-wise metrics + loss_metric[epoch, 1] = loss_avg + accuracy[epoch, 1] = accuracy_scalar + + print("Epoch Avg. Loss: " + loss_avg) + print("Epoch Accuracy: " + accuracy_scalar) + } + + learned_model = model + learned_emas = emas +} + +predict = function(matrix[double] X, int Hin, int Win, + list[unknown] model, list[unknown] emas) + return(matrix[double] out) { + /* + * Computes the class probability predictions of a convolutional + * net using the "ResNet" architecture. + * + * The input matrix, X, has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * Inputs: + * - X: Input data matrix, of shape (N, C*Hin*Win). + * + * Outputs: + * - probs: Class probabilities, of shape (N, K). + */ + + # Predict on validation dataset + mode = "train" + [out, temp_emas, temp_cached_out, temp_cached_means_vars] = resnet::forward(X, Hin, Win, model, mode, emas) +} + +predict_and_eval_batched = function(matrix[double] X_val, matrix[double] Y_val, int Hin, int Win, + list[unknown] model, list[unknown] emas, int batch_size) + return(double accuracy) { + /* + * Batched prediction and evaluation to avoid memory issues with large validation sets + */ + + N_val = nrow(X_val) + val_iterations = ceil(N_val / batch_size) + correct_total = 0 + mode = "train" + + print(" Evaluating validation set in " + val_iterations + " batches...") + + for (i in 1:val_iterations) { + if (i %% 10 == 1) { + print(" Validation batch: " + i + "/" + val_iterations) + } + + start = (i - 1) * batch_size + 1 + end = min(N_val, i * batch_size) + X_batch = X_val[start:end,] + Y_batch = Y_val[start:end,] + + # Forward pass + [out_batch, temp_emas, temp_cached_out, temp_cached_means_vars] = resnet::forward(X_batch, Hin, Win, model, mode, emas) + + # Count correct predictions + correct_pred = rowIndexMax(out_batch) == rowIndexMax(Y_batch) + correct_total = correct_total + sum(correct_pred) + } + + accuracy = correct_total / N_val +} + +eval = function(matrix[double] probs, matrix[double] Y) + return(double accuracy) { + /* + * Evaluates a convolutional net using the "ResNet" architecture. + * + * The probs matrix contains the class probability predictions + * of K classes over N examples. The targets, Y, have K classes, + * and are one-hot encoded. + * + * Inputs: + * - probs: Class probabilities, of shape (N, K). + * - Y: Target matrix, of shape (N, K). + * + * Outputs: + * - accuracy: Scalar accuracy, of shape (1). + */ + correct_pred = rowIndexMax(probs) == rowIndexMax(Y) + accuracy = mean(correct_pred) +} \ No newline at end of file diff --git a/scripts/nn/examples/load_imagenet_csv.dml b/scripts/nn/examples/load_imagenet_csv.dml new file mode 100644 index 00000000000..52e724b6de4 --- /dev/null +++ b/scripts/nn/examples/load_imagenet_csv.dml @@ -0,0 +1,122 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +#------------------------------------------------------------- +# +# Script to load ImageNet CSV data and convert to binary format +# +#------------------------------------------------------------- + +# Function to load and preprocess ImageNet CSV data +load_and_save_imagenet_data = function() { + print("Loading ImageNet CSV data...") + + # Parameters + num_classes = 10 # Adjust based on your data + + # Use relative paths + train_csv = "imagenet_data/imagenet_train.csv" + val_csv = "imagenet_data/imagenet_val.csv" + + # Output binary files + train_data_file = "imagenet_data/train_data.bin" + train_labels_file = "imagenet_data/train_labels.bin" + val_data_file = "imagenet_data/val_data.bin" + val_labels_file = "imagenet_data/val_labels.bin" + + print("Loading training data from CSV...") + # Read CSV file + train_data = read(train_csv, format="csv", header=FALSE) + + # Force dense + train_data = train_data + 0 + + # Extract labels and features + train_labels = train_data[,1] + train_features = train_data[,2:ncol(train_data)] + + # Get sizes + N_train = nrow(train_features) + D = ncol(train_features) + + print("Training samples: " + N_train) + print("Feature dimension: " + D) + + # Normalize features to [0, 1] + train_features = train_features / 255.0 + + # Convert labels to one-hot encoding + # Adjust labels to be 1-based if they are 0-based + min_label = min(train_labels) + if (min_label == 0) { + train_labels = train_labels + 1 + } + + train_labels_onehot = table(seq(1, N_train), train_labels, N_train, num_classes) + + # Save training data in binary format + print("Saving training data to binary format...") + write(train_features, train_data_file, format="binary") + write(train_labels_onehot, train_labels_file, format="binary") + + print("Loading validation data from CSV...") + # Read validation CSV + val_data = read(val_csv, format="csv", header=FALSE) + + # Force dense + val_data = val_data + 0 + + # Extract labels and features + val_labels = val_data[,1] + val_features = val_data[,2:ncol(val_data)] + + N_val = nrow(val_features) + print("Validation samples: " + N_val) + + # Normalize features + val_features = val_features / 255.0 + + # Convert labels to one-hot encoding + if (min_label == 0) { + val_labels = val_labels + 1 + } + + val_labels_onehot = table(seq(1, N_val), val_labels, N_val, num_classes) + + # Save validation data in binary format + print("Saving validation data to binary format...") + write(val_features, val_data_file, format="binary") + write(val_labels_onehot, val_labels_file, format="binary") + + print("") + print("Data conversion completed!") + print("Binary files created:") + print("- " + train_data_file + " (shape: " + N_train + " x " + D + ")") + print("- " + train_labels_file + " (shape: " + N_train + " x " + num_classes + ")") + print("- " + val_data_file + " (shape: " + N_val + " x " + D + ")") + print("- " + val_labels_file + " (shape: " + N_val + " x " + num_classes + ")") +} + +# Run the conversion +load_and_save_imagenet_data() + +print("") +print("You can now use these binary files in your training script for better performance!") \ No newline at end of file diff --git a/scripts/nn/examples/mnist_resnet.dml b/scripts/nn/examples/mnist_resnet.dml new file mode 100644 index 00000000000..16124dd6c92 --- /dev/null +++ b/scripts/nn/examples/mnist_resnet.dml @@ -0,0 +1,286 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# MNIST Resnet - Train +# +# This script trains a convolutional net using the "ResNet" architecture +# on images of handwritten digits. +# +# Inputs: +# - train: File containing labeled MNIST training images. +# The format is "label, pixel_1, pixel_2, ..., pixel_n". +# - test: File containing labeled MNIST test images. +# The format is "label, pixel_1, pixel_2, ..., pixel_n". +# - C: Number of color chanels in the images. +# - Hin: Input image height. +# - Win: Input image width. +# - epochs: [DEFAULT: 10] Total number of full training loops over +# the full data set. +# - out_dir: [DEFAULT: "."] Directory to store weights and bias +# matrices of trained model, as well as final test accuracy. +# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data. +# Options include: "csv", "mm", "text", and "binary". +# +# Outputs: +# - W1, W2, W3, W4: Files containing the trained weights of the model. +# - b1, b2, b3, b4: Files containing the trained biases of the model. +# - accuracy: File containing the accuracy and loss on the test data over all epochs. +# +# Data: +# The MNIST dataset contains labeled images of handwritten digits, +# where each example is a 28x28 pixel image of grayscale values in +# the range [0,255] stretched out as 784 pixels, and each label is +# one of 10 possible digits in [0,9]. +# +# Sample Invocation (running from outside the `nn` folder): +# 1. Download data (60,000 training examples, and 10,000 test examples) +# ``` +# nn/examples/get_mnist_data.sh +# ``` +# +# 2. Execute using Spark +# ``` +# spark-submit --master local[*] --driver-memory 10G +# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128 +# $SYSTEMDS_ROOT/target/SystemDS.jar -f nn/examples/mnist_resnet.dml +# -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv +# C=1 Hin=28 Win=28 epochs=10 out_dir=nn/examples/model/mnist_resnet +# ``` +# + +source("nn/networks/resnet18.dml") as resnet +source("scripts/nn/layers/softmax_cross_entropy_loss.dml") as loss_nn + +# Read the data +fmt = "csv" +train = read("scripts/nn/examples/data/mnist_train.csv", format=fmt) +test = read("scripts/nn/examples/data/mnist_test.csv", format=fmt) +out_dir = "scripts/nn/example/model/mnist_resnet" + +# Extract images and labels +images = train[,2:ncol(train)] +labels = train[,1] +images_test = test[,2:ncol(test)] +labels_test = test[,1] +classes = 10 + +# Scale images to [-1,1], and one-hot encode the labels +N = nrow(images) +N_test = nrow(images_test) +X = (images / 255.0) * 2 - 1 +X = cbind(X, X, X) # Resnet assumes C=3 so we duplicate the data along the channels +Y = table(seq(1, N), labels+1, N, 10) +X_test = (images_test / 255.0) * 2 - 1 +X_test = cbind(X_test, X_test, X_test) +Y_test = table(seq(1, N_test), labels_test+1, N_test, 10) + +# Split into training (55,000 examples) and validation (5,000 examples) +#X = images[5001:nrow(images),] +#X_val = images[1:5000,] +#Y = labels[5001:nrow(images),] +#Y_val = labels[1:5000,] + +# Get initial model parameters +[model, ema_means_vars] = resnet::init(classes, -1) + +# Get initial optimizer parameters +optimizer_params = resnet::init_lars_optim_params(classes) +# optimizer_params = resnet::init_sgd_momentum_optim_params(classes) +# optimizer_params = resnet::init_adam_optim_params(classes) + +# Define image properties +Hin = 28 +Win = 28 +#N_val = 0 + +# Define training parameters +epochs = 90 +batch_size = 512 + +[accuracy, loss_metric, learned_model, learned_emas] = train(X, Y, X_test, Y_test, model, ema_means_vars, N, Hin, Win, epochs, batch_size, optimizer_params) + +write(accuracy, "scripts/nn/examples/out/resnet_mnist_accuracy.csv", format="csv") +write(loss_metric, "scripts/nn/examples/out/resnet_mnist_loss.csv", format="csv") + +#Train +train = function(matrix[double] X, matrix[double] Y, matrix[double] X_test, matrix[double] Y_test, list[unknown] model, list[unknown] emas, int samples, int Hin, + int Win, int epochs, int batch_size, list[unknown] optim_params) + return (matrix[double] accuracy, matrix[double] loss_metric, + list[unknown] learned_model, list[unknown] learned_emas) { + + # --- LEARNING RATE SCHEDULE HYPERPARAMETERS --- + # The learning rate we want to reach AFTER warmup + initial_lr = 0.01 + # A very small final learning rate to decay towards + end_lr = 0.0001 + # Number of warmup epochs, as per the paper + warmup_epochs = 5 + # The exponent for the polynomial decay, as per the paper + power = 2.0 + + # Optimizer hyperparameters + momentum = 0.9 + trust_coeff = 0.001 + weight_decay = 0.0001 + + # Adam optimizer hyperparameters + beta1 = 0.9 + beta2 = 0.999 + epsilon = 1e-8 + + # Calculate total iterations for the schedule + iterations_per_epoch = ceil(samples / batch_size) + total_iterations = epochs * iterations_per_epoch + warmup_iterations = warmup_epochs * iterations_per_epoch + decay_iterations = total_iterations - warmup_iterations + + # Initialize metrics + learned_model = list() + learned_emas = list() + accuracy = matrix(0, rows=epochs, cols=1) + loss_metric = matrix(0, rows=epochs, cols=1) + + iterations = ceil(samples/batch_size) + mode = "train" + + for (epoch in 1:epochs) { + loss_avg = 0.0 + + print("Start epoch: " + epoch + "/" + epochs) + + for (i in 1:iterations) { + print(" - Iteration: " + i + "/" + iterations) + + # --- START DYNAMIC LEARNING RATE LOGIC --- + current_iteration = (epoch - 1) * iterations_per_epoch + i + current_lr = 0.0 + + if (current_iteration < warmup_iterations) { + # 1. Linear Warmup Phase + # Linearly increase LR from 0 to initial_lr over warmup_iterations + current_lr = initial_lr * (as.double(current_iteration) / warmup_iterations) + } else { + # 2. Polynomial Decay Phase + decay_step = current_iteration - warmup_iterations + decay_progress = as.double(decay_step) / decay_iterations + current_lr = end_lr + (initial_lr - end_lr) * (1 - decay_progress)^power + } + + if (i == 1) { # Print LR once per epoch to reduce log spam + print("Using Learning Rate: " + current_lr) + } + # --- END DYNAMIC LEARNING RATE LOGIC --- + + # Get batch + start = (i - 1) * batch_size + 1 + end = min(samples, i * batch_size) + X_batch = X[start:end,] + Y_batch = Y[start:end,] + + # Forward pass + [out, emas, cached_out, cached_means_vars] = resnet::forward(X_batch, Hin, Win, model, mode, emas) + + # Loss + loss = loss_nn::forward(out, Y_batch) + if (i %% 10 == 0) { # Print loss less frequently on large datasets + print(" - Iteration: " + i + "/" + iterations + ", Loss: " + loss) + } + loss_avg = (loss_avg * (i - 1) + loss) / i + + # Backward + dOut = loss_nn::backward(out, Y_batch) + [dX, gradients] = resnet::backward(dOut, cached_out, model, cached_means_vars) + + # Update parameters + [model, optim_params] = resnet::update_params_with_lars(model, gradients, current_lr, momentum, weight_decay, trust_coeff, + optim_params) + # [model, optim_params] = resnet::update_params_with_sgd_momentum(model, gradients, current_lr, momentum, optim_params) + + # [model, optim_params] = resnet::update_params_with_adam(model, gradients, current_lr, beta1, beta2, epsilon, current_iteration, optim_params) + } + + # Reshuffle mini batches + r = rand(rows=nrow(Y), cols=1, min=0, max=1, pdf="uniform") + X_tmp = order(target=cbind(r, X), by=1) + Y_tmp = order(target=cbind(r, Y), by=1) + X = X_tmp[,2:ncol(X_tmp)] + Y = Y_tmp[,2:ncol(Y_tmp)] + + print("Computing metrics for current epoch...") + + # Predict on the test dataset + out = predict(X_test, Hin, Win, model, emas) + accuracy_scalar = eval(out, Y_test) + + # Append to the epoch-wise metrics + loss_metric[epoch, 1] = loss_avg + accuracy[epoch, 1] = accuracy_scalar + + print("Epoch Avg. Loss: " + loss_avg) + print("Epoch Accuracy: " + accuracy_scalar) + } + + learned_model = model + learned_emas = emas +} + +predict = function(matrix[double] X, int Hin, int Win, + list[unknown] model, list[unknown] emas) + return(matrix[double] out) { + /* + * Computes the class probability predictions of a convolutional + * net using the "ResNet" architecture. + * + * The input matrix, X, has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * Inputs: + * - X: Input data matrix, of shape (N, C*Hin*Win). + * + * Outputs: + * - probs: Class probabilities, of shape (N, K). + */ + + # Predict on test dataset + mode = "train" + [out, temp_emas, temp_cached_out, temp_cached_means_vars] = resnet::forward(X, Hin, Win, model, mode, emas) +} + + +eval = function(matrix[double] probs, matrix[double] Y) + return(double accuracy) { + /* + * Evaluates a convolutional net using the "ResNet" architecture. + * + * The probs matrix contains the class probability predictions + * of K classes over N examples. The targets, Y, have K classes, + * and are one-hot encoded. + * + * Inputs: + * - probs: Class probabilities, of shape (N, K). + * - Y: Target matrix, of shape (N, K). + * + * Outputs: + * - accuracy: Scalar accuracy, of shape (1). + */ + correct_pred = rowIndexMax(probs) == rowIndexMax(Y) + accuracy = mean(correct_pred) +} \ No newline at end of file diff --git a/scripts/nn/layers/lrn.dml b/scripts/nn/layers/lrn.dml new file mode 100644 index 00000000000..bd1dae3dc45 --- /dev/null +++ b/scripts/nn/layers/lrn.dml @@ -0,0 +1,153 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Local Response Normalization (LRN) layer. + */ + +forward = function(matrix[double] X, int C, int Hin, int Win, + int N, double alpha, double beta, double K) + return (matrix[double] Y) { + /* + * Computes the forward pass for a Local Response Normalization + * (LRN) layer. The LRN layer performs a lateral normalization + * over channels at each spatial location. + * + * This is the cross-channel LRN used in AlexNet: + * `y_{x,y}^i = x_{x,y}^i / (K + alpha * sum_{j=max(0,i-n/2)}^{min(C-1,i+n/2)} (x_{x,y}^j)^2)^beta` + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - C: Number of input channels. + * - Hin: Input height. + * - Win: Input width. + * - N: Number of channels to sum over (i.e. size of local region). + * - alpha: Scaling parameter. + * - beta: Exponent parameter. + * - K: Additive constant to avoid divide-by-zero. + * + * Outputs: + * - Y: Outputs, of shape (N, C*Hin*Win). + */ + N_batch = nrow(X) + + # Initialize output + Y = matrix(0, rows=N_batch, cols=C*Hin*Win) + + # Reshape for easier manipulation + X_reshaped = matrix(X, rows=N_batch, cols=C*Hin*Win) + + # Compute normalization + half_N = as.integer(N / 2) + + for (i in 1:N_batch) { + # Get current sample + x = matrix(X_reshaped[i,], rows=C, cols=Hin*Win, byrow=TRUE) + y = matrix(0, rows=C, cols=Hin*Win) + + # For each channel + for (c in 1:C) { + # Define the local region + j_start = max(1, c - half_N) + j_end = min(C, c + half_N) + + # Compute sum of squares in the local region + scale = matrix(K, rows=1, cols=Hin*Win) + for (j in j_start:j_end) { + scale = scale + alpha * (x[j,])^2 + } + + # Apply normalization + y[c,] = x[c,] / (scale^beta) + } + + # Reshape back and store + Y[i,] = matrix(y, rows=1, cols=C*Hin*Win, byrow=TRUE) + } +} + +backward = function(matrix[double] dY, matrix[double] X, int C, int Hin, int Win, + int N, double alpha, double beta, double K) + return (matrix[double] dX) { + /* + * Computes the backward pass for a Local Response Normalization layer. + * + * Inputs: + * - dY: Gradient wrt Y, of shape (N, C*Hin*Win). + * - X: Inputs, of shape (N, C*Hin*Win). + * - C: Number of input channels. + * - Hin: Input height. + * - Win: Input width. + * - N: Number of channels to sum over. + * - alpha: Scaling parameter. + * - beta: Exponent parameter. + * - K: Additive constant. + * + * Outputs: + * - dX: Gradient wrt X, of shape (N, C*Hin*Win). + */ + N_batch = nrow(X) + + # Initialize gradient + dX = matrix(0, rows=N_batch, cols=C*Hin*Win) + + # Reshape for easier manipulation + X_reshaped = matrix(X, rows=N_batch, cols=C*Hin*Win) + dY_reshaped = matrix(dY, rows=N_batch, cols=C*Hin*Win) + + half_N = as.integer(N / 2) + + for (i in 1:N_batch) { + # Get current sample + x = matrix(X_reshaped[i,], rows=C, cols=Hin*Win, byrow=TRUE) + dy = matrix(dY_reshaped[i,], rows=C, cols=Hin*Win, byrow=TRUE) + dx = matrix(0, rows=C, cols=Hin*Win) + + # First, compute the scale values for all channels + scale = matrix(K, rows=C, cols=Hin*Win) + for (c in 1:C) { + j_start = max(1, c - half_N) + j_end = min(C, c + half_N) + for (j in j_start:j_end) { + scale[c,] = scale[c,] + alpha * (x[j,])^2 + } + } + + # Compute gradients + for (c in 1:C) { + # Channels that this channel influences + k_start = max(1, c - half_N) + k_end = min(C, c + half_N) + + for (k in k_start:k_end) { + if (k == c) { + # Gradient from own normalization + dx[c,] = dx[c,] + dy[k,] * scale[k,]^(-beta) + } + # Gradient from normalizing other channels + dx[c,] = dx[c,] - 2 * alpha * beta * dy[k,] * x[k,] * x[c,] * scale[k,]^(-beta-1) + } + } + + # Reshape back and store + dX[i,] = matrix(dx, rows=1, cols=C*Hin*Win, byrow=TRUE) + } +} \ No newline at end of file diff --git a/scripts/nn/layers/softmax_cross_entropy_loss.dml b/scripts/nn/layers/softmax_cross_entropy_loss.dml new file mode 100644 index 00000000000..8952d92d2cc --- /dev/null +++ b/scripts/nn/layers/softmax_cross_entropy_loss.dml @@ -0,0 +1,73 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Softmax Cross-Entropy loss function. + * This combines the Softmax activation with the Cross-Entropy loss. + */ + +forward = function(matrix[double] logits, matrix[double] y) + return (double loss) { + /* + * Computes the forward pass for a Softmax Cross-Entropy loss function. + * + * Inputs: + * - logits: Raw scores from the network, of shape (N, K). + * - y: Target one-hot encoded labels, of shape (N, K). + * + * Outputs: + * - loss: Average loss. + */ + N = nrow(y) + + # Numerically stable Softmax + # Subtracting the max logit from each row prevents overflow when taking exp() + shifted_logits = logits - rowMaxs(logits) + probs = exp(shifted_logits) / rowSums(exp(shifted_logits)) + + # Cross-entropy loss calculation + # Adding a small epsilon for numerical stability to avoid log(0) + eps = 1e-9 + loss = -sum(y * log(probs + eps)) / N +} + +backward = function(matrix[double] logits, matrix[double] y) + return (matrix[double] d_logits) { + /* + * Computes the backward pass for a Softmax Cross-Entropy loss function. + * The gradient of the combined Softmax and Cross-Entropy is remarkably simple. + * + * Inputs: + * - logits: Raw scores from the network, of shape (N, K). + * - y: Target one-hot encoded labels, of shape (N, K). + * + * Outputs: + * - d_logits: Gradient with respect to the input logits, of shape (N, K). + */ + N = nrow(y) + + # Recompute the probabilities (softmax) + shifted_logits = logits - rowMaxs(logits) + probs = exp(shifted_logits) / rowSums(exp(shifted_logits)) + + # The gradient is simply (probabilities - true_labels) + d_logits = (probs - y) / N +} \ No newline at end of file diff --git a/scripts/nn/networks/alexnet.dml b/scripts/nn/networks/alexnet.dml new file mode 100644 index 00000000000..829ca633b40 --- /dev/null +++ b/scripts/nn/networks/alexnet.dml @@ -0,0 +1,1162 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * AlexNet: Deep Convolutional Neural Network + * + * Reference: "ImageNet Classification with Deep Convolutional Neural Networks" + * by Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton (2012) + * + * This implementation provides a flexible, modular AlexNet architecture + * suitable for various computer vision tasks. + */ + +# Import layer implementations +source("nn/layers/affine.dml") as affine +source("nn/layers/conv2d_builtin.dml") as conv2d +source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss +source("nn/layers/dropout.dml") as dropout +source("nn/layers/l2_reg.dml") as l2_reg +source("nn/layers/max_pool2d_builtin.dml") as max_pool2d +source("nn/layers/relu.dml") as relu +source("nn/layers/softmax.dml") as softmax + +# Import optimizers +source("nn/optim/sgd.dml") as sgd +source("nn/optim/sgd_momentum.dml") as sgd_momentum +source("nn/optim/sgd_nesterov.dml") as sgd_nesterov +source("nn/optim/adam.dml") as adam +source("nn/optim/adagrad.dml") as adagrad +source("nn/optim/rmsprop.dml") as rmsprop +source("nn/optim/lars.dml") as lars + +# Import batch normalization +source("nn/layers/batch_norm2d.dml") as batch_norm2d + +/* + * Forward and backward pass. + */ + +forward = function(matrix[double] X, int C, int Hin, int Win, + list[unknown] model, string mode, double dropout_prob) + return (matrix[double] out, list[unknown] cached_out) { + /* + * Forward pass of the AlexNet model. + * + * Architecture: + * - Conv1: 96 filters, 11x11, stride 4, pad 0 -> ReLU -> MaxPool 3x3, stride 2 + * - Conv2: 256 filters, 5x5, stride 1, pad 2 -> ReLU -> MaxPool 3x3, stride 2 + * - Conv3: 384 filters, 3x3, stride 1, pad 1 -> ReLU + * - Conv4: 384 filters, 3x3, stride 1, pad 1 -> ReLU + * - Conv5: 256 filters, 3x3, stride 1, pad 1 -> ReLU -> MaxPool 3x3, stride 2 + * - FC1: 4096 neurons -> ReLU -> Dropout + * - FC2: 4096 neurons -> ReLU -> Dropout + * - FC3: num_classes neurons -> Softmax + * + * Inputs: + * - X: Input data, of shape (N, C*Hin*Win). + * - C: Number of input channels (3 for RGB). + * - Hin: Input height (256 for ImageNet). + * - Win: Input width (256 for ImageNet). + * - model: List of model parameters with the following structure: + * -> 1: Conv1 weights, of shape (96, C*11*11) + * -> 2: Conv1 bias, of shape (96, 1) + * -> 3: Conv2 weights, of shape (256, 96*5*5) + * -> 4: Conv2 bias, of shape (256, 1) + * -> 5: Conv3 weights, of shape (384, 256*3*3) + * -> 6: Conv3 bias, of shape (384, 1) + * -> 7: Conv4 weights, of shape (384, 384*3*3) + * -> 8: Conv4 bias, of shape (384, 1) + * -> 9: Conv5 weights, of shape (256, 384*3*3) + * -> 10: Conv5 bias, of shape (256, 1) + * -> 11: FC1 weights, of shape (fc_input_size, 4096) + * -> 12: FC1 bias, of shape (1, 4096) + * -> 13: FC2 weights, of shape (4096, 4096) + * -> 14: FC2 bias, of shape (1, 4096) + * -> 15: FC3 weights, of shape (4096, num_classes) + * -> 16: FC3 bias, of shape (1, num_classes) + * - mode: 'train' or 'test' for dropout behavior + * - dropout_prob: Dropout probability (typically 0.5) + * + * Outputs: + * - out: Output predictions, of shape (N, num_classes) + * - cached_out: Cached intermediate outputs for backward pass + */ + + # Extract model parameters + W1 = as.matrix(model[1]); b1 = as.matrix(model[2]) + W2 = as.matrix(model[3]); b2 = as.matrix(model[4]) + W3 = as.matrix(model[5]); b3 = as.matrix(model[6]) + W4 = as.matrix(model[7]); b4 = as.matrix(model[8]) + W5 = as.matrix(model[9]); b5 = as.matrix(model[10]) + W6 = as.matrix(model[11]); b6 = as.matrix(model[12]) + W7 = as.matrix(model[13]); b7 = as.matrix(model[14]) + W8 = as.matrix(model[15]); b8 = as.matrix(model[16]) + + # Forward pass + # Conv1 -> ReLU -> MaxPool1 + [outc1, Houtc1, Woutc1] = conv2d::forward(X, W1, b1, C, Hin, Win, 11, 11, 4, 4, 2, 2) + outr1 = relu::forward(outc1) + [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, 96, Houtc1, Woutc1, 3, 3, 2, 2, 0, 0) + + # Conv2 -> ReLU -> MaxPool2 + [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, 96, Houtp1, Woutp1, 5, 5, 1, 1, 2, 2) + outr2 = relu::forward(outc2) + [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, 256, Houtc2, Woutc2, 3, 3, 2, 2, 0, 0) + + # Conv3 -> ReLU + [outc3, Houtc3, Woutc3] = conv2d::forward(outp2, W3, b3, 256, Houtp2, Woutp2, 3, 3, 1, 1, 1, 1) + outr3 = relu::forward(outc3) + + # Conv4 -> ReLU + [outc4, Houtc4, Woutc4] = conv2d::forward(outr3, W4, b4, 384, Houtc3, Woutc3, 3, 3, 1, 1, 1, 1) + outr4 = relu::forward(outc4) + + # Conv5 -> ReLU -> MaxPool3 + [outc5, Houtc5, Woutc5] = conv2d::forward(outr4, W5, b5, 384, Houtc4, Woutc4, 3, 3, 1, 1, 1, 1) + outr5 = relu::forward(outc5) + [outp5, Houtp5, Woutp5] = max_pool2d::forward(outr5, 256, Houtc5, Woutc5, 3, 3, 2, 2, 0, 0) + + # FC1 -> ReLU -> Dropout + outa6 = affine::forward(outp5, W6, b6) + outr6 = relu::forward(outa6) + if (mode == "train") { + [outd6, maskd6] = dropout::forward(outr6, dropout_prob, -1) + } else { + outd6 = outr6 + maskd6 = matrix(1, rows=nrow(outr6), cols=ncol(outr6)) + } + + # FC2 -> ReLU -> Dropout + outa7 = affine::forward(outd6, W7, b7) + outr7 = relu::forward(outa7) + if (mode == "train") { + [outd7, maskd7] = dropout::forward(outr7, dropout_prob, -1) + } else { + outd7 = outr7 + maskd7 = matrix(1, rows=nrow(outr7), cols=ncol(outr7)) + } + + # FC3 -> Softmax + outa8 = affine::forward(outd7, W8, b8) + out = softmax::forward(outa8) + + # Cache intermediate outputs for backward pass + cached_out = list(X, outc1, Houtc1, Woutc1, outr1, outp1, Houtp1, Woutp1, + outc2, Houtc2, Woutc2, outr2, outp2, Houtp2, Woutp2, + outc3, Houtc3, Woutc3, outr3, outc4, Houtc4, Woutc4, outr4, + outc5, Houtc5, Woutc5, outr5, outp5, Houtp5, Woutp5, + outa6, outr6, outd6, maskd6, outa7, outr7, outd7, maskd7, outa8) +} + +backward = function(matrix[double] dOut, list[unknown] cached_out, + list[unknown] model, int C, int Hin, int Win, double dropout_prob) + return (matrix[double] dX, list[unknown] gradients) { + /* + * Backward pass of the AlexNet model. + * + * Inputs: + * - dOut: Gradient w.r.t. output, of shape (N, num_classes) + * - cached_out: Cached outputs from forward pass + * - model: Model parameters (same structure as forward pass) + * - C, Hin, Win: Input dimensions + * - dropout_prob: Dropout probability used in forward pass + * + * Outputs: + * - dX: Gradient w.r.t. input, of shape (N, C*Hin*Win) + * - gradients: List of gradients for all parameters (same structure as model) + */ + + # Extract model parameters + W1 = as.matrix(model[1]); b1 = as.matrix(model[2]) + W2 = as.matrix(model[3]); b2 = as.matrix(model[4]) + W3 = as.matrix(model[5]); b3 = as.matrix(model[6]) + W4 = as.matrix(model[7]); b4 = as.matrix(model[8]) + W5 = as.matrix(model[9]); b5 = as.matrix(model[10]) + W6 = as.matrix(model[11]); b6 = as.matrix(model[12]) + W7 = as.matrix(model[13]); b7 = as.matrix(model[14]) + W8 = as.matrix(model[15]); b8 = as.matrix(model[16]) + + # Extract cached outputs + X = as.matrix(cached_out[1]) + outc1 = as.matrix(cached_out[2]); Houtc1 = as.scalar(cached_out[3]); Woutc1 = as.scalar(cached_out[4]) + outr1 = as.matrix(cached_out[5]) + outp1 = as.matrix(cached_out[6]); Houtp1 = as.scalar(cached_out[7]); Woutp1 = as.scalar(cached_out[8]) + outc2 = as.matrix(cached_out[9]); Houtc2 = as.scalar(cached_out[10]); Woutc2 = as.scalar(cached_out[11]) + outr2 = as.matrix(cached_out[12]) + outp2 = as.matrix(cached_out[13]); Houtp2 = as.scalar(cached_out[14]); Woutp2 = as.scalar(cached_out[15]) + outc3 = as.matrix(cached_out[16]); Houtc3 = as.scalar(cached_out[17]); Woutc3 = as.scalar(cached_out[18]) + outr3 = as.matrix(cached_out[19]) + outc4 = as.matrix(cached_out[20]); Houtc4 = as.scalar(cached_out[21]); Woutc4 = as.scalar(cached_out[22]) + outr4 = as.matrix(cached_out[23]) + outc5 = as.matrix(cached_out[24]); Houtc5 = as.scalar(cached_out[25]); Woutc5 = as.scalar(cached_out[26]) + outr5 = as.matrix(cached_out[27]) + outp5 = as.matrix(cached_out[28]); Houtp5 = as.scalar(cached_out[29]); Woutp5 = as.scalar(cached_out[30]) + outa6 = as.matrix(cached_out[31]); outr6 = as.matrix(cached_out[32]) + outd6 = as.matrix(cached_out[33]); maskd6 = as.matrix(cached_out[34]) + outa7 = as.matrix(cached_out[35]); outr7 = as.matrix(cached_out[36]) + outd7 = as.matrix(cached_out[37]); maskd7 = as.matrix(cached_out[38]) + outa8 = as.matrix(cached_out[39]) + + # Backward pass + # FC3 + douta8 = softmax::backward(dOut, outa8) + [doutd7, dW8, db8] = affine::backward(douta8, outd7, W8, b8) + + # FC2 + doutr7 = dropout::backward(doutd7, outr7, dropout_prob, maskd7) + douta7 = relu::backward(doutr7, outa7) + [doutd6, dW7, db7] = affine::backward(douta7, outd6, W7, b7) + + # FC1 + doutr6 = dropout::backward(doutd6, outr6, dropout_prob, maskd6) + douta6 = relu::backward(doutr6, outa6) + [doutp5, dW6, db6] = affine::backward(douta6, outp5, W6, b6) + + # Conv5 + doutr5 = max_pool2d::backward(doutp5, Houtp5, Woutp5, outr5, 256, Houtc5, Woutc5, 3, 3, 2, 2, 0, 0) + doutc5 = relu::backward(doutr5, outc5) + [doutr4, dW5, db5] = conv2d::backward(doutc5, Houtc5, Woutc5, outr4, W5, b5, 384, Houtc4, Woutc4, 3, 3, 1, 1, 1, 1) + + # Conv4 + doutc4 = relu::backward(doutr4, outc4) + [doutr3, dW4, db4] = conv2d::backward(doutc4, Houtc4, Woutc4, outr3, W4, b4, 384, Houtc3, Woutc3, 3, 3, 1, 1, 1, 1) + + # Conv3 + doutc3 = relu::backward(doutr3, outc3) + [doutp2, dW3, db3] = conv2d::backward(doutc3, Houtc3, Woutc3, outp2, W3, b3, 256, Houtp2, Woutp2, 3, 3, 1, 1, 1, 1) + + # Conv2 + doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, 256, Houtc2, Woutc2, 3, 3, 2, 2, 0, 0) + doutc2 = relu::backward(doutr2, outc2) + [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, 96, Houtp1, Woutp1, 5, 5, 1, 1, 2, 2) + + # Conv1 + doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, 96, Houtc1, Woutc1, 3, 3, 2, 2, 0, 0) + doutc1 = relu::backward(doutr1, outc1) + [dX, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X, W1, b1, C, Hin, Win, 11, 11, 4, 4, 2, 2) + + # Package gradients + gradients = list(dW1, db1, dW2, db2, dW3, db3, dW4, db4, dW5, db5, dW6, db6, dW7, db7, dW8, db8) +} + +/* + * Helper function to calculate output dimensions after convolutions and pooling + */ + +calculate_conv_output_size = function(int Hin, int Win) + return (int fc_input_size) { + /* + * Calculate the input size for the first fully connected layer + * based on the actual input dimensions after all conv and pooling layers. + * + * Current AlexNet architecture: + * 1. Conv1: 96 filters, 11x11, stride 4, pad 2 + * 2. MaxPool1: 3x3, stride 2, pad 0 + * 3. Conv2: 256 filters, 5x5, stride 1, pad 2 + * 4. MaxPool2: 3x3, stride 2, pad 0 + * 5. Conv3: 384 filters, 3x3, stride 1, pad 1 + * 6. Conv4: 384 filters, 3x3, stride 1, pad 1 + * 7. Conv5: 256 filters, 3x3, stride 1, pad 1 + * 8. MaxPool3: 3x3, stride 2, pad 0 + */ + + # Start with input dimensions + H = as.double(Hin) + W = as.double(Win) + + print("Input dimensions: " + Hin + "x" + Win) + + # Conv1: 11x11, stride 4, pad 2 + H = floor((H - 11 + 4) / 4) + 1 # pad 2 on each side = 4 total + W = floor((W - 11 + 4) / 4) + 1 + print("After Conv1: " + as.integer(H) + "x" + as.integer(W)) + + # MaxPool1: 3x3, stride 2, pad 0 + H = floor((H - 3 + 0) / 2) + 1 + W = floor((W - 3 + 0) / 2) + 1 + print("After MaxPool1: " + as.integer(H) + "x" + as.integer(W)) + + # Conv2: 5x5, stride 1, pad 2 + H = floor((H - 5 + 4) / 1) + 1 + W = floor((W - 5 + 4) / 1) + 1 + print("After Conv2: " + as.integer(H) + "x" + as.integer(W)) + + # MaxPool2: 3x3, stride 2, pad 0 + H = floor((H - 3 + 0) / 2) + 1 + W = floor((W - 3 + 0) / 2) + 1 + print("After MaxPool2: " + as.integer(H) + "x" + as.integer(W)) + + # Conv3: 3x3, stride 1, pad 1 + H = floor((H - 3 + 2) / 1) + 1 + W = floor((W - 3 + 2) / 1) + 1 + print("After Conv3: " + as.integer(H) + "x" + as.integer(W)) + + # Conv4: 3x3, stride 1, pad 1 + H = floor((H - 3 + 2) / 1) + 1 + W = floor((W - 3 + 2) / 1) + 1 + print("After Conv4: " + as.integer(H) + "x" + as.integer(W)) + + # Conv5: 3x3, stride 1, pad 1 + H = floor((H - 3 + 2) / 1) + 1 + W = floor((W - 3 + 2) / 1) + 1 + print("After Conv5: " + as.integer(H) + "x" + as.integer(W)) + + # MaxPool3: 3x3, stride 2, pad 0 + H = floor((H - 3 + 0) / 2) + 1 + W = floor((W - 3 + 0) / 2) + 1 + print("After MaxPool3: " + as.integer(H) + "x" + as.integer(W)) + + # Handle edge case where dimensions become 0 or negative + if (H <= 0 | W <= 0) { + print("ERROR: Spatial dimensions became 0 or negative!") + print("Input size " + Hin + "x" + Win + " is too small for AlexNet architecture.") + print("Consider using larger input images or adjusting the architecture.") + stop("Invalid spatial dimensions") + } + + # Final dimensions: 256 channels with H x W spatial size + fc_input_size = as.integer(256 * H * W) + + print("Final FC input size: " + fc_input_size + " (spatial: " + as.integer(H) + "x" + as.integer(W) + " x 256 channels)") +} + +/* + * Model initialization. + */ + +init = function(int C, int Hin, int Win, int num_classes, int seed) + return (list[unknown] model) { + /* + * Initialize AlexNet model parameters. + * + * Inputs: + * - C: Number of input channels (3 for RGB) + * - Hin: Input height (supports various sizes, e.g., 224, 256) + * - Win: Input width (supports various sizes, e.g., 224, 256) + * - num_classes: Number of output classes + * - seed: Random seed for initialization + * + * Outputs: + * - model: List of initialized model parameters + */ + + # Calculate fully connected input size based on actual input dimensions + fc_input_size = calculate_conv_output_size(Hin, Win) + + # --- Explicit AlexNet weight init for Conv layers --- + # All weights ∼ N(0,0.01), all biases = 0 (following original AlexNet paper) + + # Conv1: 96 11x11 filters + W1 = rand(rows=96, cols=C * 11 * 11, pdf="normal", seed=seed) * 0.01 # 96 × (C·11·11) + b1 = matrix(0.0, rows=96, cols=1) + + # Conv2: 256 5x5 filters + W2 = rand(rows=256, cols=96 * 5 * 5, pdf="normal", seed=seed) * 0.01 # 256 × (96·5·5) + b2 = matrix(0.0, rows=256, cols=1) + + # Conv3: 384 3x3 filters + W3 = rand(rows=384, cols=256 * 3 * 3, pdf="normal", seed=seed) * 0.01 # 384 × (256·3·3) + b3 = matrix(0.0, rows=384, cols=1) + + # Conv4: 384 3x3 filters + W4 = rand(rows=384, cols=384 * 3 * 3, pdf="normal", seed=seed) * 0.01 # 384 × (384·3·3) + b4 = matrix(0.0, rows=384, cols=1) + + # Conv5: 256 3x3 filters + W5 = rand(rows=256, cols=384 * 3 * 3, pdf="normal", seed=seed) * 0.01 # 256 × (384·3·3) + b5 = matrix(0.0, rows=256, cols=1) + + # --- Explicit AlexNet weight init for FC layers --- + # FC1: fc_input_size → 4096 + W6 = rand(rows=fc_input_size, cols=4096, pdf="normal", seed=seed) * 0.01 + b6 = matrix(0.0, rows=1, cols=4096) + + # FC2: 4096 → 4096 + W7 = rand(rows=4096, cols=4096, pdf="normal", seed=seed) * 0.01 + b7 = matrix(0.0, rows=1, cols=4096) + + # FC3: 4096 → num_classes (output layer) + W8 = rand(rows=4096, cols=num_classes, pdf="normal", seed=seed) * 0.01 + b8 = matrix(0.0, rows=1, cols=num_classes) + + # Scale final layer for better convergence (as mentioned in your image) + W8 = W8 / sqrt(2) + + # Package model + model = list(W1, b1, W2, b2, W3, b3, W4, b4, W5, b5, W6, b6, W7, b7, W8, b8) +} + +/* + * Utility functions for optimizers. + */ + +update_params_with_sgd = function(list[unknown] model, list[unknown] gradients, double lr) + return (list[unknown] model_upd) { + /* + * Update model parameters with SGD optimizer. + */ + model_upd = list() + for (i in 1:length(model)) { + param = as.matrix(model[i]) + grad = as.matrix(gradients[i]) + param_upd = sgd::update(param, grad, lr) + model_upd = append(model_upd, param_upd) + } +} + +init_sgd_momentum_optim_params = function(list[unknown] model) + return (list[unknown] optim_state) { + /* + * Initialize SGD momentum optimizer state. + */ + optim_state = list() + for (i in 1:length(model)) { + param = as.matrix(model[i]) + momentum_state = sgd_momentum::init(param) + optim_state = append(optim_state, momentum_state) + } +} + +update_params_with_sgd_momentum = function(list[unknown] model, list[unknown] gradients, + double lr, double mu, list[unknown] optim_state) + return (list[unknown] model_upd, list[unknown] optim_state_upd) { + /* + * Update model parameters with SGD momentum optimizer. + */ + model_upd = list() + optim_state_upd = list() + for (i in 1:length(model)) { + param = as.matrix(model[i]) + grad = as.matrix(gradients[i]) + momentum_state = as.matrix(optim_state[i]) + [param_upd, momentum_state_upd] = sgd_momentum::update(param, grad, lr, mu, momentum_state) + model_upd = append(model_upd, param_upd) + optim_state_upd = append(optim_state_upd, momentum_state_upd) + } +} + +init_adam_optim_params = function(list[unknown] model) + return (list[unknown] optim_state) { + /* + * Initialize Adam optimizer state. + */ + optim_state = list() + for (i in 1:length(model)) { + param = as.matrix(model[i]) + [m_state, v_state] = adam::init(param) + adam_state = list(m_state, v_state) + optim_state = append(optim_state, adam_state) + } +} + +update_params_with_adam = function(list[unknown] model, list[unknown] gradients, + double lr, double beta1, double beta2, double epsilon, int t, + list[unknown] optim_state) + return (list[unknown] model_upd, list[unknown] optim_state_upd) { + /* + * Update model parameters with Adam optimizer. + */ + model_upd = list() + optim_state_upd = list() + for (i in 1:length(model)) { + param = as.matrix(model[i]) + grad = as.matrix(gradients[i]) + adam_state = as.list(optim_state[i]) + m_state = as.matrix(adam_state[1]) + v_state = as.matrix(adam_state[2]) + [param_upd, m_state_upd, v_state_upd] = adam::update(param, grad, lr, beta1, beta2, epsilon, t, m_state, v_state) + adam_state_upd = list(m_state_upd, v_state_upd) + model_upd = append(model_upd, param_upd) + optim_state_upd = append(optim_state_upd, adam_state_upd) + } +} + +init_lars_optim_params = function(list[unknown] model) + return (list[unknown] optim_state) { + /* + * Initialize LARS optimizer state. + */ + optim_state = list() + for (i in 1:length(model)) { + param = as.matrix(model[i]) + momentum_state = lars::init(param) + optim_state = append(optim_state, momentum_state) + } +} + +update_params_with_lars = function(list[unknown] model, list[unknown] gradients, + double lr, double mu, double weight_decay, double trust_coeff, + list[unknown] optim_state) + return (list[unknown] model_upd, list[unknown] optim_state_upd) { + /* + * Update model parameters with LARS optimizer. + * + * LARS (Layer-wise Adaptive Rate Scaling) applies different learning + * rates to different layers based on the ratio of parameter norm + * to gradient norm, enabling stable large-batch training. + */ + model_upd = list() + optim_state_upd = list() + for (i in 1:length(model)) { + param = as.matrix(model[i]) + grad = as.matrix(gradients[i]) + momentum_state = as.matrix(optim_state[i]) + [param_upd, momentum_state_upd] = lars::update(param, grad, lr, mu, momentum_state, weight_decay, trust_coeff) + model_upd = append(model_upd, param_upd) + optim_state_upd = append(optim_state_upd, momentum_state_upd) + } +} + +/* + * Training and evaluation utilities. + */ + +compute_loss = function(matrix[double] predictions, matrix[double] targets, list[unknown] model, double weight_decay) + return (double loss) { + /* + * Compute cross-entropy loss with L2 regularization. + */ + data_loss = cross_entropy_loss::forward(predictions, targets) + reg_loss = 0 + for (i in seq(1, length(model), 2)) { # Only weights, skip biases + W = as.matrix(model[i]) + reg_loss = reg_loss + l2_reg::forward(W, 1) + } + loss = data_loss + weight_decay * reg_loss +} + +compute_accuracy = function(matrix[double] predictions, matrix[double] targets) + return (double accuracy) { + /* + * Compute classification accuracy. + */ + pred_labels = rowIndexMax(predictions) + true_labels = rowIndexMax(targets) + accuracy = mean(pred_labels == true_labels) +} + +evaluate = function(matrix[double] X, matrix[double] Y, int C, int Hin, int Win, + list[unknown] model, int batch_size) + return (double loss, double accuracy) { + /* + * Evaluate model on a dataset. + */ + N = nrow(X) + total_loss = 0 + total_acc = 0 + num_batches = ceil(N / batch_size) + + for (i in 1:num_batches) { + beg = ((i-1) * batch_size) %% N + 1 + end = min(N, beg + batch_size - 1) + X_batch = X[beg:end,] + Y_batch = Y[beg:end,] + + [predictions, cached_out] = forward(X_batch, C, Hin, Win, model, "test", 0.0) + batch_loss = compute_loss(predictions=predictions, targets=Y_batch, model=model, weight_decay=0.0) + batch_acc = compute_accuracy(predictions=predictions, targets=Y_batch) + + total_loss = total_loss + batch_loss + total_acc = total_acc + batch_acc + } + + loss = total_loss / num_batches + accuracy = total_acc / num_batches +} + +/* + * AlexNet-BN variant initialization (with Batch Normalization). + */ + +init_with_bn = function(int C, int Hin, int Win, int num_classes, int seed) + return (list[unknown] model, list[unknown] emas) { + /* + * Initialize AlexNet-BN model parameters (with Batch Normalization). + * + * This variant adds batch normalization after each convolutional layer, + * as described in the LARS paper for improved large-batch training. + * + * Inputs: + * - C: Number of input channels (3 for RGB) + * - Hin: Input height (supports various sizes, e.g., 64, 224) + * - Win: Input width (supports various sizes, e.g., 64, 224) + * - num_classes: Number of output classes + * - seed: Random seed for initialization + * + * Outputs: + * - model: List of model parameters including BN parameters + * - emas: List of exponential moving averages for BN layers + */ + + # Calculate fully connected input size based on actual input dimensions + fc_input_size = calculate_conv_output_size(Hin, Win) + + # --- Explicit AlexNet weight init for Conv layers --- + # All weights ∼ N(0,0.01), all biases = 0 (following original AlexNet paper) + + # Conv1: 96 11x11 filters + W1 = rand(rows=96, cols=C * 11 * 11, pdf="normal", seed=seed) * 0.01 # 96 × (C·11·11) + b1 = matrix(0.0, rows=96, cols=1) + + # Conv2: 256 5x5 filters + W2 = rand(rows=256, cols=96 * 5 * 5, pdf="normal", seed=seed) * 0.01 # 256 × (96·5·5) + b2 = matrix(0.0, rows=256, cols=1) + + # Conv3: 384 3x3 filters + W3 = rand(rows=384, cols=256 * 3 * 3, pdf="normal", seed=seed) * 0.01 # 384 × (256·3·3) + b3 = matrix(0.0, rows=384, cols=1) + + # Conv4: 384 3x3 filters + W4 = rand(rows=384, cols=384 * 3 * 3, pdf="normal", seed=seed) * 0.01 # 384 × (384·3·3) + b4 = matrix(0.0, rows=384, cols=1) + + # Conv5: 256 3x3 filters + W5 = rand(rows=256, cols=384 * 3 * 3, pdf="normal", seed=seed) * 0.01 # 256 × (384·3·3) + b5 = matrix(0.0, rows=256, cols=1) + + # --- Initialize batch normalization parameters for each conv layer --- + [gamma1, beta1, ema_mean1, ema_var1] = batch_norm2d::init(96) + [gamma2, beta2, ema_mean2, ema_var2] = batch_norm2d::init(256) + [gamma3, beta3, ema_mean3, ema_var3] = batch_norm2d::init(384) + [gamma4, beta4, ema_mean4, ema_var4] = batch_norm2d::init(384) + [gamma5, beta5, ema_mean5, ema_var5] = batch_norm2d::init(256) + + # --- Explicit AlexNet weight init for FC layers --- + # FC1: fc_input_size → 4096 + W6 = rand(rows=fc_input_size, cols=4096, pdf="normal", seed=seed) * 0.01 + b6 = matrix(0.0, rows=1, cols=4096) + + # FC2: 4096 → 4096 + W7 = rand(rows=4096, cols=4096, pdf="normal", seed=seed) * 0.01 + b7 = matrix(0.0, rows=1, cols=4096) + + # FC3: 4096 → num_classes (output layer) + W8 = rand(rows=4096, cols=num_classes, pdf="normal", seed=seed) * 0.01 + b8 = matrix(0.0, rows=1, cols=num_classes) + + # Scale final layer for better convergence (as mentioned in your image) + W8 = W8 / sqrt(2) + + # Package model with BN parameters + # Order: W, b, gamma, beta, ema_mean, ema_var for each conv layer, then FC layers + model = list(W1, b1, gamma1, beta1, ema_mean1, ema_var1, + W2, b2, gamma2, beta2, ema_mean2, ema_var2, + W3, b3, gamma3, beta3, ema_mean3, ema_var3, + W4, b4, gamma4, beta4, ema_mean4, ema_var4, + W5, b5, gamma5, beta5, ema_mean5, ema_var5, + W6, b6, W7, b7, W8, b8) + + # Package EMA parameters for easy access + emas = list(ema_mean1, ema_var1, ema_mean2, ema_var2, ema_mean3, ema_var3, + ema_mean4, ema_var4, ema_mean5, ema_var5) +} + +forward_with_bn = function(matrix[double] X, int C, int Hin, int Win, + list[unknown] model, string mode, double dropout_prob) + return (matrix[double] out, list[unknown] cached_out, list[unknown] emas_upd) { + /* + * Forward pass of the AlexNet-BN model (with Batch Normalization). + * + * Architecture: + * - Conv1 -> BN -> ReLU -> MaxPool + * - Conv2 -> BN -> ReLU -> MaxPool + * - Conv3 -> BN -> ReLU + * - Conv4 -> BN -> ReLU + * - Conv5 -> BN -> ReLU -> MaxPool + * - FC1 -> ReLU -> Dropout + * - FC2 -> ReLU -> Dropout + * - FC3 -> Softmax + */ + + # Extract model parameters (with BN) + W1 = as.matrix(model[1]); b1 = as.matrix(model[2]) + gamma1 = as.matrix(model[3]); beta1 = as.matrix(model[4]) + ema_mean1 = as.matrix(model[5]); ema_var1 = as.matrix(model[6]) + + W2 = as.matrix(model[7]); b2 = as.matrix(model[8]) + gamma2 = as.matrix(model[9]); beta2 = as.matrix(model[10]) + ema_mean2 = as.matrix(model[11]); ema_var2 = as.matrix(model[12]) + + W3 = as.matrix(model[13]); b3 = as.matrix(model[14]) + gamma3 = as.matrix(model[15]); beta3 = as.matrix(model[16]) + ema_mean3 = as.matrix(model[17]); ema_var3 = as.matrix(model[18]) + + W4 = as.matrix(model[19]); b4 = as.matrix(model[20]) + gamma4 = as.matrix(model[21]); beta4 = as.matrix(model[22]) + ema_mean4 = as.matrix(model[23]); ema_var4 = as.matrix(model[24]) + + W5 = as.matrix(model[25]); b5 = as.matrix(model[26]) + gamma5 = as.matrix(model[27]); beta5 = as.matrix(model[28]) + ema_mean5 = as.matrix(model[29]); ema_var5 = as.matrix(model[30]) + + W6 = as.matrix(model[31]); b6 = as.matrix(model[32]) + W7 = as.matrix(model[33]); b7 = as.matrix(model[34]) + W8 = as.matrix(model[35]); b8 = as.matrix(model[36]) + + # Forward pass with batch normalization + # Conv1 -> BN -> ReLU -> MaxPool + [outc1, Houtc1, Woutc1] = conv2d::forward(X, W1, b1, C, Hin, Win, 11, 11, 4, 4, 2, 2) + [outbn1, ema_mean1_upd, ema_var1_upd, cache_mean1, cache_inv_var1] = batch_norm2d::forward(outc1, gamma1, beta1, 96, Houtc1, Woutc1, mode, ema_mean1, ema_var1, 0.99, 1e-5) + outr1 = relu::forward(outbn1) + [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, 96, Houtc1, Woutc1, 3, 3, 2, 2, 0, 0) + + # Conv2 -> BN -> ReLU -> MaxPool + [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, 96, Houtp1, Woutp1, 5, 5, 1, 1, 2, 2) + [outbn2, ema_mean2_upd, ema_var2_upd, cache_mean2, cache_inv_var2] = batch_norm2d::forward(outc2, gamma2, beta2, 256, Houtc2, Woutc2, mode, ema_mean2, ema_var2, 0.99, 1e-5) + outr2 = relu::forward(outbn2) + [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, 256, Houtc2, Woutc2, 3, 3, 2, 2, 0, 0) + + # Conv3 -> BN -> ReLU + [outc3, Houtc3, Woutc3] = conv2d::forward(outp2, W3, b3, 256, Houtp2, Woutp2, 3, 3, 1, 1, 1, 1) + [outbn3, ema_mean3_upd, ema_var3_upd, cache_mean3, cache_inv_var3] = batch_norm2d::forward(outc3, gamma3, beta3, 384, Houtc3, Woutc3, mode, ema_mean3, ema_var3, 0.99, 1e-5) + outr3 = relu::forward(outbn3) + + # Conv4 -> BN -> ReLU + [outc4, Houtc4, Woutc4] = conv2d::forward(outr3, W4, b4, 384, Houtc3, Woutc3, 3, 3, 1, 1, 1, 1) + [outbn4, ema_mean4_upd, ema_var4_upd, cache_mean4, cache_inv_var4] = batch_norm2d::forward(outc4, gamma4, beta4, 384, Houtc4, Woutc4, mode, ema_mean4, ema_var4, 0.99, 1e-5) + outr4 = relu::forward(outbn4) + + # Conv5 -> BN -> ReLU -> MaxPool + [outc5, Houtc5, Woutc5] = conv2d::forward(outr4, W5, b5, 384, Houtc4, Woutc4, 3, 3, 1, 1, 1, 1) + [outbn5, ema_mean5_upd, ema_var5_upd, cache_mean5, cache_inv_var5] = batch_norm2d::forward(outc5, gamma5, beta5, 256, Houtc5, Woutc5, mode, ema_mean5, ema_var5, 0.99, 1e-5) + outr5 = relu::forward(outbn5) + [outp5, Houtp5, Woutp5] = max_pool2d::forward(outr5, 256, Houtc5, Woutc5, 3, 3, 2, 2, 0, 0) + + # FC1 -> ReLU -> Dropout + outa6 = affine::forward(outp5, W6, b6) + outr6 = relu::forward(outa6) + if (mode == "train") { + [outd6, maskd6] = dropout::forward(outr6, dropout_prob, -1) + } else { + outd6 = outr6 + maskd6 = matrix(1, rows=nrow(outr6), cols=ncol(outr6)) + } + + # FC2 -> ReLU -> Dropout + outa7 = affine::forward(outd6, W7, b7) + outr7 = relu::forward(outa7) + if (mode == "train") { + [outd7, maskd7] = dropout::forward(outr7, dropout_prob, -1) + } else { + outd7 = outr7 + maskd7 = matrix(1, rows=nrow(outr7), cols=ncol(outr7)) + } + + # FC3 -> Softmax + outa8 = affine::forward(outd7, W8, b8) + out = softmax::forward(outa8) + + # Cache intermediate outputs for backward pass + cached_out = list(X, outc1, Houtc1, Woutc1, outbn1, cache_mean1, cache_inv_var1, outr1, outp1, Houtp1, Woutp1, + outc2, Houtc2, Woutc2, outbn2, cache_mean2, cache_inv_var2, outr2, outp2, Houtp2, Woutp2, + outc3, Houtc3, Woutc3, outbn3, cache_mean3, cache_inv_var3, outr3, + outc4, Houtc4, Woutc4, outbn4, cache_mean4, cache_inv_var4, outr4, + outc5, Houtc5, Woutc5, outbn5, cache_mean5, cache_inv_var5, outr5, outp5, Houtp5, Woutp5, + outa6, outr6, outd6, maskd6, outa7, outr7, outd7, maskd7, outa8) + + # Updated EMA parameters + emas_upd = list(ema_mean1_upd, ema_var1_upd, ema_mean2_upd, ema_var2_upd, ema_mean3_upd, ema_var3_upd, + ema_mean4_upd, ema_var4_upd, ema_mean5_upd, ema_var5_upd) +} + +/* + * LARS Training Utilities + */ + +get_lr_with_warmup = function(double base_lr, int epoch, int iter, int total_epochs, + int iters_per_epoch, int batch_size, int base_batch_size, + int warmup_epochs, double decay_power) + return (double lr) { + /* + * Learning rate scheduler with warmup, batch scaling, and polynomial decay. + * Implements the LARS paper's learning rate schedule. + * + * Inputs: + * - base_lr: Base learning rate (before scaling) + * - epoch, iter: Current epoch and iteration + * - total_epochs: Total number of training epochs + * - iters_per_epoch: Iterations per epoch + * - batch_size: Current batch size + * - base_batch_size: Reference batch size for scaling (typically 256) + * - warmup_epochs: Number of warmup epochs + * - decay_power: Power for polynomial decay (typically 2) + * + * Outputs: + * - lr: Scaled learning rate for current iteration + */ + + # Scale base LR by batch size (linear scaling rule) + scaled_base_lr = base_lr * (batch_size / base_batch_size) + + # Calculate total progress + total_iters = total_epochs * iters_per_epoch + warmup_iters = warmup_epochs * iters_per_epoch + current_iter = (epoch - 1) * iters_per_epoch + iter + + if (current_iter <= warmup_iters) { + # Linear warmup from 0 to scaled_base_lr + lr = scaled_base_lr * (current_iter / warmup_iters) + } else { + # Polynomial decay after warmup + progress = (current_iter - warmup_iters) / (total_iters - warmup_iters) + lr = scaled_base_lr * (1 - progress)^decay_power + } +} + +get_lars_hyperparams = function(int batch_size, boolean use_bn) + return (double base_lr, int warmup_epochs, int total_epochs) { + /* + * Get recommended LARS hyperparameters based on batch size. + * Based on Table 3 from the LARS paper. + * + * Inputs: + * - batch_size: Training batch size + * - use_bn: Whether using batch normalization + * + * Outputs: + * - base_lr: Base learning rate (before batch scaling) + * - warmup_epochs: Number of warmup epochs + * - total_epochs: Recommended total training epochs + */ + + if (use_bn) { + # AlexNet-BN (better scaling properties) + if (batch_size <= 512) { + base_lr = 0.02 + warmup_epochs = 5 + total_epochs = 100 + } else if (batch_size <= 4096) { + base_lr = 0.02 # Will be scaled to ~0.32 for 4K batch + warmup_epochs = 5 + total_epochs = 100 + } else if (batch_size <= 8192) { + base_lr = 0.02 # Will be scaled to ~0.64 for 8K batch + warmup_epochs = 5 + total_epochs = 100 + } else if (batch_size <= 16384) { + base_lr = 0.02 # Will be scaled to ~1.28 for 16K batch + warmup_epochs = 5 + total_epochs = 100 + } else { # 32K and above + base_lr = 0.02 # Will be scaled to ~2.56 for 32K batch + warmup_epochs = 5 + total_epochs = 200 # Need more epochs for very large batch + } + } else { + # Regular AlexNet (limited scaling) + if (batch_size <= 512) { + base_lr = 0.01 + warmup_epochs = 2 + total_epochs = 100 + } else if (batch_size <= 4096) { + base_lr = 0.01 # Will be scaled proportionally + warmup_epochs = 2 + total_epochs = 100 + } else { + # Regular AlexNet doesn't scale well beyond 4K + print("Warning: Regular AlexNet (without BN) doesn't scale well beyond batch size 4K") + base_lr = 0.01 + warmup_epochs = 2 + total_epochs = 100 + } + } +} + +train_with_lars = function(matrix[double] X_train, matrix[double] Y_train, + matrix[double] X_val, matrix[double] Y_val, + int C, int Hin, int Win, int num_classes, + int epochs, int batch_size, double base_lr, + boolean use_bn, int seed) + return (list[unknown] model, matrix[double] train_losses, matrix[double] val_accs) { + /* + * Train AlexNet with LARS optimizer following paper's best practices. + * + * Inputs: + * - X_train, Y_train: Training data and labels + * - X_val, Y_val: Validation data and labels + * - C, Hin, Win: Input dimensions + * - num_classes: Number of output classes + * - epochs: Number of training epochs + * - batch_size: Training batch size + * - base_lr: Base learning rate (before batch scaling) + * - use_bn: Whether to use batch normalization (recommended for LARS) + * - seed: Random seed for reproducibility + * + * Outputs: + * - model: Trained model parameters + * - train_losses: Training losses per epoch + * - val_accs: Validation accuracies per epoch + */ + + N = nrow(X_train) + + # Initialize model + if (use_bn) { + [model, emas] = init_with_bn(C, Hin, Win, num_classes, seed) + } else { + model = init(C, Hin, Win, num_classes, seed) + } + + # LARS hyperparameters from paper + base_batch_size = 256 + warmup_epochs = ifelse(use_bn, 5, 2) # 5 for BN, 2 for regular + decay_power = 2 + weight_decay = 0.0005 + momentum = 0.9 + trust_coeff = 0.001 + + # Initialize optimizer state + optim_state = init_lars_optim_params(model) + + # Training metrics + train_losses = matrix(0, rows=epochs, cols=1) + val_accs = matrix(0, rows=epochs, cols=1) + + # Print training info + print("Training AlexNet with LARS optimizer") + print("Batch size: " + batch_size + ", Base LR: " + base_lr) + print("Scaled LR: " + (base_lr * batch_size / base_batch_size)) + print("Warmup epochs: " + warmup_epochs + ", Using BN: " + use_bn) + print("") + + iters_per_epoch = ceil(N / batch_size) + + for (epoch in 1:epochs) { + epoch_loss = 0 + + for (iter in 1:iters_per_epoch) { + # Get learning rate with warmup and decay + lr = get_lr_with_warmup(base_lr, epoch, iter, epochs, iters_per_epoch, + batch_size, base_batch_size, warmup_epochs, decay_power) + + # Get batch + beg = ((iter-1) * batch_size) %% N + 1 + end = min(N, beg + batch_size - 1) + X_batch = X_train[beg:end,] + Y_batch = Y_train[beg:end,] + + # Forward pass + if (use_bn) { + [predictions, cached_out, emas] = forward_with_bn(X_batch, C, Hin, Win, model, "train", 0.5) + } else { + [predictions, cached_out] = forward(X_batch, C, Hin, Win, model, "train", 0.5) + } + + # Compute loss + loss = compute_loss(predictions, Y_batch, model, weight_decay) + epoch_loss = epoch_loss + loss + + # Backward pass + dprobs = cross_entropy_loss::backward(predictions, Y_batch) + if (use_bn) { + # Note: BN backward pass would need to be implemented separately + [dX, gradients] = backward(dprobs, cached_out, model, C, Hin, Win, 0.5) + } else { + [dX, gradients] = backward(dprobs, cached_out, model, C, Hin, Win, 0.5) + } + + # Add L2 regularization gradients + for (i in seq(1, length(gradients), 2)) { # Only weights + if (i <= length(model)) { + W = as.matrix(model[i]) + dW = as.matrix(gradients[i]) + gradients[i] = dW + weight_decay * l2_reg::backward(W, 1) + } + } + + # Update with LARS + [model, optim_state] = update_params_with_lars(model, gradients, lr, + momentum, weight_decay, + trust_coeff, optim_state) + + # Print progress + if (iter %% 50 == 0) { + print("Epoch " + epoch + "/" + epochs + ", Iter " + iter + "/" + + iters_per_epoch + ", LR: " + lr + ", Loss: " + loss) + } + } + + # Epoch metrics + train_losses[epoch,1] = epoch_loss / iters_per_epoch + + # Validation + if (use_bn) { + [val_loss, val_acc] = evaluate_with_bn(X_val, Y_val, C, Hin, Win, model, batch_size) + } else { + [val_loss, val_acc] = evaluate(X_val, Y_val, C, Hin, Win, model, batch_size) + } + val_accs[epoch,1] = val_acc + + print("Epoch " + epoch + " - Train Loss: " + train_losses[epoch,1] + + ", Val Acc: " + val_acc) + } +} + +backward_with_bn = function(matrix[double] dOut, list[unknown] cached_out, + list[unknown] model, int C, int Hin, int Win, double dropout_prob) + return (matrix[double] dX, list[unknown] gradients) { + /* + * Backward pass of the AlexNet-BN model (with Batch Normalization). + * + * Inputs: + * - dOut: Gradient w.r.t. output, of shape (N, num_classes) + * - cached_out: Cached outputs from forward pass + * - model: Model parameters (same structure as forward pass) + * - C, Hin, Win: Input dimensions + * - dropout_prob: Dropout probability used in forward pass + * + * Outputs: + * - dX: Gradient w.r.t. input, of shape (N, C*Hin*Win) + * - gradients: List of gradients for all parameters (same structure as model) + */ + + # Extract model parameters (with BN) + W1 = as.matrix(model[1]); b1 = as.matrix(model[2]) + gamma1 = as.matrix(model[3]); beta1 = as.matrix(model[4]) + + W2 = as.matrix(model[7]); b2 = as.matrix(model[8]) + gamma2 = as.matrix(model[9]); beta2 = as.matrix(model[10]) + + W3 = as.matrix(model[13]); b3 = as.matrix(model[14]) + gamma3 = as.matrix(model[15]); beta3 = as.matrix(model[16]) + + W4 = as.matrix(model[19]); b4 = as.matrix(model[20]) + gamma4 = as.matrix(model[21]); beta4 = as.matrix(model[22]) + + W5 = as.matrix(model[25]); b5 = as.matrix(model[26]) + gamma5 = as.matrix(model[27]); beta5 = as.matrix(model[28]) + + W6 = as.matrix(model[31]); b6 = as.matrix(model[32]) + W7 = as.matrix(model[33]); b7 = as.matrix(model[34]) + W8 = as.matrix(model[35]); b8 = as.matrix(model[36]) + + # Extract cached outputs + X = as.matrix(cached_out[1]) + outc1 = as.matrix(cached_out[2]); Houtc1 = as.scalar(cached_out[3]); Woutc1 = as.scalar(cached_out[4]) + outbn1 = as.matrix(cached_out[5]); cache_mean1 = as.matrix(cached_out[6]); cache_inv_var1 = as.matrix(cached_out[7]) + outr1 = as.matrix(cached_out[8]) + outp1 = as.matrix(cached_out[9]); Houtp1 = as.scalar(cached_out[10]); Woutp1 = as.scalar(cached_out[11]) + + outc2 = as.matrix(cached_out[12]); Houtc2 = as.scalar(cached_out[13]); Woutc2 = as.scalar(cached_out[14]) + outbn2 = as.matrix(cached_out[15]); cache_mean2 = as.matrix(cached_out[16]); cache_inv_var2 = as.matrix(cached_out[17]) + outr2 = as.matrix(cached_out[18]) + outp2 = as.matrix(cached_out[19]); Houtp2 = as.scalar(cached_out[20]); Woutp2 = as.scalar(cached_out[21]) + + outc3 = as.matrix(cached_out[22]); Houtc3 = as.scalar(cached_out[23]); Woutc3 = as.scalar(cached_out[24]) + outbn3 = as.matrix(cached_out[25]); cache_mean3 = as.matrix(cached_out[26]); cache_inv_var3 = as.matrix(cached_out[27]) + outr3 = as.matrix(cached_out[28]) + + outc4 = as.matrix(cached_out[29]); Houtc4 = as.scalar(cached_out[30]); Woutc4 = as.scalar(cached_out[31]) + outbn4 = as.matrix(cached_out[32]); cache_mean4 = as.matrix(cached_out[33]); cache_inv_var4 = as.matrix(cached_out[34]) + outr4 = as.matrix(cached_out[35]) + + outc5 = as.matrix(cached_out[36]); Houtc5 = as.scalar(cached_out[37]); Woutc5 = as.scalar(cached_out[38]) + outbn5 = as.matrix(cached_out[39]); cache_mean5 = as.matrix(cached_out[40]); cache_inv_var5 = as.matrix(cached_out[41]) + outr5 = as.matrix(cached_out[42]) + outp5 = as.matrix(cached_out[43]); Houtp5 = as.scalar(cached_out[44]); Woutp5 = as.scalar(cached_out[45]) + + outa6 = as.matrix(cached_out[46]); outr6 = as.matrix(cached_out[47]) + outd6 = as.matrix(cached_out[48]); maskd6 = as.matrix(cached_out[49]) + outa7 = as.matrix(cached_out[50]); outr7 = as.matrix(cached_out[51]) + outd7 = as.matrix(cached_out[52]); maskd7 = as.matrix(cached_out[53]) + outa8 = as.matrix(cached_out[54]) + + # Backward pass + # FC3 + douta8 = softmax::backward(dOut, outa8) + [doutd7, dW8, db8] = affine::backward(douta8, outd7, W8, b8) + + # FC2 + doutr7 = dropout::backward(doutd7, outr7, dropout_prob, maskd7) + douta7 = relu::backward(doutr7, outa7) + [doutd6, dW7, db7] = affine::backward(douta7, outd6, W7, b7) + + # FC1 + doutr6 = dropout::backward(doutd6, outr6, dropout_prob, maskd6) + douta6 = relu::backward(doutr6, outa6) + [doutp5, dW6, db6] = affine::backward(douta6, outp5, W6, b6) + + # Conv5 + doutr5 = max_pool2d::backward(doutp5, Houtp5, Woutp5, outr5, 256, Houtc5, Woutc5, 3, 3, 2, 2, 0, 0) + doutbn5 = relu::backward(doutr5, outbn5) + [doutc5, dgamma5, dbeta5] = batch_norm2d::backward(doutbn5, cache_mean5, cache_inv_var5, outc5, gamma5, 256, Houtc5, Woutc5, 1e-5) + [doutr4, dW5, db5] = conv2d::backward(doutc5, Houtc5, Woutc5, outr4, W5, b5, 384, Houtc4, Woutc4, 3, 3, 1, 1, 1, 1) + + # Conv4 + doutbn4 = relu::backward(doutr4, outbn4) + [doutc4, dgamma4, dbeta4] = batch_norm2d::backward(doutbn4, cache_mean4, cache_inv_var4, outc4, gamma4, 384, Houtc4, Woutc4, 1e-5) + [doutr3, dW4, db4] = conv2d::backward(doutc4, Houtc4, Woutc4, outr3, W4, b4, 384, Houtc3, Woutc3, 3, 3, 1, 1, 1, 1) + + # Conv3 + doutbn3 = relu::backward(doutr3, outbn3) + [doutc3, dgamma3, dbeta3] = batch_norm2d::backward(doutbn3, cache_mean3, cache_inv_var3, outc3, gamma3, 384, Houtc3, Woutc3, 1e-5) + [doutp2, dW3, db3] = conv2d::backward(doutc3, Houtc3, Woutc3, outp2, W3, b3, 256, Houtp2, Woutp2, 3, 3, 1, 1, 1, 1) + + # Conv2 + doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, 256, Houtc2, Woutc2, 3, 3, 2, 2, 0, 0) + doutbn2 = relu::backward(doutr2, outbn2) + [doutc2, dgamma2, dbeta2] = batch_norm2d::backward(doutbn2, cache_mean2, cache_inv_var2, outc2, gamma2, 256, Houtc2, Woutc2, 1e-5) + [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, 96, Houtp1, Woutp1, 5, 5, 1, 1, 2, 2) + + # Conv1 + doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, 96, Houtc1, Woutc1, 3, 3, 2, 2, 0, 0) + doutbn1 = relu::backward(doutr1, outbn1) + [doutc1, dgamma1, dbeta1] = batch_norm2d::backward(doutbn1, cache_mean1, cache_inv_var1, outc1, gamma1, 96, Houtc1, Woutc1, 1e-5) + [dX, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X, W1, b1, C, Hin, Win, 11, 11, 4, 4, 2, 2) + + # Package gradients (with BN parameters) + gradients = list(dW1, db1, dgamma1, dbeta1, matrix(0, rows=nrow(dgamma1), cols=ncol(dgamma1)), matrix(0, rows=nrow(dgamma1), cols=ncol(dgamma1)), + dW2, db2, dgamma2, dbeta2, matrix(0, rows=nrow(dgamma2), cols=ncol(dgamma2)), matrix(0, rows=nrow(dgamma2), cols=ncol(dgamma2)), + dW3, db3, dgamma3, dbeta3, matrix(0, rows=nrow(dgamma3), cols=ncol(dgamma3)), matrix(0, rows=nrow(dgamma3), cols=ncol(dgamma3)), + dW4, db4, dgamma4, dbeta4, matrix(0, rows=nrow(dgamma4), cols=ncol(dgamma4)), matrix(0, rows=nrow(dgamma4), cols=ncol(dgamma4)), + dW5, db5, dgamma5, dbeta5, matrix(0, rows=nrow(dgamma5), cols=ncol(dgamma5)), matrix(0, rows=nrow(dgamma5), cols=ncol(dgamma5)), + dW6, db6, dW7, db7, dW8, db8) +} + +evaluate_with_bn = function(matrix[double] X, matrix[double] Y, int C, int Hin, int Win, + list[unknown] model, int batch_size) + return (double loss, double accuracy) { + /* + * Evaluate AlexNet-BN model on a dataset. + */ + N = nrow(X) + total_loss = 0 + total_acc = 0 + num_batches = ceil(N / batch_size) + + for (i in 1:num_batches) { + beg = ((i-1) * batch_size) %% N + 1 + end = min(N, beg + batch_size - 1) + X_batch = X[beg:end,] + Y_batch = Y[beg:end,] + + [predictions, cached_out, emas] = forward_with_bn(X_batch, C, Hin, Win, model, "test", 0.0) + batch_loss = compute_loss(predictions, Y_batch, model, 0.0) + batch_acc = compute_accuracy(predictions, Y_batch) + + total_loss = total_loss + batch_loss + total_acc = total_acc + batch_acc + } + + loss = total_loss / num_batches + accuracy = total_acc / num_batches +} \ No newline at end of file diff --git a/scripts/nn/networks/resnet.dml b/scripts/nn/networks/resnet.dml index 70df93f2448..9f121380f7e 100644 --- a/scripts/nn/networks/resnet.dml +++ b/scripts/nn/networks/resnet.dml @@ -19,12 +19,13 @@ # #------------------------------------------------------------- -source("scripts/nn/layers/batch_norm2d_old.dml") as bn2d -source("scripts/nn/layers/conv2d_builtin.dml") as conv2d -source("scripts/nn/layers/relu.dml") as relu -source("scripts/nn/layers/max_pool2d_builtin.dml") as mp2d -source("scripts/nn/layers/global_avg_pool2d.dml") as ap2d -source("scripts/nn/layers/affine.dml") as fc +source("nn/layers/batch_norm2d_old.dml") as bn2d +source("nn/layers/conv2d_builtin.dml") as conv2d +source("nn/layers/relu.dml") as relu +source("nn/layers/max_pool2d_builtin.dml") as mp2d +source("nn/layers/global_avg_pool2d.dml") as ap2d +source("nn/layers/affine.dml") as fc +source("nn/layers/softmax.dml") as softmax conv3x3_forward = function(matrix[double] X, matrix[double] W, int C_in, int C_out, int Hin, int Win, @@ -863,7 +864,7 @@ resnet_forward = function(matrix[double] X, int Hin, int Win, ema_means_vars_upd = list(ema_mean_bn1_upd, ema_var_bn1_upd, emas1_upd, emas2_upd, emas3_upd, emas4_upd) cached_out = list(X, Hin, Win, out_conv1, Hout_conv1, Wout_conv1, out_bn1, out_re1, out_mp, Hout_mp, Wout_mp, cached_out_l1, cached_out_l2, cached_out_l3, cached_out_l4, out_res, Hout_res, Wout_res, out_ap, Hout_ap, - Wout_ap) + Wout_ap, out_fc) cached_means_vars = list(cached_m, cached_v, cached_mv_l1, cached_mv_l2, cached_mv_l3, cached_mv_l4) out = out_fc diff --git a/scripts/nn/networks/resnet101.dml b/scripts/nn/networks/resnet101.dml index ebcb1d6b976..22a59c99285 100644 --- a/scripts/nn/networks/resnet101.dml +++ b/scripts/nn/networks/resnet101.dml @@ -432,3 +432,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model, "bottleneck", layer_sizes) } +init_lars_optim_params = function(int classes) + return(list[unknown] params) { + /* + * Initializes the state of the LARS optimizer for every + * learnable parameter of ResNet 50. + * + * Inputs: + * - classes: Number of network output classes. + * + * Outputs: + * - params: List of state parameters with the same structure + * as weights of the forward and backward pass. It can be + * directly passed to the update parameter function. + */ + layer_sizes = list(3, 4, 23, 3) + params = util::init_optim("lars", classes, "bottleneck", layer_sizes) +} + +update_params_with_lars = function(list[unknown] model, list[unknown] gradients, + double lr, double mu, double weight_decay, + double trust_coeff, list[unknown] optim_state) + return (list[unknown] model_upd, list[unknown] optim_state_upd) { + /* + * Updates all learnable parameters with the LARS optimizer. + * + * LARS (Layer-wise Adaptive Rate Scaling) applies different learning + * rates to different layers based on the ratio of parameter norm + * to gradient norm, enabling stable large-batch training. + * + * Inputs: + * - model: Model parameters, same as for forward and backward pass. + * - gradients: Gradients, returned from the backward pass. + * - lr: Global learning rate. + * - mu: Momentum value. Recommended: 0.9 + * - weight_decay: L2 regularization strength. Recommended: 5e-4 + * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001 + * - optim_state: Optimizer states for all model parameters. + * + * Outputs: + * - model_upd: Updated model parameters. + * - optim_state_upd: Updated model states for all parameters. + */ + layer_sizes = list(3, 4, 23, 3) + hyper_params = list(lr, mu, weight_decay, trust_coeff) + [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "bottleneck", + layer_sizes) +} \ No newline at end of file diff --git a/scripts/nn/networks/resnet152.dml b/scripts/nn/networks/resnet152.dml index e0e4154fc94..92da614345a 100644 --- a/scripts/nn/networks/resnet152.dml +++ b/scripts/nn/networks/resnet152.dml @@ -432,3 +432,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model, "bottleneck", layer_sizes) } +init_lars_optim_params = function(int classes) + return(list[unknown] params) { + /* + * Initializes the state of the LARS optimizer for every + * learnable parameter of ResNet 50. + * + * Inputs: + * - classes: Number of network output classes. + * + * Outputs: + * - params: List of state parameters with the same structure + * as weights of the forward and backward pass. It can be + * directly passed to the update parameter function. + */ + layer_sizes = list(3, 8, 36, 3) + params = util::init_optim("lars", classes, "bottleneck", layer_sizes) +} + +update_params_with_lars = function(list[unknown] model, list[unknown] gradients, + double lr, double mu, double weight_decay, + double trust_coeff, list[unknown] optim_state) + return (list[unknown] model_upd, list[unknown] optim_state_upd) { + /* + * Updates all learnable parameters with the LARS optimizer. + * + * LARS (Layer-wise Adaptive Rate Scaling) applies different learning + * rates to different layers based on the ratio of parameter norm + * to gradient norm, enabling stable large-batch training. + * + * Inputs: + * - model: Model parameters, same as for forward and backward pass. + * - gradients: Gradients, returned from the backward pass. + * - lr: Global learning rate. + * - mu: Momentum value. Recommended: 0.9 + * - weight_decay: L2 regularization strength. Recommended: 5e-4 + * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001 + * - optim_state: Optimizer states for all model parameters. + * + * Outputs: + * - model_upd: Updated model parameters. + * - optim_state_upd: Updated model states for all parameters. + */ + layer_sizes = list(3, 8, 36, 3) + hyper_params = list(lr, mu, weight_decay, trust_coeff) + [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "bottleneck", + layer_sizes) +} \ No newline at end of file diff --git a/scripts/nn/networks/resnet18.dml b/scripts/nn/networks/resnet18.dml index 2a67c9ddb61..52a80eb92d1 100644 --- a/scripts/nn/networks/resnet18.dml +++ b/scripts/nn/networks/resnet18.dml @@ -434,3 +434,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model, layer_sizes) } +init_lars_optim_params = function(int classes) + return(list[unknown] params) { + /* + * Initializes the state of the LARS optimizer for every + * learnable parameter of ResNet 50. + * + * Inputs: + * - classes: Number of network output classes. + * + * Outputs: + * - params: List of state parameters with the same structure + * as weights of the forward and backward pass. It can be + * directly passed to the update parameter function. + */ + layer_sizes = list(2, 2, 2, 2) + params = util::init_optim("lars", classes, "basic", layer_sizes) +} + +update_params_with_lars = function(list[unknown] model, list[unknown] gradients, + double lr, double mu, double weight_decay, + double trust_coeff, list[unknown] optim_state) + return (list[unknown] model_upd, list[unknown] optim_state_upd) { + /* + * Updates all learnable parameters with the LARS optimizer. + * + * LARS (Layer-wise Adaptive Rate Scaling) applies different learning + * rates to different layers based on the ratio of parameter norm + * to gradient norm, enabling stable large-batch training. + * + * Inputs: + * - model: Model parameters, same as for forward and backward pass. + * - gradients: Gradients, returned from the backward pass. + * - lr: Global learning rate. + * - mu: Momentum value. Recommended: 0.9 + * - weight_decay: L2 regularization strength. Recommended: 5e-4 + * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001 + * - optim_state: Optimizer states for all model parameters. + * + * Outputs: + * - model_upd: Updated model parameters. + * - optim_state_upd: Updated model states for all parameters. + */ + layer_sizes = list(2, 2, 2, 2) + hyper_params = list(lr, mu, weight_decay, trust_coeff) + [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "basic", + layer_sizes) +} \ No newline at end of file diff --git a/scripts/nn/networks/resnet34.dml b/scripts/nn/networks/resnet34.dml index 9dcabcf1ecc..86e9e547ce5 100644 --- a/scripts/nn/networks/resnet34.dml +++ b/scripts/nn/networks/resnet34.dml @@ -428,3 +428,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model, layer_sizes) } +init_lars_optim_params = function(int classes) + return(list[unknown] params) { + /* + * Initializes the state of the LARS optimizer for every + * learnable parameter of ResNet 50. + * + * Inputs: + * - classes: Number of network output classes. + * + * Outputs: + * - params: List of state parameters with the same structure + * as weights of the forward and backward pass. It can be + * directly passed to the update parameter function. + */ + layer_sizes = list(3, 4, 6, 3) + params = util::init_optim("lars", classes, "basic", layer_sizes) +} + +update_params_with_lars = function(list[unknown] model, list[unknown] gradients, + double lr, double mu, double weight_decay, + double trust_coeff, list[unknown] optim_state) + return (list[unknown] model_upd, list[unknown] optim_state_upd) { + /* + * Updates all learnable parameters with the LARS optimizer. + * + * LARS (Layer-wise Adaptive Rate Scaling) applies different learning + * rates to different layers based on the ratio of parameter norm + * to gradient norm, enabling stable large-batch training. + * + * Inputs: + * - model: Model parameters, same as for forward and backward pass. + * - gradients: Gradients, returned from the backward pass. + * - lr: Global learning rate. + * - mu: Momentum value. Recommended: 0.9 + * - weight_decay: L2 regularization strength. Recommended: 5e-4 + * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001 + * - optim_state: Optimizer states for all model parameters. + * + * Outputs: + * - model_upd: Updated model parameters. + * - optim_state_upd: Updated model states for all parameters. + */ + layer_sizes = list(3, 4, 6, 3) + hyper_params = list(lr, mu, weight_decay, trust_coeff) + [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "basic", + layer_sizes) +} \ No newline at end of file diff --git a/scripts/nn/networks/resnet50.dml b/scripts/nn/networks/resnet50.dml index bac0e938af3..ac4e1952301 100644 --- a/scripts/nn/networks/resnet50.dml +++ b/scripts/nn/networks/resnet50.dml @@ -432,3 +432,50 @@ update_params_with_sgd_nesterov = function(list[unknown] model, "bottleneck", layer_sizes) } +init_lars_optim_params = function(int classes) + return(list[unknown] params) { + /* + * Initializes the state of the LARS optimizer for every + * learnable parameter of ResNet 50. + * + * Inputs: + * - classes: Number of network output classes. + * + * Outputs: + * - params: List of state parameters with the same structure + * as weights of the forward and backward pass. It can be + * directly passed to the update parameter function. + */ + layer_sizes = list(3, 4, 6, 3) + params = util::init_optim("lars", classes, "bottleneck", layer_sizes) +} + +update_params_with_lars = function(list[unknown] model, list[unknown] gradients, + double lr, double mu, double weight_decay, + double trust_coeff, list[unknown] optim_state) + return (list[unknown] model_upd, list[unknown] optim_state_upd) { + /* + * Updates all learnable parameters with the LARS optimizer. + * + * LARS (Layer-wise Adaptive Rate Scaling) applies different learning + * rates to different layers based on the ratio of parameter norm + * to gradient norm, enabling stable large-batch training. + * + * Inputs: + * - model: Model parameters, same as for forward and backward pass. + * - gradients: Gradients, returned from the backward pass. + * - lr: Global learning rate. + * - mu: Momentum value. Recommended: 0.9 + * - weight_decay: L2 regularization strength. Recommended: 5e-4 + * - trust_coeff: Trust coefficient for LARS. Recommended: 0.001 + * - optim_state: Optimizer states for all model parameters. + * + * Outputs: + * - model_upd: Updated model parameters. + * - optim_state_upd: Updated model states for all parameters. + */ + layer_sizes = list(3, 4, 6, 3) + hyper_params = list(lr, mu, weight_decay, trust_coeff) + [optim_state_upd, model_upd] = util::update_params("lars", optim_state, hyper_params, gradients, model, "bottleneck", + layer_sizes) +} \ No newline at end of file diff --git a/scripts/nn/networks/resnet_util.dml b/scripts/nn/networks/resnet_util.dml index 995736585ba..117e1c98631 100644 --- a/scripts/nn/networks/resnet_util.dml +++ b/scripts/nn/networks/resnet_util.dml @@ -25,6 +25,7 @@ source("nn/optim/rmsprop.dml") as rmsprop source("nn/optim/sgd.dml") as sgd source("nn/optim/sgd_momentum.dml") as sgd_momentum source("nn/optim/sgd_nesterov.dml") as sgd_nesterov +source("nn/optim/lars.dml") as lars init_optim_adam_basic_block = function(int C_in, int C_base, boolean downsample) @@ -55,6 +56,33 @@ init_optim_adam_basic_block = function(int C_in, int C_base, boolean downsample) } } +init_optim_lars_basic_block = function(int C_in, int C_base, boolean downsample) + return (list[unknown] block_params) { + # Conv 1 + v_W_conv1 = matrix(0, rows=C_base, cols=C_in*3*3) + # BN 1 + v_W_bn1 = matrix(0, rows=C_base, cols=1) + v_b_bn1 = matrix(0, rows=C_base, cols=1) + # Conv 2 + v_W_conv2 = matrix(0, rows=C_base, cols=C_base*3*3) + # BN 2 + v_W_bn2 = matrix(0, rows=C_base, cols=1) + v_b_bn2 = matrix(0, rows=C_base, cols=1) + + block_params = list(v_W_conv1, v_W_bn1, v_b_bn1, v_W_conv2, v_W_bn2, v_b_bn2) + + if (downsample) { + # Conv 3 + v_W_conv3 = matrix(0, rows=C_base, cols=C_in) + # BN 3 + v_W_bn3 = matrix(0, rows=C_base, cols=1) + v_b_bn3 = matrix(0, rows=C_base, cols=1) + block_params = append(block_params, v_W_conv3) + block_params = append(block_params, v_W_bn3) + block_params = append(block_params, v_b_bn3) + } +} + init_optim_other_basic_block = function(int C_in, int C_base, boolean downsample) return (list[unknown] block_params) { # Conv 1 @@ -114,6 +142,38 @@ init_optim_other_bottleneck_block = function(int C_in, int C_base, boolean downs } } +init_optim_lars_bottleneck_block = function(int C_in, int C_base, boolean downsample) + return (list[unknown] block_params) { + # Conv 1 + v_W_conv1 = matrix(0, rows=C_base, cols=C_in) + # BN 1 + v_W_bn1 = matrix(0, rows=C_base, cols=1) + v_b_bn1 = matrix(0, rows=C_base, cols=1) + # Conv 2 + v_W_conv2 = matrix(0, rows=C_base, cols=C_base*3*3) + # BN 2 + v_W_bn2 = matrix(0, rows=C_base, cols=1) + v_b_bn2 = matrix(0, rows=C_base, cols=1) + # Conv 3 + v_W_conv3 = matrix(0, rows=4*C_base, cols=C_base) + # BN 3 + v_W_bn3 = matrix(0, rows=4*C_base, cols=1) + v_b_bn3 = matrix(0, rows=4*C_base, cols=1) + + block_params = list(v_W_conv1, v_W_bn1, v_b_bn1, v_W_conv2, v_W_bn2, v_b_bn2, v_W_conv3, v_W_bn3, v_b_bn3) + + if (downsample) { + # Conv 4 + v_W_conv4 = matrix(0, rows=4*C_base, cols=C_in) + # BN 4 + v_W_bn4 = matrix(0, rows=4*C_base, cols=1) + v_b_bn4 = matrix(0, rows=4*C_base, cols=1) + block_params = append(block_params, v_W_conv4) + block_params = append(block_params, v_W_bn4) + block_params = append(block_params, v_b_bn4) + } +} + init_optim_adam_bottleneck_block = function(int C_in, int C_base, boolean downsample) return (list[unknown] block_params) { # Conv 1 @@ -158,6 +218,9 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk m_W_conv1 = matrix(0, rows=64, cols=C_in*7*7) v_W_conv1 = matrix(0, rows=64, cols=C_in*7*7) params = append(params, list(m_W_conv1, v_W_conv1)) + } else if (optimizer == "lars") { + v_W_conv1 = matrix(0, rows=64, cols=C_in*7*7) + params = append(params, v_W_conv1) } else { s_W_conv1 = matrix(0, rows=64, cols=C_in*7*7) params = append(params, s_W_conv1) @@ -169,6 +232,11 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk m_b_bn1 = matrix(0, rows=C_in, cols=1); v_b_bn1 = matrix(0, rows=C_in, cols=1) params = append(params, list(m_W_bn1, v_W_bn1)) params = append(params, list(m_b_bn1, v_b_bn1)) + } else if (optimizer == "lars") { + v_W_bn1 = matrix(0, rows=C_in, cols=1) + v_b_bn1 = matrix(0, rows=C_in, cols=1) + params = append(params, v_W_bn1) + params = append(params, v_b_bn1) } else { s_W_bn1 = matrix(0, rows=C_in, cols=1) s_b_bn1 = matrix(0, rows=C_in, cols=1) @@ -191,6 +259,8 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk downsample = block == 1 & stride > 1 if (optimizer == "adam") optim_block = init_optim_adam_basic_block(C_in, C_base, downsample) + else if (optimizer == "lars") + optim_block = init_optim_lars_basic_block(C_in, C_base, downsample) else optim_block = init_optim_other_basic_block(C_in, C_base, downsample) optim_layer = append(optim_layer, optim_block) @@ -203,6 +273,8 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk downsample = block == 1 if (optimizer == "adam") optim_block = init_optim_adam_bottleneck_block(C_in, C_base, downsample) + else if (optimizer == "lars") + optim_block = init_optim_lars_bottleneck_block(C_in, C_base, downsample) else optim_block = init_optim_other_bottleneck_block(C_in, C_base, downsample) optim_layer = append(optim_layer, optim_block) @@ -220,6 +292,11 @@ init_optim = function(string optimizer, int classes, string block_type, list[unk m_b_fc = matrix(0, rows=1, cols=classes); v_b_fc = matrix(0, rows=1, cols=classes) params = append(params, list(m_W_fc, v_W_fc)) params = append(params, list(m_b_fc, v_b_fc)) + } else if (optimizer == "lars") { + v_W_fc = matrix(0, rows=C_in, cols=classes) + v_b_fc = matrix(0, rows=1, cols=classes) + params = append(params, v_W_fc) + params = append(params, v_b_fc) } else { s_W_fc = matrix(0, rows=C_in, cols=classes) s_b_fc = matrix(0, rows=1, cols=classes) @@ -284,6 +361,15 @@ update_param = function(int index, string optimizer, list[unknown] optim_params, [param_upd, v_upd] = sgd_nesterov::update(param, grad, lr, mu, v) optim_params_upd = append(optim_params_upd, v_upd) + } else if (optimizer == "lars") { + lr = as.scalar(optim_hyper_params[1]) + mu = as.scalar(optim_hyper_params[2]) + lambda = as.scalar(optim_hyper_params[3]) + trust_coeff = as.scalar(optim_hyper_params[4]) + + v = as.matrix(optim_params[index]) + [param_upd, v_upd] = lars::update(param, grad, lr, mu, v, lambda, trust_coeff) + optim_params_upd = append(optim_params_upd, v_upd) } params_upd = append(params_upd, param_upd) } diff --git a/scripts/nn/optim/lars.dml b/scripts/nn/optim/lars.dml new file mode 100644 index 00000000000..f6957753d12 --- /dev/null +++ b/scripts/nn/optim/lars.dml @@ -0,0 +1,97 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Layer-wise Adaptive Rate Scaling (LARS) optimizer. + */ + +update = function(matrix[double] X, matrix[double] dX, double lr, double mu, + matrix[double] v, double lambda, double trust_coeff) + return (matrix[double] X, matrix[double] v) { + /* + * Performs a LARS update with layer-wise adaptive learning rate, + * faithfully implementing Algorithm 1 from the original paper. + * + * Reference: + * - "Large Batch Training of Convolutional Networks" by You, Gitman, and Ginsburg. + * https://arxiv.org/abs/1708.03888 + * + * This implementation correctly uses the sum of norms for the denominator + * and a coupled weight decay approach, as specified in the paper's + * pseudocode. + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * - dX: Gradient of the loss function w.r.t. X, of same shape as X. + * - lr: Global learning rate (γ in the paper). + * - mu: Momentum coefficient (m in the paper). + * - v: Velocity (momentum state), of same shape as X. + * - lambda: L2 regularization strength (β in the paper). + * - trust_coeff: Trust coefficient for LARS (η in the paper). + * + * Outputs: + * - X: Updated parameters X, of same shape as input X. + * - v: Updated velocity, of same shape as input v. + */ + + # Step 1: Add weight decay to the gradient to form g'. + # This corresponds to `g_t' + βw_t'` in Algorithm 1. + dX_wd = dX + lambda * X; + + # Step 2: Compute the L2 norms of the pure gradient and the weights separately. + X_norm = sqrt(sum(X^2)); + dX_norm = sqrt(sum(dX^2)); + + # A small epsilon for numerical stability, preventing division by zero. + epsilon = 1e-8; + + # Step 3: Compute the local learning rate `λ'`. + local_lr = trust_coeff * X_norm / (dX_norm + lambda * X_norm + epsilon); + + # Step 4: Compute the final effective learning rate for this layer's update. + effective_lr = lr * local_lr; + + # Step 5: For very small layers (like biases), which can be unstable with LARS, + # we fall back to using the global learning rate. + if (X_norm < 1e-3 | ncol(X) == 1 | nrow(X) == 1) { + effective_lr = lr; + } + + # Step 6: Update the momentum (velocity). + v = mu * v - effective_lr * dX_wd; + + # Step 7: Update the weights. + X = X + v; +} + +init = function(matrix[double] X) + return (matrix[double] v) { + /* + * Initialize the state for LARS (momentum). + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * + * Outputs: + * - v: Initial velocity (zeros), of same shape as X. + */ + v = matrix(0, rows=nrow(X), cols=ncol(X)) +} \ No newline at end of file diff --git a/scripts/nn/optim/lars_util.dml b/scripts/nn/optim/lars_util.dml new file mode 100644 index 00000000000..99e5b02c2f9 --- /dev/null +++ b/scripts/nn/optim/lars_util.dml @@ -0,0 +1,54 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +get_lr_with_warmup = function(double base_lr, int epoch, int iter, int total_epochs, + int iters_per_epoch, int batch_size, int base_batch_size, + int warmup_epochs, int decay_power) + return (double lr) { + /* + * Compute learning rate with linear warmup and polynomial decay. + * + * Implements the learning rate schedule from LARS paper: + * - Linear warmup for first warmup_epochs + * - Polynomial decay afterwards + * - Linear scaling with batch size + */ + + # Scale learning rate linearly with batch size + scaled_lr = base_lr * batch_size / base_batch_size + + # Total number of iterations + total_iters = total_epochs * iters_per_epoch + warmup_iters = warmup_epochs * iters_per_epoch + current_iter = (epoch - 1) * iters_per_epoch + iter + + if (current_iter <= warmup_iters) { + # Linear warmup + lr = scaled_lr * current_iter / warmup_iters + } else { + # Polynomial decay + decay_iters = total_iters - warmup_iters + decay_current = current_iter - warmup_iters + decay_factor = (1 - decay_current / decay_iters) ^ decay_power + lr = scaled_lr * decay_factor + } +} +