From f9542b7859c87b12a5190f9cd807f9e1ed7c1142 Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Sun, 18 Dec 2022 22:41:27 +0900
Subject: [PATCH 01/12] Add Encoder model to torch2coreml for image2image

and later for in-painting
---
 .../torch2coreml.py                           | 185 +++++++++++++++++-
 1 file changed, 183 insertions(+), 2 deletions(-)

diff --git a/python_coreml_stable_diffusion/torch2coreml.py b/python_coreml_stable_diffusion/torch2coreml.py
index 6d6c2fad..3963f537 100644
--- a/python_coreml_stable_diffusion/torch2coreml.py
+++ b/python_coreml_stable_diffusion/torch2coreml.py
@@ -43,6 +43,23 @@ def _get_coreml_inputs(sample_inputs, args):
         ) for k, v in sample_inputs.items()
     ]
 
+# Simpler version of `DiagonalGaussianDistribution` that keeps only the calculations needed
+# to draw a sample, as implemented in vae.py as part of the AutoencoderKL class.
+# coremltools does not support the `randn` operation, so the caller passes in the random tensor.
+class CoreMLDiagonalGaussianDistribution(object):
+    def __init__(self, parameters, noise):
+        self.parameters = parameters
+        self.noise = noise  # caller-supplied random tensor used in place of randn
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)  # split dim 1 in half: mean | logvar
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)  # bound logvar before exponentiating
+        self.std = torch.exp(0.5 * self.logvar)  # std = exp(logvar / 2)
+
+    def sample(self) -> torch.FloatTensor:
+        device = self.parameters.device
+        # Move the supplied noise to the same device and dtype as `parameters`.
+        sample = self.noise.to(device=device, dtype=self.parameters.dtype)
+        x = self.mean + self.std * sample  # scale the noise by std and shift by mean
+        return x
 
 def compute_psnr(a, b):
     """ Compute Peak-Signal-to-Noise-Ratio across two numpy.ndarray objects
@@ -140,7 +157,7 @@ def _convert_to_coreml(submodule_name, torchscript_module, sample_inputs,
 
 def quantize_weights_to_8bits(args):
     for model_name in [
-            "text_encoder", "vae_decoder", "unet", "unet_chunk1",
+            "text_encoder", "vae_decoder", "vae_encoder", "unet", "unet_chunk1",
             "unet_chunk2", "safety_checker"
     ]:
         out_path = _get_out_path(args, model_name)
@@ -190,6 +207,7 @@ def bundle_resources_for_swift_cli(args):
     # Compile model using coremlcompiler (Significantly reduces the load time for unet)
     for source_name, target_name in [("text_encoder", "TextEncoder"),
                                      ("vae_decoder", "VAEDecoder"),
+                                     ("vae_encoder", "VAEEncoder"),
                                      ("unet", "Unet"),
                                      ("unet_chunk1", "UnetChunk1"),
                                      ("unet_chunk2", "UnetChunk2"),
@@ -453,6 +471,163 @@ def forward(self, z):
     gc.collect()
 
 
+def convert_vae_encoder(pipe, args):
+    """ Converts the VAE Encoder component of Stable Diffusion (plus latent scaling and noise mixing) to Core ML
+    """
+    out_path = _get_out_path(args, "vae_encoder")
+    if os.path.exists(out_path):
+        logger.info(
+            f"`vae_encoder` already exists at {out_path}, skipping conversion."
+        )
+        return
+
+    if not hasattr(pipe, "unet"):
+        raise RuntimeError(
+            "convert_unet() deletes pipe.unet to save RAM. "
+            "Please use convert_vae_encoder() before convert_unet()")
+
+    sample_shape = (
+        1,  # B
+        3,  # C (RGB range from -1 to 1)
+        (args.latent_h or pipe.unet.config.sample_size) * 8,  # H (latent size scaled to pixels)
+        (args.latent_w or pipe.unet.config.sample_size) * 8,  # W (latent size scaled to pixels)
+    )
+    # Both noise tensors are latent-sized, so they must honor the same latent_h/latent_w overrides.
+    noise_shape = (
+        1,  # B
+        4,  # C
+        args.latent_h or pipe.unet.config.sample_size,  # H
+        args.latent_w or pipe.unet.config.sample_size,  # W
+    )
+
+    float_value_shape = (
+        1,
+        1,
+    )
+    # Fixed example scalars used for tracing and for the parity check below.
+    sqrtAlphasCumprodTorchShape = torch.tensor([[0.2,]])
+    sqrtOneMinusAlphasCumprodTorchShape = torch.tensor([[0.8,]])
+
+    sample_vae_encoder_inputs = {
+        "sample": torch.rand(*sample_shape, dtype=torch.float16),
+        "diagonalNoise": torch.rand(*noise_shape, dtype=torch.float16),
+        "noise": torch.rand(*noise_shape, dtype=torch.float16),
+        "sqrtAlphasCumprod": torch.rand(*float_value_shape, dtype=torch.float16),
+        "sqrtOneMinusAlphasCumprod": torch.rand(*float_value_shape, dtype=torch.float16),
+    }
+
+    class VAEEncoder(nn.Module):
+        """ nn.Module wrapper for the VAE encode path (encoder + quant_conv), latent scaling and noise mixing
+        """
+
+        def __init__(self):
+            super().__init__()
+            self.quant_conv = pipe.vae.quant_conv
+            self.alphas_cumprod = pipe.scheduler.alphas_cumprod
+            self.encoder = pipe.vae.encoder
+
+        # Because CoreMLTools does not support the torch.randn op, we pass in both
+        # the diagonal noise for the `DiagonalGaussianDistribution` operation and
+        # the noise tensor combined with precalculated `sqrtAlphasCumprod` and `sqrtOneMinusAlphasCumprod`
+        # for faster computation.
+        def forward(self, sample, diagonalNoise, noise, sqrtAlphasCumprod, sqrtOneMinusAlphasCumprod):
+            h = self.encoder(sample)
+            moments = self.quant_conv(h)
+            diagonalNoise = diagonalNoise.to(sample.device)
+            posterior = CoreMLDiagonalGaussianDistribution(moments, diagonalNoise)
+            posteriorSample = posterior.sample()
+
+            # Add the scaling operation and the latent noise for faster computation
+            init_latents = 0.18215 * posteriorSample
+            result = self.add_noise(init_latents, noise, sqrtAlphasCumprod, sqrtOneMinusAlphasCumprod)
+            return result
+
+        def add_noise(
+            self,
+            original_samples: torch.FloatTensor,
+            noise: torch.FloatTensor,
+            sqrtAlphasCumprod: torch.FloatTensor,
+            sqrtOneMinusAlphasCumprod: torch.FloatTensor
+        ) -> torch.FloatTensor:
+            noise = noise.to(original_samples.device)
+            sqrtAlphasCumprod = sqrtAlphasCumprod.to(original_samples.device)
+            sqrtOneMinusAlphasCumprod = sqrtOneMinusAlphasCumprod.to(original_samples.device)
+            noisy_samples = sqrtAlphasCumprod * original_samples + sqrtOneMinusAlphasCumprod * noise
+            return noisy_samples
+
+
+    baseline_encoder = VAEEncoder().eval()
+
+    # No optimization needed for the VAE Encoder as it is a pure ConvNet
+    traced_vae_encoder = torch.jit.trace(
+        baseline_encoder, (
+            sample_vae_encoder_inputs["sample"].to(torch.float32),
+            sample_vae_encoder_inputs["diagonalNoise"].to(torch.float32),
+            sample_vae_encoder_inputs["noise"].to(torch.float32),
+            sqrtAlphasCumprodTorchShape.to(torch.float32),
+            sqrtOneMinusAlphasCumprodTorchShape.to(torch.float32)
+        ))
+
+    modify_coremltools_torch_frontend_badbmm()
+    coreml_vae_encoder, out_path = _convert_to_coreml(
+        "vae_encoder", traced_vae_encoder, sample_vae_encoder_inputs,
+        ["latent_dist"], args)
+
+    # Set model metadata
+    coreml_vae_encoder.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}"
+    coreml_vae_encoder.license = "OpenRAIL (https://huggingface.co/spaces/CompVis/stable-diffusion-license)"
+    coreml_vae_encoder.version = args.model_version
+    coreml_vae_encoder.short_description = \
+        "Stable Diffusion generates images conditioned on text and/or other images as input through the diffusion process. " \
+        "Please refer to https://arxiv.org/abs/2112.10752 for details."
+
+    # Set the input descriptions
+    coreml_vae_encoder.input_description["sample"] = \
+        "An image of the correct size to create the latent space with, image2image and in-painting."
+    coreml_vae_encoder.input_description["diagonalNoise"] = \
+        "Latent noise for `DiagonalGaussianDistribution` operation."
+    coreml_vae_encoder.input_description["noise"] = \
+        "Latent noise for use with strength parameter of image2image"
+    coreml_vae_encoder.input_description["sqrtAlphasCumprod"] = \
+        "Precalculated `sqrtAlphasCumprod` value based on strength and the current schedular's alphasCumprod values"
+    coreml_vae_encoder.input_description["sqrtOneMinusAlphasCumprod"] = \
+        "Precalculated `sqrtOneMinusAlphasCumprod` value based on strength and the current schedular's alphasCumprod values"
+
+    # Set the output descriptions
+    coreml_vae_encoder.output_description[
+        "latent_dist"] = "The latent embeddings from the unet model from the input image for image2image."
+
+    _save_mlpackage(coreml_vae_encoder, out_path)
+
+    logger.info(f"Saved vae_encoder into {out_path}")
+
+    # Parity check PyTorch vs CoreML (same fixed scalars as used for tracing)
+    if args.check_output_correctness:
+        baseline_out = baseline_encoder(
+            sample=sample_vae_encoder_inputs["sample"].to(torch.float32),
+            diagonalNoise=sample_vae_encoder_inputs["diagonalNoise"].to(torch.float32),
+            noise=sample_vae_encoder_inputs["noise"].to(torch.float32),
+            sqrtAlphasCumprod=sqrtAlphasCumprodTorchShape,
+            sqrtOneMinusAlphasCumprod=sqrtOneMinusAlphasCumprodTorchShape,
+            ).numpy(),  # trailing comma: baseline_out is a 1-tuple, unwrapped with [0] below
+
+        coreml_out = list(
+            coreml_vae_encoder.predict(
+                {
+                    "sample": sample_vae_encoder_inputs["sample"].numpy(),
+                    "diagonalNoise": sample_vae_encoder_inputs["diagonalNoise"].numpy(),
+                    "noise": sample_vae_encoder_inputs["noise"].numpy(),
+                    "sqrtAlphasCumprod": sqrtAlphasCumprodTorchShape.numpy(),
+                    "sqrtOneMinusAlphasCumprod": sqrtOneMinusAlphasCumprodTorchShape.numpy()
+                }).values())
+
+        report_correctness(baseline_out[0], coreml_out[0],
+                           "vae_encoder baseline PyTorch to baseline CoreML")
+
+    del traced_vae_encoder, pipe.vae.encoder, coreml_vae_encoder
+    gc.collect()
+
+
 def convert_unet(pipe, args):
     """ Converts the UNet component of Stable Diffusion
     """
@@ -801,7 +976,12 @@ def main(args):
         logger.info("Converting vae_decoder")
         convert_vae_decoder(pipe, args)
         logger.info("Converted vae_decoder")
-
+        
+    if args.convert_vae_encoder:
+        logger.info("Converting vae_encoder")
+        convert_vae_encoder(pipe, args)
+        logger.info("Converted vae_encoder")
+        
     if args.convert_unet:
         logger.info("Converting unet")
         convert_unet(pipe, args)
@@ -835,6 +1015,7 @@ def parser_spec():
     # Select which models to export (All are needed for text-to-image pipeline to function)
     parser.add_argument("--convert-text-encoder", action="store_true")
     parser.add_argument("--convert-vae-decoder", action="store_true")
+    parser.add_argument("--convert-vae-encoder", action="store_true")
     parser.add_argument("--convert-unet", action="store_true")
     parser.add_argument("--convert-safety-checker", action="store_true")
     parser.add_argument(

From 41d4637350bfe9ddea552aa039f3dc5ab5e9e194 Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Sun, 18 Dec 2022 23:43:30 +0900
Subject: [PATCH 02/12] Image2Image Encoder

Encoder
---
 .../pipeline/AlphasCumprodCalculation.swift   |  29 +++++
 .../pipeline/CGImage+vImage.swift             | 120 ++++++++++++++++++
 swift/StableDiffusion/pipeline/Decoder.swift  |  52 +-------
 swift/StableDiffusion/pipeline/Encoder.swift  |  80 ++++++++++++
 4 files changed, 231 insertions(+), 50 deletions(-)
 create mode 100644 swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
 create mode 100644 swift/StableDiffusion/pipeline/CGImage+vImage.swift
 create mode 100644 swift/StableDiffusion/pipeline/Encoder.swift

diff --git a/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
new file mode 100644
index 00000000..350ba3a7
--- /dev/null
+++ b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
@@ -0,0 +1,29 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright (C) 2022 Apple Inc. All Rights Reserved.
+
+import Foundation
+
+public struct AlphasCumprodCalculation {
+    public var sqrtAlphasCumprod: Float
+    public var sqrtOneMinusAlphasCumprod: Float
+    /// Stores two already-computed coefficients unchanged.
+    public init(
+        sqrtAlphasCumprod: Float,
+        sqrtOneMinusAlphasCumprod: Float
+    ) {
+        self.sqrtAlphasCumprod = sqrtAlphasCumprod
+        self.sqrtOneMinusAlphasCumprod = sqrtOneMinusAlphasCumprod
+    }
+    /// Derives both coefficients from the `alphasCumprod` entry selected by `timesteps`, `steps` and `strength`.
+    public init(
+        alphasCumprod: [Float],
+        timesteps: Int = 1_000,
+        steps: Int,
+        strength: Float
+    ) {
+        let tEnc = Int(strength * Float(steps)) // the unclamped index below is timesteps + 1 when tEnc == steps
+        let initTimestep = min(max(0, timesteps - timesteps / steps * (steps - tEnc) + 1), alphasCumprod.count - 1)
+        self.sqrtAlphasCumprod = alphasCumprod[initTimestep].squareRoot()
+        self.sqrtOneMinusAlphasCumprod = (1 - alphasCumprod[initTimestep]).squareRoot()
+    }
+}
diff --git a/swift/StableDiffusion/pipeline/CGImage+vImage.swift b/swift/StableDiffusion/pipeline/CGImage+vImage.swift
new file mode 100644
index 00000000..809836e7
--- /dev/null
+++ b/swift/StableDiffusion/pipeline/CGImage+vImage.swift
@@ -0,0 +1,120 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright (C) 2022 Apple Inc. All Rights Reserved.
+
+import Foundation
+import Accelerate
+import CoreML
+
+@available(iOS 16.0, macOS 13.0, *)
+extension CGImage {
+    // Shorthand names for the vImage pixel-buffer layouts used below.
+    typealias PixelBufferPFx1 = vImage.PixelBuffer<vImage.PlanarF>
+    typealias PixelBufferP8x3 = vImage.PixelBuffer<vImage.Planar8x3>
+    typealias PixelBufferIFx3 = vImage.PixelBuffer<vImage.InterleavedFx3>
+    typealias PixelBufferI8x3 = vImage.PixelBuffer<vImage.Interleaved8x3>
+    /// Errors thrown while converting between `CGImage` and `MLShapedArray`.
+    public enum ShapedArrayError: String, Swift.Error {
+        case wrongNumberOfChannels
+        case incorrectFormatsConvertingToShapedArray
+        case vImageConverterNotInitialized
+    }
+    /// Builds an 8-bit RGB `CGImage` from a `[N,C,H,W]` array of values in [-1, 1]; throws `wrongNumberOfChannels` unless C == 3.
+    public static func fromShapedArray(_ array: MLShapedArray<Float32>) throws -> CGImage {
+        // Only the first image of the batch (`array[0]`) is converted.
+        // array is [N,C,H,W], where C==3
+        let channelCount = array.shape[1]
+        guard channelCount == 3 else {
+            throw ShapedArrayError.wrongNumberOfChannels
+        }
+        
+        let height = array.shape[2]
+        let width = array.shape[3]
+
+        // Normalize each channel into a float between 0 and 1.0
+        let floatChannels = (0..<channelCount).map { i in
+
+            // Normalized channel output
+            let cOut = PixelBufferPFx1(width: width, height:height)
+
+            // Reference this channel in the array and normalize
+            array[0][i].withUnsafeShapedBufferPointer { ptr, _, strides in
+                let cIn = PixelBufferPFx1(data: .init(mutating: ptr.baseAddress!),
+                                          width: width, height: height,
+                                          byteCountPerRow: strides[0]*4)
+                // Map [-1.0 1.0] -> [0.0 1.0]
+                cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut)
+            }
+            return cOut
+        }
+
+        // Convert to interleaved and then to UInt8
+        let floatImage = PixelBufferIFx3(planarBuffers: floatChannels)
+        let uint8Image = PixelBufferI8x3(width: width, height: height)
+        floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips
+
+        // Convert to uint8x3 to RGB CGImage (no alpha)
+        let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue)
+        let cgImage = uint8Image.makeCGImage(cgImageFormat:
+                .init(bitsPerComponent: 8,
+                      bitsPerPixel: 3*8,
+                      colorSpace: CGColorSpaceCreateDeviceRGB(),
+                      bitmapInfo: bitmapInfo)!)!
+
+        return cgImage
+    }
+    /// The image as a planar `[1, 3, height, width]` Float32 array (R, G, B planes; alpha dropped), 8-bit values mapped to [-1, 1].
+    public var plannerRGBShapedArray: MLShapedArray<Float32> {
+        get throws {
+            guard
+                var sourceFormat = vImage_CGImageFormat(cgImage: self),
+                var mediumFormat = vImage_CGImageFormat(
+                    bitsPerComponent: 8 * MemoryLayout<UInt8>.size,
+                    bitsPerPixel: 8 * MemoryLayout<UInt8>.size * 4,
+                    colorSpace: CGColorSpaceCreateDeviceRGB(),
+                    bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.first.rawValue)),
+                let width = vImagePixelCount(exactly: self.width),
+                let height = vImagePixelCount(exactly: self.height)
+            else {
+                throw ShapedArrayError.incorrectFormatsConvertingToShapedArray
+            }
+            
+            var sourceImageBuffer = try vImage_Buffer(cgImage: self)
+            defer { sourceImageBuffer.free() } // vImage_Buffer storage is caller-owned; release it on every exit path
+            var mediumDesination = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: mediumFormat.bitsPerPixel)
+            defer { mediumDesination.free() }
+            let converter = vImageConverter_CreateWithCGImageFormat(
+                &sourceFormat,
+                &mediumFormat,
+                nil,
+                vImage_Flags(kvImagePrintDiagnosticsToConsole),
+                nil)
+            
+            guard let converter = converter?.takeRetainedValue() else {
+                throw ShapedArrayError.vImageConverterNotInitialized
+            }
+            
+            vImageConvert_AnyToAny(converter, &sourceImageBuffer, &mediumDesination, nil, vImage_Flags(kvImagePrintDiagnosticsToConsole))
+            
+            var destinationA = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
+            var destinationR = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
+            var destinationG = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
+            var destinationB = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
+            defer { destinationA.free(); destinationR.free(); destinationG.free(); destinationB.free() }
+            var minFloat: [Float] = [-1.0, -1.0, -1.0, -1.0]
+            var maxFloat: [Float] = [1.0, 1.0, 1.0, 1.0]
+            
+            vImageConvert_ARGB8888toPlanarF(&mediumDesination, &destinationA, &destinationR, &destinationG, &destinationB, &maxFloat, &minFloat, .zero)
+            // NOTE(review): the copies below assume each plane has rowBytes == width * 4 (no row padding) — confirm.
+            let redData = Data(bytes: destinationR.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
+            let greenData = Data(bytes: destinationG.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
+            let blueData = Data(bytes: destinationB.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
+            
+            let imageData = redData + greenData + blueData
+            // Use the image's own dimensions instead of a fixed 512x512 so the shape always matches the data.
+            let shapedArray = MLShapedArray<Float32>(data: imageData, shape: [1, 3, self.height, self.width])
+            
+            return shapedArray
+        }
+    }
+}
+
diff --git a/swift/StableDiffusion/pipeline/Decoder.swift b/swift/StableDiffusion/pipeline/Decoder.swift
index 04f04ba6..e9b2c704 100644
--- a/swift/StableDiffusion/pipeline/Decoder.swift
+++ b/swift/StableDiffusion/pipeline/Decoder.swift
@@ -3,7 +3,6 @@
 
 import Foundation
 import CoreML
-import Accelerate
 
 /// A decoder model which produces RGB images from latent samples
 @available(iOS 16.2, macOS 13.1, *)
@@ -57,12 +56,11 @@ public struct Decoder: ResourceManaging {
         }
 
         // Transform the outputs to CGImages
-        let images: [CGImage] = (0..<results.count).map { i in
+        let images: [CGImage] = try (0..<results.count).map { i in
             let result = results.features(at: i)
             let outputName = result.featureNames.first!
             let output = result.featureValue(for: outputName)!.multiArrayValue!
-
-            return toRGBCGImage(MLShapedArray<Float32>(output))
+            return try CGImage.fromShapedArray(MLShapedArray<Float32>(output))
         }
 
         return images
@@ -74,50 +72,4 @@ public struct Decoder: ResourceManaging {
         }
     }
 
-    typealias PixelBufferPFx1 = vImage.PixelBuffer<vImage.PlanarF>
-    typealias PixelBufferP8x3 = vImage.PixelBuffer<vImage.Planar8x3>
-    typealias PixelBufferIFx3 = vImage.PixelBuffer<vImage.InterleavedFx3>
-    typealias PixelBufferI8x3 = vImage.PixelBuffer<vImage.Interleaved8x3>
-
-    func toRGBCGImage(_ array: MLShapedArray<Float32>) -> CGImage {
-
-        // array is [N,C,H,W], where C==3
-        let channelCount = array.shape[1]
-        assert(channelCount == 3,
-               "Decoding model output has \(channelCount) channels, expected 3")
-        let height = array.shape[2]
-        let width = array.shape[3]
-
-        // Normalize each channel into a float between 0 and 1.0
-        let floatChannels = (0..<channelCount).map { i in
-
-            // Normalized channel output
-            let cOut = PixelBufferPFx1(width: width, height:height)
-
-            // Reference this channel in the array and normalize
-            array[0][i].withUnsafeShapedBufferPointer { ptr, _, strides in
-                let cIn = PixelBufferPFx1(data: .init(mutating: ptr.baseAddress!),
-                                          width: width, height: height,
-                                          byteCountPerRow: strides[0]*4)
-                // Map [-1.0 1.0] -> [0.0 1.0]
-                cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut)
-            }
-            return cOut
-        }
-
-        // Convert to interleaved and then to UInt8
-        let floatImage = PixelBufferIFx3(planarBuffers: floatChannels)
-        let uint8Image = PixelBufferI8x3(width: width, height: height)
-        floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips
-
-        // Convert to uint8x3 to RGB CGImage (no alpha)
-        let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue)
-        let cgImage = uint8Image.makeCGImage(cgImageFormat:
-                .init(bitsPerComponent: 8,
-                      bitsPerPixel: 3*8,
-                      colorSpace: CGColorSpaceCreateDeviceRGB(),
-                      bitmapInfo: bitmapInfo)!)!
-
-        return cgImage
-    }
 }
diff --git a/swift/StableDiffusion/pipeline/Encoder.swift b/swift/StableDiffusion/pipeline/Encoder.swift
new file mode 100644
index 00000000..043865cb
--- /dev/null
+++ b/swift/StableDiffusion/pipeline/Encoder.swift
@@ -0,0 +1,80 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright (C) 2022 Apple Inc. All Rights Reserved.
+
+import Foundation
+import CoreML
+
+@available(iOS 16.0, macOS 13.0, *)
+/// Encoder, currently supports image2image
+public struct Encoder {
+    
+    public enum Error: String, Swift.Error {
+        case latentOutputNotValid
+        case batchLatentOutputEmpty
+    }
+    
+    /// VAE encoder model + post math and adding noise from scheduler
+    var model: MLModel
+    
+    /// Create encoder from Core ML model
+    ///
+    /// - Parameters
+    ///     - model: Core ML model for VAE encoder
+    public init(model: MLModel) {
+        self.model = model
+    }
+    
+    /// Prediction queue
+    let queue = DispatchQueue(label: "encoder.predict")
+    /// Encode an image into a noised latent sample
+    /// - Parameters:
+    ///   - image: image used for image2image
+    ///   - diagonalNoise: random noise for `DiagonalGaussianDistribution` operation
+    ///   - noise: random noise for initial latent space based on strength argument
+    ///   - alphasCumprodStep: calculations using the scheduler traditionally calculated in the pipeline in pyTorch Diffusers library.
+    /// - Returns: The encoded latent space as MLShapedArray
+    /// - Throws: `Error.latentOutputNotValid` / `Error.batchLatentOutputEmpty`, or any error from image conversion or prediction
+    public func encode(
+        image:  CGImage,
+        diagonalNoise: MLShapedArray<Float32>,
+        noise: MLShapedArray<Float32>,
+        alphasCumprodStep: AlphasCumprodCalculation
+    ) throws -> MLShapedArray<Float32> {
+        let sample = try image.plannerRGBShapedArray
+        let sqrtAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtAlphasCumprod], shape: [1, 1])
+        let sqrtOneMinusAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtOneMinusAlphasCumprod], shape: [1, 1])
+        
+        let dict: [String: Any] = [
+            "sample": MLMultiArray(sample),
+            "diagonalNoise": MLMultiArray(diagonalNoise),
+            "noise": MLMultiArray(noise),
+            "sqrtAlphasCumprod": MLMultiArray(sqrtAlphasCumprod),
+            "sqrtOneMinusAlphasCumprod": MLMultiArray(sqrtOneMinusAlphasCumprod),
+        ]
+        let featureProvider = try MLDictionaryFeatureProvider(dictionary: dict)
+        
+        let batch = MLArrayBatchProvider(array: [featureProvider])
+
+        // Batch predict with model
+        let results = try queue.sync { try model.predictions(fromBatch: batch) }
+        
+        let batchLatents: [MLShapedArray<Float32>] = try (0..<results.count).map { i in
+            let result = results.features(at: i)
+            guard
+                let outputName = result.featureNames.first,
+                let output = result.featureValue(for: outputName)?.multiArrayValue
+            else {
+                throw Error.latentOutputNotValid
+            }
+            // Every element either converts or throws above, so a plain `map` suffices.
+            return MLShapedArray<Float32>(output)
+        }
+        
+        guard let latents = batchLatents.first else {
+            throw Error.batchLatentOutputEmpty
+        }
+        
+        return latents
+    }
+    
+}

From effc166c62b446aab487e3ea4c9e9ef8a7a77ca6 Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Mon, 19 Dec 2022 00:38:04 +0900
Subject: [PATCH 03/12] Scheduler and pipeline

---
 .../DPMSolverMultistepScheduler.swift         |  6 +-
 .../StableDiffusion/pipeline/Scheduler.swift  | 41 +++++++++++-
 .../StableDiffusionPipeline+Resources.swift   | 13 ++++
 .../pipeline/StableDiffusionPipeline.swift    | 65 ++++++++++++++++++-
 4 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
index 1fbfff1c..3e8342ce 100644
--- a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
+++ b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
@@ -23,12 +23,16 @@ public final class DPMSolverMultistepScheduler: Scheduler {
     public let betas: [Float]
     public let alphas: [Float]
     public let alphasCumProd: [Float]
-    public let timeSteps: [Int]
+    private let timeSteps: [Int]
 
     public let alpha_t: [Float]
     public let sigma_t: [Float]
     public let lambda_t: [Float]
     
+    public var allTimeSteps: [Int] { // read-only public view of the private `timeSteps` storage
+        timeSteps
+    }
+    
     public let solverOrder = 2
     private(set) var lowerOrderStepped = 0
     
diff --git a/swift/StableDiffusion/pipeline/Scheduler.swift b/swift/StableDiffusion/pipeline/Scheduler.swift
index 0bd92840..d0ed12e3 100644
--- a/swift/StableDiffusion/pipeline/Scheduler.swift
+++ b/swift/StableDiffusion/pipeline/Scheduler.swift
@@ -10,9 +10,12 @@ public protocol Scheduler {
 
     /// Number of inference steps to be performed
     var inferenceStepCount: Int { get }
+    
+    /// All training diffusion time steps, indexed by inference time step
+    var allTimeSteps: [Int] { get }
 
     /// Training diffusion time steps index by inference time step
-    var timeSteps: [Int] { get }
+    func calculateTimesteps(strength: Float?) -> [Int]
 
     /// Schedule of betas which controls the amount of noise added at each timestep
     var betas: [Float] { get }
@@ -71,6 +74,35 @@ public extension Scheduler {
     }
 }
 
+// MARK: - Image2Image
+
+@available(iOS 16.2, macOS 13.1, *)
+public extension Scheduler {
+    /// Builds an `AlphasCumprodCalculation` for `strength` from this scheduler's `alphasCumProd`, `trainStepCount` and `inferenceStepCount`.
+    func calculateAlphasCumprod(strength: Float) -> AlphasCumprodCalculation {
+        AlphasCumprodCalculation(
+            alphasCumprod: alphasCumProd,
+            timesteps: trainStepCount,
+            steps: inferenceStepCount,
+            strength: strength)
+    }
+}
+
+// MARK: - Timesteps
+
+@available(iOS 16.2, macOS 13.1, *)
+public extension Scheduler {
+    /// Returns `allTimeSteps` reversed or, when `strength` is given, only its first `inferenceStepCount * strength` entries, reversed.
+    func calculateTimesteps(strength: Float?) -> [Int] {
+        guard let strength else { return allTimeSteps.reversed() }
+        let startStep = min(max(0, Int(Float(inferenceStepCount) * strength)), allTimeSteps.count) // clamp so the slice cannot trap
+        let actualTimesteps = Array(allTimeSteps[0..<startStep].reversed())
+        return actualTimesteps
+    }
+}
+
+// MARK: - BetaSchedule
+
 /// How to map a beta range to a sequence of betas to step over
 @available(iOS 16.2, macOS 13.1, *)
 public enum BetaSchedule {
@@ -80,6 +112,7 @@ public enum BetaSchedule {
     case scaledLinear
 }
 
+// MARK: - PNDMScheduler
 
 /// A scheduler used to compute a de-noised image
 ///
@@ -94,7 +127,11 @@ public final class PNDMScheduler: Scheduler {
     public let betas: [Float]
     public let alphas: [Float]
     public let alphasCumProd: [Float]
-    public let timeSteps: [Int]
+    private let timeSteps: [Int]
+    
+    public var allTimeSteps: [Int] { // read-only public view of the private `timeSteps` storage
+        timeSteps
+    }
 
     // Internal state
     var counter: Int
diff --git a/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift b/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift
index 65c6e03c..d4d703a7 100644
--- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift
+++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift
@@ -14,6 +14,7 @@ public extension StableDiffusionPipeline {
         public let unetChunk1URL: URL
         public let unetChunk2URL: URL
         public let decoderURL: URL
+        public let encoderURL: URL
         public let safetyCheckerURL: URL
         public let vocabURL: URL
         public let mergesURL: URL
@@ -24,6 +25,7 @@ public extension StableDiffusionPipeline {
             unetChunk1URL = baseURL.appending(path: "UnetChunk1.mlmodelc")
             unetChunk2URL = baseURL.appending(path: "UnetChunk2.mlmodelc")
             decoderURL = baseURL.appending(path: "VAEDecoder.mlmodelc")
+            encoderURL = baseURL.appending(path: "VAEEncoder.mlmodelc")
             safetyCheckerURL = baseURL.appending(path: "SafetyChecker.mlmodelc")
             vocabURL = baseURL.appending(path: "vocab.json")
             mergesURL = baseURL.appending(path: "merges.txt")
@@ -74,11 +76,22 @@ public extension StableDiffusionPipeline {
             FileManager.default.fileExists(atPath: urls.safetyCheckerURL.path) {
             safetyChecker = SafetyChecker(modelAt: urls.safetyCheckerURL, configuration: config)
         }
+        
+        // Optional Image Encoder
+        let encoder: Encoder?
+        if
+            let encoderModel = try? MLModel(contentsOf: urls.encoderURL, configuration: config)
+        {
+            encoder = Encoder(model: encoderModel)
+        } else {
+            encoder = nil
+        }
 
         // Construct pipeline
         self.init(textEncoder: textEncoder,
                   unet: unet,
                   decoder: decoder,
+                  encoder: encoder,
                   safetyChecker: safetyChecker,
                   reduceMemory: reduceMemory)
     }
diff --git a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
index ea654723..66bf4afd 100644
--- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
+++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
@@ -20,6 +20,10 @@ public enum StableDiffusionScheduler {
 /// [Hugging Face Diffusers Pipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py)
 @available(iOS 16.2, macOS 13.1, *)
 public struct StableDiffusionPipeline: ResourceManaging {
+    
+    public enum Error: String, Swift.Error {
+        case startingImageProvidedWithoutEncoder // thrown by generateImages when startingImage is set but encoder is nil
+    }
 
     /// Model to generate embeddings for tokenized input text
     var textEncoder: TextEncoder
@@ -29,6 +33,9 @@ public struct StableDiffusionPipeline: ResourceManaging {
 
     /// Model used to generate final image from latent diffusion process
     var decoder: Decoder
+    
+    /// Model used to encode a starting image into latent space for image2image, and soon, in-painting
+    var encoder: Encoder?
 
     /// Optional model for checking safety of generated image
     var safetyChecker: SafetyChecker? = nil
@@ -58,11 +65,13 @@ public struct StableDiffusionPipeline: ResourceManaging {
     public init(textEncoder: TextEncoder,
                 unet: Unet,
                 decoder: Decoder,
+                encoder: Encoder?,
                 safetyChecker: SafetyChecker? = nil,
                 reduceMemory: Bool = false) {
         self.textEncoder = textEncoder
         self.unet = unet
         self.decoder = decoder
+        self.encoder = encoder
         self.safetyChecker = safetyChecker
         self.reduceMemory = reduceMemory
     }
@@ -114,6 +123,8 @@ public struct StableDiffusionPipeline: ResourceManaging {
     public func generateImages(
         prompt: String,
         negativePrompt: String = "",
+        startingImage: CGImage? = nil,
+        strength: Float = 1.0,
         imageCount: Int = 1,
         stepCount: Int = 50,
         seed: UInt32 = 0,
@@ -150,10 +161,31 @@ public struct StableDiffusionPipeline: ResourceManaging {
         let stdev = scheduler[0].initNoiseSigma
 
         // Generate random latent samples from specified seed
-        var latents = generateLatentSamples(imageCount, stdev: stdev, seed: seed)
+        var latents: [MLShapedArray<Float32>]
+        let timestepStrength: Float?
+        
+        if let startingImage {
+            timestepStrength = strength
+            guard let encoder else {
+                throw Error.startingImageProvidedWithoutEncoder
+            }
+            let noiseTuples = generateImage2ImageLatentSamples(imageCount, stdev: 1, seed: seed)
+            latents = try noiseTuples.map({
+                try encoder.encode(
+                    image: startingImage,
+                    diagonalNoise: $0.diagonal,
+                    noise: $0.latentNoise,
+                    alphasCumprodStep: scheduler[0].calculateAlphasCumprod(strength: strength))
+            })
+        } else {
+            timestepStrength = nil
+            // Generate random latent samples from specified seed
+            latents = generateLatentSamples(imageCount, stdev: stdev, seed: seed)
+        }
 
         // De-noising loop
-        for (step,t) in scheduler[0].timeSteps.enumerated() {
+        let timeSteps = scheduler[0].calculateTimesteps(strength: timestepStrength)
+        for (step,t) in timeSteps.enumerated() {
 
             // Expand the latents for classifier-free guidance
             // and input to the Unet noise prediction model
@@ -215,6 +247,35 @@ public struct StableDiffusionPipeline: ResourceManaging {
         }
         return samples
     }
+    
+    
+    /// For image2image - generates the noise samples passed to the encoder along with the starting image.
+    /// - Parameters:
+    ///   - count: batch size
+    ///   - stdev: 1
+    ///   - seed: seed provided
+    ///   - diagonalAndLatentNoiseIsSame: Diffusions library does not seem to use the same noise for the `DiagonalGaussianDistribution` operation,
+    ///     but I have seen implementations of pipelines where it is the same.
+    /// - Returns: An array of tuples of noise values with length of batch size.
+    func generateImage2ImageLatentSamples(_ count: Int, stdev: Float, seed: Int, diagonalAndLatentNoiseIsSame: Bool = false) -> [(diagonal: MLShapedArray<Float32>, latentNoise: MLShapedArray<Float32>)] {
+        var sampleShape = unet.latentSampleShape
+        sampleShape[0] = 1
+
+        var random = NumPyRandomSource(seed: UInt32(truncatingIfNeeded: seed))
+        let samples = (0..<count).map { _ in
+            if diagonalAndLatentNoiseIsSame {
+                let noise = MLShapedArray<Float32>(
+                    converting: random.normalShapedArray(sampleShape, mean: 0.0, stdev: Double(stdev)))
+                return (noise, noise)
+            } else {
+                return (MLShapedArray<Float32>(
+                    converting: random.normalShapedArray(sampleShape, mean: 0.0, stdev: Double(stdev))),
+                        MLShapedArray<Float32>(
+                            converting: random.normalShapedArray(sampleShape, mean: 0.0, stdev: Double(stdev))))
+            }
+        }
+        return samples
+    }
 
     func toHiddenStates(_ embedding: MLShapedArray<Float32>) -> MLShapedArray<Float32> {
         // Unoptimized manual transpose [0, 2, None, 1]

From 0a1b2b87b314200422cb978d4263e78c3255b5eb Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Mon, 19 Dec 2022 01:23:31 +0900
Subject: [PATCH 04/12] fix scheduler

---
 swift/StableDiffusion/pipeline/Scheduler.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swift/StableDiffusion/pipeline/Scheduler.swift b/swift/StableDiffusion/pipeline/Scheduler.swift
index d0ed12e3..cfaa71cf 100644
--- a/swift/StableDiffusion/pipeline/Scheduler.swift
+++ b/swift/StableDiffusion/pipeline/Scheduler.swift
@@ -179,7 +179,7 @@ public final class PNDMScheduler: Scheduler {
         timeSteps.append(contentsOf: forwardSteps.dropLast(1))
         timeSteps.append(timeSteps.last!)
         timeSteps.append(forwardSteps.last!)
-        timeSteps.reverse()
+        // do no revers timeSteps, this is now done in `calculateTimesteps` function
 
         self.timeSteps = timeSteps
         self.counter = 0

From cacf99d130225e114c67b91df83db673ca9dddaa Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Mon, 19 Dec 2022 01:23:40 +0900
Subject: [PATCH 05/12] cli

---
 swift/StableDiffusionCLI/main.swift | 40 ++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/swift/StableDiffusionCLI/main.swift b/swift/StableDiffusionCLI/main.swift
index 7343cf2b..1b1c8573 100644
--- a/swift/StableDiffusionCLI/main.swift
+++ b/swift/StableDiffusionCLI/main.swift
@@ -32,6 +32,12 @@ struct StableDiffusionSample: ParsableCommand {
         )
     )
     var resourcePath: String = "./"
+    
+    @Option(help: "Path to starting image.")
+    var image: String = "none"
+    
+    @Option(help: "Strength for image2image.")
+    var strength: Float = 0.5
 
     @Option(help: "Number of images to sample / generate")
     var imageCount: Int = 1
@@ -51,7 +57,7 @@ struct StableDiffusionSample: ParsableCommand {
     var outputPath: String = "./"
 
     @Option(help: "Random seed")
-    var seed: UInt32 = 93
+    var seed: UInt32 = UInt32.random(in: 0...UInt32.max)
 
     @Option(help: "Controls the influence of the text prompt on sampling process (0=random images)")
     var guidanceScale: Float = 7.5
@@ -84,6 +90,32 @@ struct StableDiffusionSample: ParsableCommand {
                                                    disableSafety: disableSafety,
                                                    reduceMemory: reduceMemory)
         try pipeline.loadResources()
+        
+        let startingImage: CGImage?
+        if image != "none" {
+            let imageURL = URL(filePath: image)
+//            if FileManager.default.fileExists(atPath: imageURL.path()) {
+//                throw RunError.resources("Starting image not found \(imageURL)")
+//            }
+            do {
+                let imageData = try Data(contentsOf: imageURL)
+                guard
+                    let imgDataProvider = CGDataProvider(data: imageData as CFData),
+                    let loadedImage = CGImage(
+                        pngDataProviderSource: imgDataProvider,
+                        decode: nil, shouldInterpolate: false,
+                        intent: CGColorRenderingIntent.defaultIntent)
+                else {
+                    throw RunError.resources("Starting Image not available \(resourcePath)")
+                }
+                startingImage = loadedImage
+            } catch let error {
+                throw RunError.resources("Starting image not found \(imageURL), error: \(error)")
+            }
+            
+        } else {
+            startingImage = nil
+        }
 
         log("Sampling ...\n")
         let sampleTimer = SampleTimer()
@@ -92,6 +124,8 @@ struct StableDiffusionSample: ParsableCommand {
         let images = try pipeline.generateImages(
             prompt: prompt,
             negativePrompt: negativePrompt,
+            startingImage: startingImage,
+            strength: strength,
             imageCount: imageCount,
             stepCount: stepCount,
             seed: seed,
@@ -168,6 +202,10 @@ struct StableDiffusionSample: ParsableCommand {
         if imageCount != 1 {
             name += ".\(sample)"
         }
+        
+        if image != "none" {
+            name += ".str\(Int(strength * 100))"
+        }
 
         name += ".\(seed)"
 

From ac158dc791ec789b42caacc5bdb7f24c343d4241 Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Mon, 19 Dec 2022 02:07:19 +0900
Subject: [PATCH 06/12] remove CLI comment

---
 swift/StableDiffusionCLI/main.swift | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/swift/StableDiffusionCLI/main.swift b/swift/StableDiffusionCLI/main.swift
index 1b1c8573..aaf44295 100644
--- a/swift/StableDiffusionCLI/main.swift
+++ b/swift/StableDiffusionCLI/main.swift
@@ -94,9 +94,6 @@ struct StableDiffusionSample: ParsableCommand {
         let startingImage: CGImage?
         if image != "none" {
             let imageURL = URL(filePath: image)
-//            if FileManager.default.fileExists(atPath: imageURL.path()) {
-//                throw RunError.resources("Starting image not found \(imageURL)")
-//            }
             do {
                 let imageData = try Data(contentsOf: imageURL)
                 guard

From 685da66efc4b325a294f558692bc4a6a324294b6 Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Mon, 19 Dec 2022 02:10:00 +0900
Subject: [PATCH 07/12] disable dpm multistep solver with image2image

---
 .../pipeline/DPMSolverMultistepScheduler.swift     | 11 ++++++-----
 swift/StableDiffusion/pipeline/Scheduler.swift     | 12 ++++--------
 .../pipeline/StableDiffusionPipeline.swift         | 14 +++++++++++---
 3 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
index 3e8342ce..97c8525f 100644
--- a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
+++ b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
@@ -23,16 +23,12 @@ public final class DPMSolverMultistepScheduler: Scheduler {
     public let betas: [Float]
     public let alphas: [Float]
     public let alphasCumProd: [Float]
-    private let timeSteps: [Int]
+    public let timeSteps: [Int]
 
     public let alpha_t: [Float]
     public let sigma_t: [Float]
     public let lambda_t: [Float]
     
-    public var allTimeSteps: [Int] {
-        timeSteps
-    }
-    
     public let solverOrder = 2
     private(set) var lowerOrderStepped = 0
     
@@ -183,4 +179,9 @@ public final class DPMSolverMultistepScheduler: Scheduler {
         
         return prevSample
     }
+    
+    /// This scheduler does not support image2image strength value.
+    public func calculateTimesteps(strength: Float?) -> [Int] {
+        timeSteps
+    }
 }
diff --git a/swift/StableDiffusion/pipeline/Scheduler.swift b/swift/StableDiffusion/pipeline/Scheduler.swift
index cfaa71cf..d70a42df 100644
--- a/swift/StableDiffusion/pipeline/Scheduler.swift
+++ b/swift/StableDiffusion/pipeline/Scheduler.swift
@@ -12,7 +12,7 @@ public protocol Scheduler {
     var inferenceStepCount: Int { get }
     
     /// Training diffusion time steps index by inference time step
-    var allTimeSteps: [Int] { get }
+    var timeSteps: [Int] { get }
 
     /// Training diffusion time steps index by inference time step
     func calculateTimesteps(strength: Float?) -> [Int]
@@ -94,9 +94,9 @@ public extension Scheduler {
 public extension Scheduler {
     
     func calculateTimesteps(strength: Float?) -> [Int] {
-        guard let strength else { return allTimeSteps.reversed() }
+        guard let strength else { return timeSteps.reversed() }
         let startStep = Int(Float(inferenceStepCount) * strength)
-        let acutalTimesteps = Array(allTimeSteps[0..<startStep].reversed())
+        let acutalTimesteps = Array(timeSteps[0..<startStep].reversed())
         return acutalTimesteps
     }
 }
@@ -127,11 +127,7 @@ public final class PNDMScheduler: Scheduler {
     public let betas: [Float]
     public let alphas: [Float]
     public let alphasCumProd: [Float]
-    private let timeSteps: [Int]
-    
-    public var allTimeSteps: [Int] {
-        timeSteps
-    }
+    public let timeSteps: [Int]
 
     // Internal state
     var counter: Int
diff --git a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
index 66bf4afd..a0ba8247 100644
--- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
+++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
@@ -23,6 +23,7 @@ public struct StableDiffusionPipeline: ResourceManaging {
     
     public enum Error: String, Swift.Error {
         case startingImageProvidedWithoutEncoder
+        case schedulerNotSupportedWithImage2Image
     }
 
     /// Model to generate embeddings for tokenized input text
@@ -130,7 +131,7 @@ public struct StableDiffusionPipeline: ResourceManaging {
         seed: UInt32 = 0,
         guidanceScale: Float = 7.5,
         disableSafety: Bool = false,
-        scheduler: StableDiffusionScheduler = .pndmScheduler,
+        scheduler schedulerType: StableDiffusionScheduler = .pndmScheduler,
         progressHandler: (Progress) -> Bool = { _ in true }
     ) throws -> [CGImage?] {
 
@@ -153,7 +154,7 @@ public struct StableDiffusionPipeline: ResourceManaging {
 
         /// Setup schedulers
         let scheduler: [Scheduler] = (0..<imageCount).map { _ in
-            switch scheduler {
+            switch schedulerType {
             case .pndmScheduler: return PNDMScheduler(stepCount: stepCount)
             case .dpmSolverMultistepScheduler: return DPMSolverMultistepScheduler(stepCount: stepCount)
             }
@@ -169,6 +170,13 @@ public struct StableDiffusionPipeline: ResourceManaging {
             guard let encoder else {
                 throw Error.startingImageProvidedWithoutEncoder
             }
+            switch schedulerType {
+            case .pndmScheduler:
+                break
+            case .dpmSolverMultistepScheduler:
+                throw Error.schedulerNotSupportedWithImage2Image
+            }
+            
             let noiseTuples = generateImage2ImageLatentSamples(imageCount, stdev: 1, seed: seed)
             latents = try noiseTuples.map({
                 try encoder.encode(
@@ -184,7 +192,7 @@ public struct StableDiffusionPipeline: ResourceManaging {
         }
 
         // De-noising loop
-        let timeSteps = scheduler[0].calculateTimesteps(strength: timestepStrength)
+        let timeSteps: [Int] = scheduler[0].calculateTimesteps(strength: timestepStrength)
         for (step,t) in timeSteps.enumerated() {
 
             // Expand the latents for classifier-free guidance

From 3c30af52eeba84e7df21ca41132aa5ee09726424 Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Tue, 20 Dec 2022 06:31:29 +0900
Subject: [PATCH 08/12] clamp initial timestep

---
 swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift | 2 +-
 swift/StableDiffusion/pipeline/Encoder.swift                  | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
index 350ba3a7..78756831 100644
--- a/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
+++ b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
@@ -22,7 +22,7 @@ public struct AlphasCumprodCalculation {
         strength: Float
     ) {
         let tEnc = Int(strength * Float(steps))
-        let initTimestep = timesteps - timesteps / steps * (steps - tEnc) + 1
+        let initTimestep = min(max(0, timesteps - timesteps / steps * (steps - tEnc) + 1), timesteps - 1)
         self.sqrtAlphasCumprod = alphasCumprod[initTimestep].squareRoot()
         self.sqrtOneMinusAlphasCumprod = (1 - alphasCumprod[initTimestep]).squareRoot()
     }
diff --git a/swift/StableDiffusion/pipeline/Encoder.swift b/swift/StableDiffusion/pipeline/Encoder.swift
index 043865cb..856e89a0 100644
--- a/swift/StableDiffusion/pipeline/Encoder.swift
+++ b/swift/StableDiffusion/pipeline/Encoder.swift
@@ -66,7 +66,6 @@ public struct Encoder {
             else {
                 throw Error.latentOutputNotValid
             }
-            print("output.shape: \(output.shape)")
             return MLShapedArray(output)
         }
         

From d9563fb5867a8ca51535283449365547daab663f Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Wed, 28 Dec 2022 20:23:38 +0100
Subject: [PATCH 09/12] Store timesteps in reverse order for consistency.

---
 .../pipeline/DPMSolverMultistepScheduler.swift        |  7 +------
 swift/StableDiffusion/pipeline/Scheduler.swift        | 11 +++++------
 .../pipeline/StableDiffusionPipeline.swift            |  7 -------
 3 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
index 97c8525f..1c3f2616 100644
--- a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
+++ b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
@@ -178,10 +178,5 @@ public final class DPMSolverMultistepScheduler: Scheduler {
         }
         
         return prevSample
-    }
-    
-    /// This scheduler does not support image2image strength value.
-    public func calculateTimesteps(strength: Float?) -> [Int] {
-        timeSteps
-    }
+    }    
 }
diff --git a/swift/StableDiffusion/pipeline/Scheduler.swift b/swift/StableDiffusion/pipeline/Scheduler.swift
index d70a42df..ee649f8e 100644
--- a/swift/StableDiffusion/pipeline/Scheduler.swift
+++ b/swift/StableDiffusion/pipeline/Scheduler.swift
@@ -92,12 +92,11 @@ public extension Scheduler {
 
 @available(iOS 16.2, macOS 13.1, *)
 public extension Scheduler {
-    
     func calculateTimesteps(strength: Float?) -> [Int] {
-        guard let strength else { return timeSteps.reversed() }
-        let startStep = Int(Float(inferenceStepCount) * strength)
-        let acutalTimesteps = Array(timeSteps[0..<startStep].reversed())
-        return acutalTimesteps
+        guard let strength else { return timeSteps }
+        let startStep = max(inferenceStepCount - Int(Float(inferenceStepCount) * strength), 0)
+        let actualTimesteps = Array(timeSteps[startStep...])
+        return actualTimesteps
     }
 }
 
@@ -175,7 +174,7 @@ public final class PNDMScheduler: Scheduler {
         timeSteps.append(contentsOf: forwardSteps.dropLast(1))
         timeSteps.append(timeSteps.last!)
         timeSteps.append(forwardSteps.last!)
-        // do no revers timeSteps, this is now done in `calculateTimesteps` function
+        timeSteps.reverse()
 
         self.timeSteps = timeSteps
         self.counter = 0
diff --git a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
index a0ba8247..7e54b914 100644
--- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
+++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
@@ -23,7 +23,6 @@ public struct StableDiffusionPipeline: ResourceManaging {
     
     public enum Error: String, Swift.Error {
         case startingImageProvidedWithoutEncoder
-        case schedulerNotSupportedWithImage2Image
     }
 
     /// Model to generate embeddings for tokenized input text
@@ -170,12 +169,6 @@ public struct StableDiffusionPipeline: ResourceManaging {
             guard let encoder else {
                 throw Error.startingImageProvidedWithoutEncoder
             }
-            switch schedulerType {
-            case .pndmScheduler:
-                break
-            case .dpmSolverMultistepScheduler:
-                throw Error.schedulerNotSupportedWithImage2Image
-            }
             
             let noiseTuples = generateImage2ImageLatentSamples(imageCount, stdev: 1, seed: seed)
             latents = try noiseTuples.map({

From f816ed816f7a1ec9bf5f99485e5736ec331b2cdc Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Wed, 28 Dec 2022 20:49:59 +0100
Subject: [PATCH 10/12] Report actual number of steps.

---
 swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
index 7e54b914..68d676df 100644
--- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
+++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
@@ -219,7 +219,7 @@ public struct StableDiffusionPipeline: ResourceManaging {
                 pipeline: self,
                 prompt: prompt,
                 step: step,
-                stepCount: stepCount,
+                stepCount: timeSteps.count,
                 currentLatentSamples: latents,
                 isSafetyEnabled: canSafetyCheck && !disableSafety
             )

From ccd2832af48838cf1a6e21f0e527cdd66b2b1376 Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Thu, 5 Jan 2023 14:12:53 -0800
Subject: [PATCH 11/12] diagonal test with randn

---
 python_coreml_stable_diffusion/torch2coreml.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/python_coreml_stable_diffusion/torch2coreml.py b/python_coreml_stable_diffusion/torch2coreml.py
index 3963f537..e576050c 100644
--- a/python_coreml_stable_diffusion/torch2coreml.py
+++ b/python_coreml_stable_diffusion/torch2coreml.py
@@ -10,6 +10,7 @@
 from copy import deepcopy
 import coremltools as ct
 from diffusers import StableDiffusionPipeline
+from diffusers.models.vae import DiagonalGaussianDistribution
 import gc
 
 import logging
@@ -29,11 +30,22 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+#from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
+#from coremltools.converters.mil.frontend.torch.ops import _get_inputs
+#from coremltools.converters.mil import Builder as mb
+#
+#@register_torch_op
+#def randn(context, node):
+#    inputs = _get_inputs(context, node, expected=5)
+#    shape = inputs[0]
+#
+#    x = mb.random_normal(shape=shape, mean=0., stddev=1.)
+#    context.add(x, node.name)
+
 torch.set_grad_enabled(False)
 
 from types import MethodType
 
-
 def _get_coreml_inputs(sample_inputs, args):
     return [
         ct.TensorType(
@@ -534,6 +546,7 @@ def forward(self, sample, diagonalNoise, noise, sqrtAlphasCumprod, sqrtOneMinusA
             h = self.encoder(sample)
             moments = self.quant_conv(h)
             diagonalNoise = diagonalNoise.to(sample.device)
+#            posterior = DiagonalGaussianDistribution(moments)
             posterior = CoreMLDiagonalGaussianDistribution(moments, diagonalNoise)
             posteriorSample = posterior.sample()
             

From 93158619030226e8d0bb183205504ba11dc2e490 Mon Sep 17 00:00:00 2001
From: Timothy Kautz <tim@timothykautz.com>
Date: Thu, 5 Jan 2023 20:40:29 -0800
Subject: [PATCH 12/12] uint32

---
 swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
index 68d676df..63eb5758 100644
--- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
+++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
@@ -258,7 +258,7 @@ public struct StableDiffusionPipeline: ResourceManaging {
     ///   - diagonalAndLatentNoiseIsSame: Diffusions library does not seem to use the same noise for the `DiagonalGaussianDistribution` operation,
     ///     but I have seen implementations of pipelines where it is the same.
     /// - Returns: An array of tuples of noise values with length of batch size.
-    func generateImage2ImageLatentSamples(_ count: Int, stdev: Float, seed: Int, diagonalAndLatentNoiseIsSame: Bool = false) -> [(diagonal: MLShapedArray<Float32>, latentNoise: MLShapedArray<Float32>)] {
+    func generateImage2ImageLatentSamples(_ count: Int, stdev: Float, seed: UInt32, diagonalAndLatentNoiseIsSame: Bool = false) -> [(diagonal: MLShapedArray<Float32>, latentNoise: MLShapedArray<Float32>)] {
         var sampleShape = unet.latentSampleShape
         sampleShape[0] = 1