diff --git a/python_coreml_stable_diffusion/torch2coreml.py b/python_coreml_stable_diffusion/torch2coreml.py
index 6d6c2fad..e576050c 100644
--- a/python_coreml_stable_diffusion/torch2coreml.py
+++ b/python_coreml_stable_diffusion/torch2coreml.py
@@ -10,6 +10,7 @@
 from copy import deepcopy
 import coremltools as ct
 from diffusers import StableDiffusionPipeline
+from diffusers.models.vae import DiagonalGaussianDistribution
 import gc
 
 import logging
@@ -29,11 +30,22 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+#from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
+#from coremltools.converters.mil.frontend.torch.ops import _get_inputs
+#from coremltools.converters.mil import Builder as mb
+#
+#@register_torch_op
+#def randn(context, node):
+#    inputs = _get_inputs(context, node, expected=5)
+#    shape = inputs[0]
+#
+#    x = mb.random_normal(shape=shape, mean=0., stddev=1.)
+#    context.add(x, node.name)
+
 torch.set_grad_enabled(False)
 
 from types import MethodType
 
-
 def _get_coreml_inputs(sample_inputs, args):
     return [
         ct.TensorType(
@@ -43,6 +55,35 @@ def _get_coreml_inputs(sample_inputs, args):
         )
         for k, v in sample_inputs.items()
     ]
+
+
+# Simpler version of `DiagonalGaussianDistribution` with only needed calculations
+# as implemented in vae.py as part of the AutoencoderKL class
+# This is because coreml tools does not support the `randn` operation, so we pass in a random tensor.
+class CoreMLDiagonalGaussianDistribution(object):
+    """ Diagonal Gaussian posterior whose sampling noise is supplied by the caller
+
+    `parameters` is split in half along dim=1 into the mean and log-variance.
+    `noise` replaces the random draw that `sample()` would otherwise need, which
+    keeps the traced graph free of the unsupported `randn` op.
+    """
+
+    def __init__(self, parameters, noise):
+        self.parameters = parameters
+        self.noise = noise
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        # Bound the log-variance before exponentiating it into a std-dev
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.std = torch.exp(0.5 * self.logvar)
+
+    def sample(self) -> torch.FloatTensor:
+        """ Returns `mean + std * noise`
+        """
+        # make sure sample is on the same device as the parameters and has same dtype
+        noise = self.noise.to(device=self.parameters.device,
+                              dtype=self.parameters.dtype)
+        return self.mean + self.std * noise
+
 
 def compute_psnr(a, b):
     """ Compute Peak-Signal-to-Noise-Ratio across two numpy.ndarray objects
@@ -140,7 +181,7 @@ def _convert_to_coreml(submodule_name, torchscript_module, sample_inputs,
 
 def quantize_weights_to_8bits(args):
     for model_name in [
-            "text_encoder", "vae_decoder", "unet", "unet_chunk1",
+            "text_encoder", "vae_decoder", "vae_encoder", "unet", "unet_chunk1",
             "unet_chunk2", "safety_checker"
     ]:
         out_path = _get_out_path(args, model_name)
@@ -190,6 +231,7 @@ def bundle_resources_for_swift_cli(args):
     # Compile model using coremlcompiler (Significantly reduces the load time for unet)
     for source_name, target_name in [("text_encoder", "TextEncoder"),
                                      ("vae_decoder", "VAEDecoder"),
+                                     ("vae_encoder", "VAEEncoder"),
                                      ("unet", "Unet"),
                                      ("unet_chunk1", "UnetChunk1"),
                                      ("unet_chunk2", "UnetChunk2"),
@@ -453,6 +495,174 @@ def forward(self, z):
     gc.collect()
 
 
+def convert_vae_encoder(pipe, args):
+    """ Converts the VAE Encoder component of Stable Diffusion
+
+    Besides the encoder itself, the exported model samples the posterior,
+    applies the 0.18215 latent scaling and mixes in the scheduler noise, so
+    its output is the noised initial latent used by image2image. Every
+    random tensor is a model input because coremltools does not support
+    the `torch.randn` op.
+    """
+    out_path = _get_out_path(args, "vae_encoder")
+    if os.path.exists(out_path):
+        logger.info(
+            f"`vae_encoder` already exists at {out_path}, skipping conversion."
+        )
+        return
+
+    if not hasattr(pipe, "unet"):
+        raise RuntimeError(
+            "convert_unet() deletes pipe.unet to save RAM. "
+            "Please use convert_vae_encoder() before convert_unet()")
+
+    # `args.latent_h`/`args.latent_w` override the latent resolution; the
+    # input image is 8x larger than the latent along each spatial axis.
+    latent_h = args.latent_h or pipe.unet.config.sample_size
+    latent_w = args.latent_w or pipe.unet.config.sample_size
+
+    sample_shape = (
+        1,  # B
+        3,  # C (RGB range from -1 to 1)
+        latent_h * 8,  # H
+        latent_w * 8,  # W
+    )
+
+    noise_shape = (
+        1,  # B
+        4,  # C
+        latent_h,  # H
+        latent_w,  # W
+    )
+
+    float_value_shape = (
+        1,
+        1,
+    )
+
+    # Fixed scalar stand-ins used for tracing and for the parity check
+    sqrtAlphasCumprodTorchShape = torch.tensor([[0.2,]])
+    sqrtOneMinusAlphasCumprodTorchShape = torch.tensor([[0.8,]])
+
+    sample_vae_encoder_inputs = {
+        "sample": torch.rand(*sample_shape, dtype=torch.float16),
+        "diagonalNoise": torch.rand(*noise_shape, dtype=torch.float16),
+        "noise": torch.rand(*noise_shape, dtype=torch.float16),
+        "sqrtAlphasCumprod": torch.rand(*float_value_shape, dtype=torch.float16),
+        "sqrtOneMinusAlphasCumprod": torch.rand(*float_value_shape, dtype=torch.float16),
+    }
+
+    class VAEEncoder(nn.Module):
+        """ nn.Module wrapper that encodes an image into scaled, noised latents
+        """
+
+        def __init__(self):
+            super().__init__()
+            self.quant_conv = pipe.vae.quant_conv
+            self.encoder = pipe.vae.encoder
+
+        # Because CoreMLTools does not support the torch.randn op, we pass in both
+        # the diagonal Noise for the `DiagonalGaussianDistribution` operation and
+        # the noise tensor combined with precalculated `sqrtAlphasCumprod` and `sqrtOneMinusAlphasCumprod`
+        # for faster computation.
+        def forward(self, sample, diagonalNoise, noise, sqrtAlphasCumprod, sqrtOneMinusAlphasCumprod):
+            h = self.encoder(sample)
+            moments = self.quant_conv(h)
+            diagonalNoise = diagonalNoise.to(sample.device)
+            posterior = CoreMLDiagonalGaussianDistribution(moments, diagonalNoise)
+            posteriorSample = posterior.sample()
+
+            # Add the scaling operation and the latent noise for faster computation
+            init_latents = 0.18215 * posteriorSample
+            result = self.add_noise(init_latents, noise, sqrtAlphasCumprod, sqrtOneMinusAlphasCumprod)
+            return result
+
+        def add_noise(
+            self,
+            original_samples: torch.FloatTensor,
+            noise: torch.FloatTensor,
+            sqrtAlphasCumprod: torch.FloatTensor,
+            sqrtOneMinusAlphasCumprod: torch.FloatTensor
+        ) -> torch.FloatTensor:
+            noise = noise.to(original_samples.device)
+            sqrtAlphasCumprod = sqrtAlphasCumprod.to(original_samples.device)
+            sqrtOneMinusAlphasCumprod = sqrtOneMinusAlphasCumprod.to(original_samples.device)
+            noisy_samples = sqrtAlphasCumprod * original_samples + sqrtOneMinusAlphasCumprod * noise
+            return noisy_samples
+
+
+    baseline_encoder = VAEEncoder().eval()
+
+    # No optimization needed for the VAE Encoder as it is a pure ConvNet
+    traced_vae_encoder = torch.jit.trace(
+        baseline_encoder, (
+            sample_vae_encoder_inputs["sample"].to(torch.float32),
+            sample_vae_encoder_inputs["diagonalNoise"].to(torch.float32),
+            sample_vae_encoder_inputs["noise"].to(torch.float32),
+            sqrtAlphasCumprodTorchShape.to(torch.float32),
+            sqrtOneMinusAlphasCumprodTorchShape.to(torch.float32)
+        ))
+
+    modify_coremltools_torch_frontend_badbmm()
+    coreml_vae_encoder, out_path = _convert_to_coreml(
+        "vae_encoder", traced_vae_encoder, sample_vae_encoder_inputs,
+        ["latent_dist"], args)
+
+    # Set model metadata
+    coreml_vae_encoder.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}"
+    coreml_vae_encoder.license = "OpenRAIL (https://huggingface.co/spaces/CompVis/stable-diffusion-license)"
+    coreml_vae_encoder.version = args.model_version
+    coreml_vae_encoder.short_description = \
+        "Stable Diffusion generates images conditioned on text and/or other images as input through the diffusion process. " \
+        "Please refer to https://arxiv.org/abs/2112.10752 for details."
+
+    # Set the input descriptions
+    coreml_vae_encoder.input_description["sample"] = \
+        "An image of the correct size to create the latent space with, image2image and in-painting."
+    coreml_vae_encoder.input_description["diagonalNoise"] = \
+        "Latent noise for `DiagonalGaussianDistribution` operation."
+    coreml_vae_encoder.input_description["noise"] = \
+        "Latent noise for use with strength parameter of image2image"
+    coreml_vae_encoder.input_description["sqrtAlphasCumprod"] = \
+        "Precalculated `sqrtAlphasCumprod` value based on strength and the current schedular's alphasCumprod values"
+    coreml_vae_encoder.input_description["sqrtOneMinusAlphasCumprod"] = \
+        "Precalculated `sqrtOneMinusAlphasCumprod` value based on strength and the current schedular's alphasCumprod values"
+
+    # Set the output descriptions
+    coreml_vae_encoder.output_description[
+        "latent_dist"] = "The latent embeddings from the unet model from the input image for image2image."
+
+    _save_mlpackage(coreml_vae_encoder, out_path)
+
+    logger.info(f"Saved vae_encoder into {out_path}")
+
+    # Parity check PyTorch vs CoreML
+    if args.check_output_correctness:
+        baseline_out = baseline_encoder(
+            sample=sample_vae_encoder_inputs["sample"].to(torch.float32),
+            diagonalNoise=sample_vae_encoder_inputs["diagonalNoise"].to(torch.float32),
+            noise=sample_vae_encoder_inputs["noise"].to(torch.float32),
+            sqrtAlphasCumprod=sqrtAlphasCumprodTorchShape,
+            sqrtOneMinusAlphasCumprod=sqrtOneMinusAlphasCumprodTorchShape,
+        ).numpy()
+
+        coreml_out = list(
+            coreml_vae_encoder.predict(
+                {
+                    "sample": sample_vae_encoder_inputs["sample"].numpy(),
+                    "diagonalNoise": sample_vae_encoder_inputs["diagonalNoise"].numpy(),
+                    "noise": sample_vae_encoder_inputs["noise"].numpy(),
+                    "sqrtAlphasCumprod": sqrtAlphasCumprodTorchShape.numpy(),
+                    "sqrtOneMinusAlphasCumprod": sqrtOneMinusAlphasCumprodTorchShape.numpy()
+                }).values())
+
+        report_correctness(baseline_out, coreml_out[0],
+                           "vae_encoder baseline PyTorch to baseline CoreML")
+
+    del traced_vae_encoder, pipe.vae.encoder, coreml_vae_encoder
+    gc.collect()
+
+
 def convert_unet(pipe, args):
     """ Converts the UNet component of Stable Diffusion
     """
@@ -801,7 +1011,12 @@ def main(args):
         logger.info("Converting vae_decoder")
         convert_vae_decoder(pipe, args)
         logger.info("Converted vae_decoder")
-
+
+    if args.convert_vae_encoder:
+        logger.info("Converting vae_encoder")
+        convert_vae_encoder(pipe, args)
+        logger.info("Converted vae_encoder")
+
     if args.convert_unet:
         logger.info("Converting unet")
         convert_unet(pipe, args)
@@ -835,6 +1050,7 @@ def parser_spec():
     # Select which models to export (All are needed for text-to-image pipeline to function)
     parser.add_argument("--convert-text-encoder", action="store_true")
     parser.add_argument("--convert-vae-decoder", action="store_true")
+    parser.add_argument("--convert-vae-encoder", action="store_true")
     parser.add_argument("--convert-unet", action="store_true")
parser.add_argument("--convert-safety-checker", action="store_true") parser.add_argument( diff --git a/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift new file mode 100644 index 00000000..78756831 --- /dev/null +++ b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift @@ -0,0 +1,29 @@ +// For licensing see accompanying LICENSE.md file. +// Copyright (C) 2022 Apple Inc. All Rights Reserved. + +import Foundation + +public struct AlphasCumprodCalculation { + public var sqrtAlphasCumprod: Float + public var sqrtOneMinusAlphasCumprod: Float + + public init( + sqrtAlphasCumprod: Float, + sqrtOneMinusAlphasCumprod: Float + ) { + self.sqrtAlphasCumprod = sqrtAlphasCumprod + self.sqrtOneMinusAlphasCumprod = sqrtOneMinusAlphasCumprod + } + + public init( + alphasCumprod: [Float], + timesteps: Int = 1_000, + steps: Int, + strength: Float + ) { + let tEnc = Int(strength * Float(steps)) + let initTimestep = min(max(0, timesteps - timesteps / steps * (steps - tEnc) + 1), timesteps - 1) + self.sqrtAlphasCumprod = alphasCumprod[initTimestep].squareRoot() + self.sqrtOneMinusAlphasCumprod = (1 - alphasCumprod[initTimestep]).squareRoot() + } +} diff --git a/swift/StableDiffusion/pipeline/CGImage+vImage.swift b/swift/StableDiffusion/pipeline/CGImage+vImage.swift new file mode 100644 index 00000000..809836e7 --- /dev/null +++ b/swift/StableDiffusion/pipeline/CGImage+vImage.swift @@ -0,0 +1,120 @@ +// For licensing see accompanying LICENSE.md file. +// Copyright (C) 2022 Apple Inc. All Rights Reserved. 
+ +import Foundation +import Accelerate +import CoreML + +@available(iOS 16.0, macOS 13.0, *) +extension CGImage { + + typealias PixelBufferPFx1 = vImage.PixelBuffer + typealias PixelBufferP8x3 = vImage.PixelBuffer + typealias PixelBufferIFx3 = vImage.PixelBuffer + typealias PixelBufferI8x3 = vImage.PixelBuffer + + public enum ShapedArrayError: String, Swift.Error { + case wrongNumberOfChannels + case incorrectFormatsConvertingToShapedArray + case vImageConverterNotInitialized + } + + public static func fromShapedArray(_ array: MLShapedArray) throws -> CGImage { + + // array is [N,C,H,W], where C==3 + let channelCount = array.shape[1] + guard channelCount == 3 else { + throw ShapedArrayError.wrongNumberOfChannels + } + + let height = array.shape[2] + let width = array.shape[3] + + // Normalize each channel into a float between 0 and 1.0 + let floatChannels = (0.. [0.0 1.0] + cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut) + } + return cOut + } + + // Convert to interleaved and then to UInt8 + let floatImage = PixelBufferIFx3(planarBuffers: floatChannels) + let uint8Image = PixelBufferI8x3(width: width, height: height) + floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips + + // Convert to uint8x3 to RGB CGImage (no alpha) + let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue) + let cgImage = uint8Image.makeCGImage(cgImageFormat: + .init(bitsPerComponent: 8, + bitsPerPixel: 3*8, + colorSpace: CGColorSpaceCreateDeviceRGB(), + bitmapInfo: bitmapInfo)!)! 
+ + return cgImage + } + + public var plannerRGBShapedArray: MLShapedArray { + get throws { + guard + var sourceFormat = vImage_CGImageFormat(cgImage: self), + var mediumFormat = vImage_CGImageFormat( + bitsPerComponent: 8 * MemoryLayout.size, + bitsPerPixel: 8 * MemoryLayout.size * 4, + colorSpace: CGColorSpaceCreateDeviceRGB(), + bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.first.rawValue)), + let width = vImagePixelCount(exactly: self.width), + let height = vImagePixelCount(exactly: self.height) + else { + throw ShapedArrayError.incorrectFormatsConvertingToShapedArray + } + + var sourceImageBuffer = try vImage_Buffer(cgImage: self) + + var mediumDesination = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: mediumFormat.bitsPerPixel) + + let converter = vImageConverter_CreateWithCGImageFormat( + &sourceFormat, + &mediumFormat, + nil, + vImage_Flags(kvImagePrintDiagnosticsToConsole), + nil) + + guard let converter = converter?.takeRetainedValue() else { + throw ShapedArrayError.vImageConverterNotInitialized + } + + vImageConvert_AnyToAny(converter, &sourceImageBuffer, &mediumDesination, nil, vImage_Flags(kvImagePrintDiagnosticsToConsole)) + + var destinationA = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) + var destinationR = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) + var destinationG = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) + var destinationB = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) + + var minFloat: [Float] = [-1.0, -1.0, -1.0, -1.0] + var maxFloat: [Float] = [1.0, 1.0, 1.0, 1.0] + + vImageConvert_ARGB8888toPlanarF(&mediumDesination, &destinationA, &destinationR, &destinationG, &destinationB, &maxFloat, &minFloat, .zero) + + let redData = Data(bytes: destinationR.data, count: Int(width) * 
Int(height) * MemoryLayout.size) + let greenData = Data(bytes: destinationG.data, count: Int(width) * Int(height) * MemoryLayout.size) + let blueData = Data(bytes: destinationB.data, count: Int(width) * Int(height) * MemoryLayout.size) + + let imageData = redData + greenData + blueData + + let shapedArray = MLShapedArray(data: imageData, shape: [1, 3, 512, 512]) + + return shapedArray + } + } +} + diff --git a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift index 1fbfff1c..1c3f2616 100644 --- a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift +++ b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift @@ -178,5 +178,5 @@ public final class DPMSolverMultistepScheduler: Scheduler { } return prevSample - } + } } diff --git a/swift/StableDiffusion/pipeline/Decoder.swift b/swift/StableDiffusion/pipeline/Decoder.swift index 04f04ba6..e9b2c704 100644 --- a/swift/StableDiffusion/pipeline/Decoder.swift +++ b/swift/StableDiffusion/pipeline/Decoder.swift @@ -3,7 +3,6 @@ import Foundation import CoreML -import Accelerate /// A decoder model which produces RGB images from latent samples @available(iOS 16.2, macOS 13.1, *) @@ -57,12 +56,11 @@ public struct Decoder: ResourceManaging { } // Transform the outputs to CGImages - let images: [CGImage] = (0..(output)) + return try CGImage.fromShapedArray(MLShapedArray(output)) } return images @@ -74,50 +72,4 @@ public struct Decoder: ResourceManaging { } } - typealias PixelBufferPFx1 = vImage.PixelBuffer - typealias PixelBufferP8x3 = vImage.PixelBuffer - typealias PixelBufferIFx3 = vImage.PixelBuffer - typealias PixelBufferI8x3 = vImage.PixelBuffer - - func toRGBCGImage(_ array: MLShapedArray) -> CGImage { - - // array is [N,C,H,W], where C==3 - let channelCount = array.shape[1] - assert(channelCount == 3, - "Decoding model output has \(channelCount) channels, expected 3") - let height = array.shape[2] - let width = 
array.shape[3] - - // Normalize each channel into a float between 0 and 1.0 - let floatChannels = (0.. [0.0 1.0] - cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut) - } - return cOut - } - - // Convert to interleaved and then to UInt8 - let floatImage = PixelBufferIFx3(planarBuffers: floatChannels) - let uint8Image = PixelBufferI8x3(width: width, height: height) - floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips - - // Convert to uint8x3 to RGB CGImage (no alpha) - let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue) - let cgImage = uint8Image.makeCGImage(cgImageFormat: - .init(bitsPerComponent: 8, - bitsPerPixel: 3*8, - colorSpace: CGColorSpaceCreateDeviceRGB(), - bitmapInfo: bitmapInfo)!)! - - return cgImage - } } diff --git a/swift/StableDiffusion/pipeline/Encoder.swift b/swift/StableDiffusion/pipeline/Encoder.swift new file mode 100644 index 00000000..856e89a0 --- /dev/null +++ b/swift/StableDiffusion/pipeline/Encoder.swift @@ -0,0 +1,79 @@ +// For licensing see accompanying LICENSE.md file. +// Copyright (C) 2022 Apple Inc. All Rights Reserved. 
+ +import Foundation +import CoreML + +@available(iOS 16.0, macOS 13.0, *) +/// Encoder, currently supports image2image +public struct Encoder { + + public enum Error: String, Swift.Error { + case latentOutputNotValid + case batchLatentOutputEmpty + } + + /// VAE encoder model + post math and adding noise from schedular + var model: MLModel + + /// Create decoder from Core ML model + /// + /// - Parameters + /// - model: Core ML model for VAE decoder + public init(model: MLModel) { + self.model = model + } + + /// Prediction queue + let queue = DispatchQueue(label: "encoder.predict") + + /// Batch encode latent samples into images + /// - Parameters: + /// - image: image used for image2image + /// - diagonalNoise: random noise for `DiagonalGaussianDistribution` operation + /// - noise: random noise for initial latent space based on strength argument + /// - alphasCumprodStep: calculations using the scheduler traditionally calculated in the pipeline in pyTorch Diffusers library. + /// - Returns: The encoded latent space as MLShapedArray + public func encode( + image: CGImage, + diagonalNoise: MLShapedArray, + noise: MLShapedArray, + alphasCumprodStep: AlphasCumprodCalculation + ) throws -> MLShapedArray { + let sample = try image.plannerRGBShapedArray + let sqrtAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtAlphasCumprod], shape: [1, 1]) + let sqrtOneMinusAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtOneMinusAlphasCumprod], shape: [1, 1]) + + let dict: [String: Any] = [ + "sample": MLMultiArray(sample), + "diagonalNoise": MLMultiArray(diagonalNoise), + "noise": MLMultiArray(noise), + "sqrtAlphasCumprod": MLMultiArray(sqrtAlphasCumprod), + "sqrtOneMinusAlphasCumprod": MLMultiArray(sqrtOneMinusAlphasCumprod), + ] + let featureProvider = try MLDictionaryFeatureProvider(dictionary: dict) + + let batch = MLArrayBatchProvider(array: [featureProvider]) + + // Batch predict with model + let results = try queue.sync { try 
model.predictions(fromBatch: batch) } + + let batchLatents: [MLShapedArray] = try (0.. [Int] + /// Schedule of betas which controls the amount of noise added at each timestep var betas: [Float] { get } @@ -71,6 +74,34 @@ public extension Scheduler { } } +// MARK: - Image2Image + +@available(iOS 16.2, macOS 13.1, *) +public extension Scheduler { + + func calculateAlphasCumprod(strength: Float) -> AlphasCumprodCalculation { + AlphasCumprodCalculation( + alphasCumprod: alphasCumProd, + timesteps: trainStepCount, + steps: inferenceStepCount, + strength: strength) + } +} + +// MARK: - Timesteps + +@available(iOS 16.2, macOS 13.1, *) +public extension Scheduler { + func calculateTimesteps(strength: Float?) -> [Int] { + guard let strength else { return timeSteps } + let startStep = max(inferenceStepCount - Int(Float(inferenceStepCount) * strength), 0) + let actualTimesteps = Array(timeSteps[startStep...]) + return actualTimesteps + } +} + +// MARK: - BetaSchedule + /// How to map a beta range to a sequence of betas to step over @available(iOS 16.2, macOS 13.1, *) public enum BetaSchedule { @@ -80,6 +111,7 @@ public enum BetaSchedule { case scaledLinear } +// MARK: - PNDMScheduler /// A scheduler used to compute a de-noised image /// diff --git a/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift b/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift index 65c6e03c..d4d703a7 100644 --- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift +++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift @@ -14,6 +14,7 @@ public extension StableDiffusionPipeline { public let unetChunk1URL: URL public let unetChunk2URL: URL public let decoderURL: URL + public let encoderURL: URL public let safetyCheckerURL: URL public let vocabURL: URL public let mergesURL: URL @@ -24,6 +25,7 @@ public extension StableDiffusionPipeline { unetChunk1URL = baseURL.appending(path: "UnetChunk1.mlmodelc") unetChunk2URL = 
baseURL.appending(path: "UnetChunk2.mlmodelc") decoderURL = baseURL.appending(path: "VAEDecoder.mlmodelc") + encoderURL = baseURL.appending(path: "VAEEncoder.mlmodelc") safetyCheckerURL = baseURL.appending(path: "SafetyChecker.mlmodelc") vocabURL = baseURL.appending(path: "vocab.json") mergesURL = baseURL.appending(path: "merges.txt") @@ -74,11 +76,22 @@ public extension StableDiffusionPipeline { FileManager.default.fileExists(atPath: urls.safetyCheckerURL.path) { safetyChecker = SafetyChecker(modelAt: urls.safetyCheckerURL, configuration: config) } + + // Optional Image Encoder + let encoder: Encoder? + if + let encoderModel = try? MLModel(contentsOf: urls.encoderURL, configuration: config) + { + encoder = Encoder(model: encoderModel) + } else { + encoder = nil + } // Construct pipeline self.init(textEncoder: textEncoder, unet: unet, decoder: decoder, + encoder: encoder, safetyChecker: safetyChecker, reduceMemory: reduceMemory) } diff --git a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift index ea654723..63eb5758 100644 --- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift +++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift @@ -20,6 +20,10 @@ public enum StableDiffusionScheduler { /// [Hugging Face Diffusers Pipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) @available(iOS 16.2, macOS 13.1, *) public struct StableDiffusionPipeline: ResourceManaging { + + public enum Error: String, Swift.Error { + case startingImageProvidedWithoutEncoder + } /// Model to generate embeddings for tokenized input text var textEncoder: TextEncoder @@ -29,6 +33,9 @@ public struct StableDiffusionPipeline: ResourceManaging { /// Model used to generate final image from latent diffusion process var decoder: Decoder + + /// Model used to latent space for image2image, and soon, in-painting + var encoder: 
Encoder? /// Optional model for checking safety of generated image var safetyChecker: SafetyChecker? = nil @@ -58,11 +65,13 @@ public struct StableDiffusionPipeline: ResourceManaging { public init(textEncoder: TextEncoder, unet: Unet, decoder: Decoder, + encoder: Encoder?, safetyChecker: SafetyChecker? = nil, reduceMemory: Bool = false) { self.textEncoder = textEncoder self.unet = unet self.decoder = decoder + self.encoder = encoder self.safetyChecker = safetyChecker self.reduceMemory = reduceMemory } @@ -114,12 +123,14 @@ public struct StableDiffusionPipeline: ResourceManaging { public func generateImages( prompt: String, negativePrompt: String = "", + startingImage: CGImage? = nil, + strength: Float = 1.0, imageCount: Int = 1, stepCount: Int = 50, seed: UInt32 = 0, guidanceScale: Float = 7.5, disableSafety: Bool = false, - scheduler: StableDiffusionScheduler = .pndmScheduler, + scheduler schedulerType: StableDiffusionScheduler = .pndmScheduler, progressHandler: (Progress) -> Bool = { _ in true } ) throws -> [CGImage?] { @@ -142,7 +153,7 @@ public struct StableDiffusionPipeline: ResourceManaging { /// Setup schedulers let scheduler: [Scheduler] = (0..] + let timestepStrength: Float? 
+ + if let startingImage { + timestepStrength = strength + guard let encoder else { + throw Error.startingImageProvidedWithoutEncoder + } + + let noiseTuples = generateImage2ImageLatentSamples(imageCount, stdev: 1, seed: seed) + latents = try noiseTuples.map({ + try encoder.encode( + image: startingImage, + diagonalNoise: $0.diagonal, + noise: $0.latentNoise, + alphasCumprodStep: scheduler[0].calculateAlphasCumprod(strength: strength)) + }) + } else { + timestepStrength = nil + // Generate random latent samples from specified seed + latents = generateLatentSamples(imageCount, stdev: stdev, seed: seed) + } // De-noising loop - for (step,t) in scheduler[0].timeSteps.enumerated() { + let timeSteps: [Int] = scheduler[0].calculateTimesteps(strength: timestepStrength) + for (step,t) in timeSteps.enumerated() { // Expand the latents for classifier-free guidance // and input to the Unet noise prediction model @@ -186,7 +219,7 @@ public struct StableDiffusionPipeline: ResourceManaging { pipeline: self, prompt: prompt, step: step, - stepCount: stepCount, + stepCount: timeSteps.count, currentLatentSamples: latents, isSafetyEnabled: canSafetyCheck && !disableSafety ) @@ -215,6 +248,35 @@ public struct StableDiffusionPipeline: ResourceManaging { } return samples } + + + /// For image2image - + /// - Parameters: + /// - count: batch size + /// - stdev: 1 + /// - seed: seed provided + /// - diagonalAndLatentNoiseIsSame: Diffusions library does not seem to use the same noise for the `DiagonalGaussianDistribution` operation, + /// but I have seen implementations of pipelines where it is the same. + /// - Returns: An array of tuples of noise values with length of batch size. 
+ func generateImage2ImageLatentSamples(_ count: Int, stdev: Float, seed: UInt32, diagonalAndLatentNoiseIsSame: Bool = false) -> [(diagonal: MLShapedArray, latentNoise: MLShapedArray)] { + var sampleShape = unet.latentSampleShape + sampleShape[0] = 1 + + var random = NumPyRandomSource(seed: UInt32(truncatingIfNeeded: seed)) + let samples = (0..( + converting: random.normalShapedArray(sampleShape, mean: 0.0, stdev: Double(stdev))) + return (noise, noise) + } else { + return (MLShapedArray( + converting: random.normalShapedArray(sampleShape, mean: 0.0, stdev: Double(stdev))), + MLShapedArray( + converting: random.normalShapedArray(sampleShape, mean: 0.0, stdev: Double(stdev)))) + } + } + return samples + } func toHiddenStates(_ embedding: MLShapedArray) -> MLShapedArray { // Unoptimized manual transpose [0, 2, None, 1] diff --git a/swift/StableDiffusionCLI/main.swift b/swift/StableDiffusionCLI/main.swift index 7343cf2b..aaf44295 100644 --- a/swift/StableDiffusionCLI/main.swift +++ b/swift/StableDiffusionCLI/main.swift @@ -32,6 +32,12 @@ struct StableDiffusionSample: ParsableCommand { ) ) var resourcePath: String = "./" + + @Option(help: "Path to starting image.") + var image: String = "none" + + @Option(help: "Strength for image2image.") + var strength: Float = 0.5 @Option(help: "Number of images to sample / generate") var imageCount: Int = 1 @@ -51,7 +57,7 @@ struct StableDiffusionSample: ParsableCommand { var outputPath: String = "./" @Option(help: "Random seed") - var seed: UInt32 = 93 + var seed: UInt32 = UInt32.random(in: 0...UInt32.max) @Option(help: "Controls the influence of the text prompt on sampling process (0=random images)") var guidanceScale: Float = 7.5 @@ -84,6 +90,29 @@ struct StableDiffusionSample: ParsableCommand { disableSafety: disableSafety, reduceMemory: reduceMemory) try pipeline.loadResources() + + let startingImage: CGImage? 
+ if image != "none" { + let imageURL = URL(filePath: image) + do { + let imageData = try Data(contentsOf: imageURL) + guard + let imgDataProvider = CGDataProvider(data: imageData as CFData), + let loadedImage = CGImage( + pngDataProviderSource: imgDataProvider, + decode: nil, shouldInterpolate: false, + intent: CGColorRenderingIntent.defaultIntent) + else { + throw RunError.resources("Starting Image not available \(resourcePath)") + } + startingImage = loadedImage + } catch let error { + throw RunError.resources("Starting image not found \(imageURL), error: \(error)") + } + + } else { + startingImage = nil + } log("Sampling ...\n") let sampleTimer = SampleTimer() @@ -92,6 +121,8 @@ struct StableDiffusionSample: ParsableCommand { let images = try pipeline.generateImages( prompt: prompt, negativePrompt: negativePrompt, + startingImage: startingImage, + strength: strength, imageCount: imageCount, stepCount: stepCount, seed: seed, @@ -168,6 +199,10 @@ struct StableDiffusionSample: ParsableCommand { if imageCount != 1 { name += ".\(sample)" } + + if image != "none" { + name += ".str\(Int(strength * 100))" + } name += ".\(seed)"