leejet · leejet · Feb 25, 2024 · Feb 25, 2024 · Feb 25, 2024
diff --git a/clip.hpp b/clip.hpp
@@ -956,64 +956,32 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
         return hidden_states;
     }
 
-    struct ggml_cgraph* build_graph(struct ggml_allocr* allocr, std::vector<int> tokens, bool return_pooled = false) {
+    struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
+                                    struct ggml_tensor* input_ids2 = NULL,
+                                    size_t max_token_idx           = 0,
+                                    bool return_pooled             = false) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
-        struct ggml_tensor* input_ids = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, tokens.size());
-        ggml_allocr_alloc(allocr, input_ids);
-
-        if (!ggml_allocr_is_measure(allocr)) {
-            ggml_backend_tensor_set(input_ids, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids));
-        }
-
-        struct ggml_tensor* input_ids2 = NULL;
-        size_t max_token_idx           = 0;
-        if (version == VERSION_XL) {
-            input_ids2 = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, tokens.size());
-            ggml_allocr_alloc(allocr, input_ids2);
-
-            auto it = std::find(tokens.begin(), tokens.end(), EOS_TOKEN_ID);
-            if (it != tokens.end()) {
-                std::fill(std::next(it), tokens.end(), 0);
-            }
-
-            max_token_idx = std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);
-
-            // for (int i = 0; i < tokens.size(); i++) {
-            //     printf("%d ", tokens[i]);
-            // }
-            // printf("\n");
-
-            if (!ggml_allocr_is_measure(allocr)) {
-                ggml_backend_tensor_set(input_ids2, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids2));
-            }
+        input_ids2 = to_backend(input_ids2);
+        if (!return_pooled) {
+            input_ids  = to_backend(input_ids);
         }
 
         struct ggml_tensor* embeddings = NULL;
 
         if (num_custom_embeddings > 0 && version != VERSION_XL) {
-            embeddings = ggml_new_tensor_2d(compute_ctx,
-                                            wtype,
-                                            text_model.hidden_size,
-                                            text_model.vocab_size + num_custom_embeddings /* custom placeholder */);
-            ggml_allocr_alloc(allocr, embeddings);
-            if (!ggml_allocr_is_measure(allocr)) {
-                // really bad, there is memory inflexibility (this is for host<->device memory conflicts)
-                auto token_embed_weight = text_model.get_token_embed_weight();
-                void* freeze_data       = malloc(ggml_nbytes(token_embed_weight));
-                ggml_backend_tensor_get_and_sync(backend,
-                                                 token_embed_weight,
-                                                 freeze_data,
-                                                 0,
-                                                 ggml_nbytes(token_embed_weight));
-                ggml_backend_tensor_set(embeddings, freeze_data, 0, ggml_nbytes(token_embed_weight));
-                free(freeze_data);
-                // concatenate custom embeddings
-                ggml_backend_tensor_set(embeddings,
-                                        (const void*)token_embed_custom.data(),
-                                        ggml_nbytes(token_embed_weight),
-                                        num_custom_embeddings * text_model.hidden_size * ggml_type_size(wtype));
-            }
+            auto custom_embeddings = ggml_new_tensor_3d(compute_ctx,
+                                                        wtype,
+                                                        text_model.hidden_size,
+                                                        1,
+                                                        num_custom_embeddings);
+            set_backend_tensor_data(custom_embeddings, token_embed_custom.data());
+
+            auto token_embed_weight = text_model.get_token_embed_weight();
+            token_embed_weight      = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
+            // concatenate custom embeddings
+            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
+            embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
         }
 
         struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, input_ids2, embeddings, max_token_idx, return_pooled);
@@ -1024,12 +992,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
     }
 
     void compute(const int n_threads,
-                 std::vector<int> tokens,
+                 struct ggml_tensor* input_ids,   // caller-supplied tensor, passed to build_graph as input_ids
+                 struct ggml_tensor* input_ids2,  // second id tensor, passed to build_graph (which defaults it to NULL)
+                 size_t max_token_idx,            // forwarded unchanged to build_graph
                  bool return_pooled,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(compute_allocr, tokens, return_pooled);
+            return build_graph(input_ids, input_ids2, max_token_idx, return_pooled);  // graph is built from the caller's tensors
         };
         GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
     }
@@ -1143,8 +1113,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLModule {
         vision_model.get_param_tensors(tensors, prefix + "transformer.visual_model");
     }
 
-    struct ggml_cgraph* build_graph(struct ggml_allocr* allocr,
-                                    struct ggml_tensor* pixel_values) {
+    struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         pixel_values = to_backend(pixel_values);
@@ -1156,19 +1125,12 @@ struct FrozenCLIPVisionEmbedder : public GGMLModule {
         return gf;
     }
 
-    void alloc_compute_buffer(ggml_context* work_ctx, ggml_tensor* pixel_values) {
-        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(compute_allocr, pixel_values);
-        };
-        GGMLModule::alloc_compute_buffer(get_graph);
-    }
-
     void compute(const int n_threads,
                  ggml_tensor* pixel_values,
                  ggml_tensor** output,
                  ggml_context* output_ctx) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(compute_allocr, pixel_values);
+            return build_graph(pixel_values);  // build_graph takes only the input tensor
         };
         GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
     }

diff --git a/control.hpp b/control.hpp
@@ -166,7 +166,6 @@ class ControlNetBlock : public GGMLBlock {
 
     struct ggml_tensor* resblock_forward(std::string name,
                                          struct ggml_context* ctx,
-                                         struct ggml_allocr* allocr,
                                          struct ggml_tensor* x,
                                          struct ggml_tensor* emb) {
         auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
@@ -175,7 +174,6 @@ class ControlNetBlock : public GGMLBlock {
 
     struct ggml_tensor* attention_layer_forward(std::string name,
                                                 struct ggml_context* ctx,
-                                                struct ggml_allocr* allocr,
                                                 struct ggml_tensor* x,
                                                 struct ggml_tensor* context) {
         auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
@@ -201,11 +199,10 @@ class ControlNetBlock : public GGMLBlock {
     }
 
     std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
-                                             struct ggml_allocr* allocr,
                                              struct ggml_tensor* x,
                                              struct ggml_tensor* hint,
                                              struct ggml_tensor* guided_hint,
-                                             std::vector<float> timesteps,
+                                             struct ggml_tensor* timesteps,
                                              struct ggml_tensor* context,
                                              struct ggml_tensor* y = NULL) {
         // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
@@ -231,7 +228,7 @@ class ControlNetBlock : public GGMLBlock {
 
         auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
 
-        auto t_emb = new_timestep_embedding(ctx, allocr, timesteps, model_channels);  // [N, model_channels]
+        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]
 
         auto emb = time_embed_0->forward(ctx, t_emb);
         emb      = ggml_silu_inplace(ctx, emb);
@@ -272,10 +269,10 @@ class ControlNetBlock : public GGMLBlock {
             for (int j = 0; j < num_res_blocks; j++) {
                 input_block_idx += 1;
                 std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                h                = resblock_forward(name, ctx, allocr, h, emb);  // [N, mult*model_channels, h, w]
+                h                = resblock_forward(name, ctx, h, emb);  // [N, mult*model_channels, h, w]
                 if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                     std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                    h                = attention_layer_forward(name, ctx, allocr, h, context);  // [N, mult*model_channels, h, w]
+                    h                = attention_layer_forward(name, ctx, h, context);  // [N, mult*model_channels, h, w]
                 }
 
                 auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@@ -299,9 +296,9 @@ class ControlNetBlock : public GGMLBlock {
         // [N, 4*model_channels, h/8, w/8]
 
         // middle_block
-        h = resblock_forward("middle_block.0", ctx, allocr, h, emb);             // [N, 4*model_channels, h/8, w/8]
-        h = attention_layer_forward("middle_block.1", ctx, allocr, h, context);  // [N, 4*model_channels, h/8, w/8]
-        h = resblock_forward("middle_block.2", ctx, allocr, h, emb);             // [N, 4*model_channels, h/8, w/8]
+        h = resblock_forward("middle_block.0", ctx, h, emb);             // [N, 4*model_channels, h/8, w/8]
+        h = attention_layer_forward("middle_block.1", ctx, h, context);  // [N, 4*model_channels, h/8, w/8]
+        h = resblock_forward("middle_block.2", ctx, h, emb);             // [N, 4*model_channels, h/8, w/8]
 
         // out
         outs.push_back(middle_block_out->forward(ctx, h));
@@ -386,18 +383,22 @@ struct ControlNet : public GGMLModule {
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                     struct ggml_tensor* hint,
-                                    std::vector<float> timesteps,
+                                    struct ggml_tensor* timesteps,
                                     struct ggml_tensor* context,
                                     struct ggml_tensor* y = NULL) {
         struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
 
-        x       = to_backend(x);
-        hint    = to_backend(hint);
-        context = to_backend(context);
-        y       = to_backend(y);
+        x         = to_backend(x);
+        if (guided_hint_cached) {
+            hint = NULL;
+        } else {
+            hint      = to_backend(hint);
+        }
+        context   = to_backend(context);
+        y         = to_backend(y);
+        timesteps = to_backend(timesteps);
 
         auto outs = control_net.forward(compute_ctx,
-                                        compute_allocr,
                                         x,
                                         hint,
                                         guided_hint_cached ? guided_hint : NULL,
@@ -420,7 +421,7 @@ struct ControlNet : public GGMLModule {
     void compute(int n_threads,
                  struct ggml_tensor* x,
                  struct ggml_tensor* hint,
-                 std::vector<float> timesteps,
+                 struct ggml_tensor* timesteps,
                  struct ggml_tensor* context,
                  struct ggml_tensor* y,
                  struct ggml_tensor** output     = NULL,
@@ -434,7 +435,6 @@ struct ControlNet : public GGMLModule {
         };
 
         GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
-
         guided_hint_cached = true;
     }
 

diff --git a/ggml b/ggml